The Battle for Wesnoth  1.15.0-dev
tokenizer.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 - 2018 by David White <dave@whitevine.net>
3  Part of the Battle for Wesnoth Project http://www.wesnoth.org/
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 2 of the License, or
8  (at your option) any later version.
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY.
11 
12  See the COPYING file for more details.
13 */
14 
15 #include "formula/tokenizer.hpp"
16 
17 #include <locale>
18 #include <sstream>
19 
20 namespace wfl
21 {
22 namespace tokenizer
23 {
24 
25 namespace {
26 
27 [[noreturn]] void raise_exception(iterator& i1, iterator i2, std::string str) {
28  std::ostringstream expr;
29  while( (i1 != i2) && (*i1 != '\n') ) {
30  if( (*i1 != '\t') )
31  expr << *i1;
32  ++i1;
33  }
34 
35  if( str.empty() )
36  throw token_error("Unrecognized token", expr.str() );
37  else
38  throw token_error(str, expr.str() );
39 }
40 
41 }
42 
43 token get_token(iterator& i1, const iterator i2) {
44 
45  iterator it = i1;
46  if( *i1 >= 'A' ) {
47  //current character is >= 'A', limit search to the upper-half of the ASCII table
48 
49  // check if we parse now TOKEN_IDENTIFIER or TOKEN_OPERATOR/KEYWORD based on string
50  if( *i1 <= 'Z' || ( *i1 >= 'a' && *it <= 'z' ) || *i1 == '_' ) {
51 
52  while(i1 != i2 && (std::isalpha(*i1, std::locale::classic()) || *i1 == '_'))
53  ++i1;
54 
55  int diff = i1 - it;
57 
58  //check if this string matches any keyword or an operator
59  //possible operators and keywords:
60  // d, or, in, def, and, not, wfl, where, wflend, functions
61  if( diff == 1 ) {
62  if( *it == 'd' )
63  t = TOKEN_OPERATOR;
64  } else if( diff == 2 ) {
65  if( *it == 'o' && *(it+1) == 'r' )
66  t = TOKEN_OPERATOR;
67  else if( *it == 'i' && *(it+1) == 'n' )
68  t = TOKEN_OPERATOR;
69  } else if( diff == 3 ) {
70  if( *it == 'd' ) { //def
71  if( *(it+1) == 'e' && *(it+2) == 'f' )
72  t = TOKEN_KEYWORD;
73  } else if( *it == 'a' ) { //and
74  if( *(it+1) == 'n' && *(it+2) == 'd' )
75  t = TOKEN_OPERATOR;
76  } else if( *it == 'n' ) { //not
77  if( *(it+1) == 'o' && *(it+2) == 't' )
78  t = TOKEN_OPERATOR;
79  } else if( *it == 'f' ) { //fai
80  if( *(it+1) == 'a' && *(it+2) == 'i' )
81  t = TOKEN_KEYWORD;
82  } else if( *it == 'w' ) { //wfl
83  if( *(it+1) == 'f' && *(it+2) == 'l' )
84  t = TOKEN_KEYWORD;
85  }
86  } else if( diff == 5 ) {
87  std::string s(it, i1);
88  if( s == "where" )
89  t = TOKEN_OPERATOR;
90  } else if( diff == 6 ) {
91  std::string s(it, i1);
92  if( s == "faiend" )
93  t = TOKEN_KEYWORD;
94  else if( s == "wflend" )
95  t = TOKEN_KEYWORD;
96  } else if( diff == 9 ) {
97  std::string s(it, i1);
98  if( s == "functions" )
99  t = TOKEN_KEYWORD;
100  }
101 
102  return token( it, i1, t);
103  } else {
104  //at this point only 3 chars left to check:
105  if( *i1 == '[' )
106  return token( it, ++i1, TOKEN_LSQUARE );
107 
108  if( *i1 == ']' )
109  return token( it, ++i1, TOKEN_RSQUARE );
110 
111  if( *i1 == '^' )
112  return token( it, ++i1, TOKEN_OPERATOR );
113 
114  if( *i1 == '~' )
115  return token( it, ++i1, TOKEN_OPERATOR );
116 
117  //unused characters in this range:
118  // \ ` { | }
119  // Note: {} should never be used since they play poorly with WML preprocessor
120  }
121  } else {
122  //limit search to the lower-half of the ASCII table
123  //start by checking for whitespaces/end of line char
124  if( *i1 <= ' ' ) {
125  if( *i1 == '\n' ) {
126  return token( it, ++i1, TOKEN_EOL);
127  } else {
128 
129  while( i1 != i2 && *i1 <= ' ' && *i1 != '\n' )
130  ++i1;
131 
132  return token( it, i1, TOKEN_WHITESPACE );
133  }
134  //try to further limit number of characters that we need to check:
135  } else if ( *i1 >= '0' ){
136  //current character is between '0' and '@'
137  if( *i1 <= '9' ) {
138  //we parse integer or decimal number
139  ++i1;
140  bool dot = false;
141 
142  while( i1 != i2 ) {
143  if( *i1 >= '0' && *i1 <= '9' ) {
144  //do nothing
145  } else {
146  //look for '.' in case of decimal number
147  if( *i1 == '.' ) {
148  //allow only one dot in such expression
149  if( !dot )
150  dot = true;
151  else
152  raise_exception(it, i2, "Multiple dots near decimal expression");
153  } else
154  break;
155  }
156  ++i1;
157  }
158 
159  if( dot )
160  return token( it, i1, TOKEN_DECIMAL );
161  else
162  return token( it, i1, TOKEN_INTEGER );
163 
164  } else {
165  //current character is between ':' and '@'
166  //possible tokens at this point that we are interested in:
167  // ; < = > <= >=
168  //unused characters in this range:
169  // : ? @
170 
171  if( *i1 == ';' ) {
172  return token( it, ++i1, TOKEN_SEMICOLON);
173  } else if( *i1 == '=' ) {
174  return token( it, ++i1, TOKEN_OPERATOR);
175  } else if( *i1 == '<' ) {
176  ++i1;
177  if( i1 != i2 ) {
178  if( *i1 == '=' )
179  return token( it, ++i1, TOKEN_OPERATOR);
180  else
181  return token( it, i1, TOKEN_OPERATOR);
182  } else
183  return token( it, i1, TOKEN_OPERATOR);
184  } else if( *i1 == '>' ) {
185  ++i1;
186  if( i1 != i2 ) {
187  if( *i1 == '=' )
188  return token( it, ++i1, TOKEN_OPERATOR);
189  else
190  return token( it, i1, TOKEN_OPERATOR);
191  } else
192  return token( it, i1, TOKEN_OPERATOR);
193  }
194  }
195  //current character is between '!' and '/'
196  //possible tokens:
197  // , . .+ .- .* ./ .. ( ) ' # + - -> * / % !=
198  //unused characters:
199  // ! " $ &
200  // ! is used only as part of !=
201  // Note: " should never be used since it plays poorly with WML
202  } else if ( *i1 == ',' ) {
203  return token( it, ++i1, TOKEN_COMMA);
204 
205  } else if ( *i1 == '.' ) {
206  ++i1;
207 
208  if( i1 != i2 ) {
209  if( *i1 == '+' || *i1 == '-' || *i1 == '*' || *i1 == '/' || *i1 == '.')
210  return token( it, ++i1, TOKEN_OPERATOR );
211  else
212  return token( it, i1, TOKEN_OPERATOR );
213  } else {
214  return token( it, i1, TOKEN_OPERATOR);
215  }
216 
217  } else if ( *i1 == '(' ) {
218  return token( it, ++i1, TOKEN_LPARENS);
219 
220  } else if ( *i1 == ')' ) {
221  return token( it, ++i1, TOKEN_RPARENS);
222 
223  } else if ( *i1 == '\'' ) {
224  int bracket_depth = 0;
225  ++i1;
226  while (i1 != i2) {
227  if (*i1 == '[') {
228  bracket_depth++;
229  } else if(bracket_depth > 0 && *i1 == ']') {
230  bracket_depth--;
231  } else if(bracket_depth == 0 && *i1 == '\'') {
232  break;
233  }
234  ++i1;
235  }
236 
237  if( i1 != i2 ) {
238  return token( it, ++i1, TOKEN_STRING_LITERAL );
239  } else {
240  raise_exception(it, i2, "Missing closing ' for formula string");
241  }
242 
243  } else if ( *i1 == '#' ) {
244  ++i1;
245  while( i1 != i2 && *i1 != '#' )
246  ++i1;
247 
248  if( i1 != i2 ) {
249  return token( it, ++i1, TOKEN_COMMENT );
250  } else {
251  raise_exception(it, i2, "Missing closing # for formula comment");
252  }
253 
254  } else if ( *i1 == '+' ) {
255  return token( it, ++i1, TOKEN_OPERATOR);
256 
257  } else if ( *i1 == '-' ) {
258  ++i1;
259 
260  if( i1 != i2 ) {
261  if( *i1 == '>' )
262  return token( it, ++i1, TOKEN_POINTER );
263  else
264  return token( it, i1, TOKEN_OPERATOR );
265  } else {
266  return token( it, i1, TOKEN_OPERATOR);
267  }
268 
269  } else if ( *i1 == '*' ) {
270  return token( it, ++i1, TOKEN_OPERATOR);
271 
272  } else if ( *i1 == '/' ) {
273  return token( it, ++i1, TOKEN_OPERATOR);
274 
275  } else if ( *i1 == '%' ) {
276  return token( it, ++i1, TOKEN_OPERATOR);
277 
278  } else if ( *i1 == '!' ) {
279  ++i1;
280  if( *i1 == '=' )
281  return token( it, ++i1, TOKEN_OPERATOR);
282  else
283  raise_exception(it, i2, std::string() );
284  }
285  }
286  raise_exception(it, i2, std::string());
287 }
288 
289 }
290 
291 }
TOKEN_TYPE
TOKEN_TYPE is already defined in a Winnt.h (a windows header which is included under some conditions...
Definition: tokenizer.hpp:27
Abstract baseclass for the tokenizer.
Definition: tokenizer.hpp:55
token get_token(iterator &i1, const iterator i2)
Definition: tokenizer.cpp:43
static void expr(LexState *ls, expdesc *v)
Definition: lparser.cpp:1078
static map_location::DIRECTION s
double t
Definition: astarsearch.cpp:63
Definition: contexts.hpp:43
std::string::const_iterator iterator
Definition: tokenizer.hpp:24