The Battle for Wesnoth  1.13.10+dev
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tokenizer.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 - 2017 by David White <dave@whitevine.net>
3  Part of the Battle for Wesnoth Project http://www.wesnoth.org/
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 2 of the License, or
8  (at your option) any later version.
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY.
11 
12  See the COPYING file for more details.
13 */
14 
15 #include <sstream>
16 
17 #include "formula/tokenizer.hpp"
18 
19 namespace wfl
20 {
21 namespace tokenizer
22 {
23 
24 namespace {
25 
26 void raise_exception(iterator& i1, iterator i2, std::string str) {
27  std::ostringstream expr;
28  while( (i1 != i2) && (*i1 != '\n') ) {
29  if( (*i1 != '\t') )
30  expr << *i1;
31  ++i1;
32  }
33 
34  if( str.empty() )
35  throw token_error("Unrecognized token", expr.str() );
36  else
37  throw token_error(str, expr.str() );
38 }
39 
40 }
41 
42 token get_token(iterator& i1, const iterator i2) {
43 
44  iterator it = i1;
45  if( *i1 >= 'A' ) {
46  //current character is >= 'A', limit search to the upper-half of the ASCII table
47 
48  // check if we parse now TOKEN_IDENTIFIER or TOKEN_OPERATOR/KEYWORD based on string
49  if( *i1 <= 'Z' || ( *i1 >= 'a' && *it <= 'z' ) || *i1 == '_' ) {
50 
51  while( i1 != i2 && ( ( *i1 >= 'a' && *i1 <= 'z' ) || *i1 == '_' || ( *i1 >= 'A' && *i1 <= 'Z' ) ) )
52  ++i1;
53 
54  int diff = i1 - it;
56 
57  //check if this string matches any keyword or an operator
58  //possible operators and keywords:
59  // d, or, in, def, and, not, wfl, where, wflend, functions
60  if( diff == 1 ) {
61  if( *it == 'd' )
62  t = TOKEN_OPERATOR;
63  } else if( diff == 2 ) {
64  if( *it == 'o' && *(it+1) == 'r' )
65  t = TOKEN_OPERATOR;
66  else if( *it == 'i' && *(it+1) == 'n' )
67  t = TOKEN_OPERATOR;
68  } else if( diff == 3 ) {
69  if( *it == 'd' ) { //def
70  if( *(it+1) == 'e' && *(it+2) == 'f' )
71  t = TOKEN_KEYWORD;
72  } else if( *it == 'a' ) { //and
73  if( *(it+1) == 'n' && *(it+2) == 'd' )
74  t = TOKEN_OPERATOR;
75  } else if( *it == 'n' ) { //not
76  if( *(it+1) == 'o' && *(it+2) == 't' )
77  t = TOKEN_OPERATOR;
78  } else if( *it == 'f' ) { //fai
79  if( *(it+1) == 'a' && *(it+2) == 'i' )
80  t = TOKEN_KEYWORD;
81  } else if( *it == 'w' ) { //wfl
82  if( *(it+1) == 'f' && *(it+2) == 'l' )
83  t = TOKEN_KEYWORD;
84  }
85  } else if( diff == 5 ) {
86  std::string s(it, i1);
87  if( s == "where" )
88  t = TOKEN_OPERATOR;
89  } else if( diff == 6 ) {
90  std::string s(it, i1);
91  if( s == "faiend" )
92  t = TOKEN_KEYWORD;
93  else if( s == "wflend" )
94  t = TOKEN_KEYWORD;
95  } else if( diff == 9 ) {
96  std::string s(it, i1);
97  if( s == "functions" )
98  t = TOKEN_KEYWORD;
99  }
100 
101  return token( it, i1, t);
102  } else {
103  //at this point only 3 chars left to check:
104  if( *i1 == '[' )
105  return token( it, ++i1, TOKEN_LSQUARE );
106 
107  if( *i1 == ']' )
108  return token( it, ++i1, TOKEN_RSQUARE );
109 
110  if( *i1 == '^' )
111  return token( it, ++i1, TOKEN_OPERATOR );
112 
113  if( *i1 == '~' )
114  return token( it, ++i1, TOKEN_OPERATOR );
115 
116  //unused characters in this range:
117  // \ ` { | }
118  // Note: {} should never be used since they play poorly with WML preprocessor
119  }
120  } else {
121  //limit search to the lower-half of the ASCII table
122  //start by checking for whitespaces/end of line char
123  if( *i1 <= ' ' ) {
124  if( *i1 == '\n' ) {
125  return token( it, ++i1, TOKEN_EOL);
126  } else {
127 
128  while( i1 != i2 && *i1 <= ' ' && *i1 != '\n' )
129  ++i1;
130 
131  return token( it, i1, TOKEN_WHITESPACE );
132  }
133  //try to further limit number of characters that we need to check:
134  } else if ( *i1 >= '0' ){
135  //current character is between '0' and '@'
136  if( *i1 <= '9' ) {
137  //we parse integer or decimal number
138  ++i1;
139  bool dot = false;
140 
141  while( i1 != i2 ) {
142  if( *i1 >= '0' && *i1 <= '9' ) {
143  //do nothing
144  } else {
145  //look for '.' in case of decimal number
146  if( *i1 == '.' ) {
147  //allow only one dot in such expression
148  if( !dot )
149  dot = true;
150  else
151  raise_exception(it, i2, "Multiple dots near decimal expression");
152  } else
153  break;
154  }
155  ++i1;
156  }
157 
158  if( dot )
159  return token( it, i1, TOKEN_DECIMAL );
160  else
161  return token( it, i1, TOKEN_INTEGER );
162 
163  } else {
164  //current character is between ':' and '@'
165  //possible tokens at this point that we are interested in:
166  // ; < = > <= >=
167  //unused characters in this range:
168  // : ? @
169 
170  if( *i1 == ';' ) {
171  return token( it, ++i1, TOKEN_SEMICOLON);
172  } else if( *i1 == '=' ) {
173  return token( it, ++i1, TOKEN_OPERATOR);
174  } else if( *i1 == '<' ) {
175  ++i1;
176  if( i1 != i2 ) {
177  if( *i1 == '=' )
178  return token( it, ++i1, TOKEN_OPERATOR);
179  else
180  return token( it, i1, TOKEN_OPERATOR);
181  } else
182  return token( it, i1, TOKEN_OPERATOR);
183  } else if( *i1 == '>' ) {
184  ++i1;
185  if( i1 != i2 ) {
186  if( *i1 == '=' )
187  return token( it, ++i1, TOKEN_OPERATOR);
188  else
189  return token( it, i1, TOKEN_OPERATOR);
190  } else
191  return token( it, i1, TOKEN_OPERATOR);
192  }
193  }
194  //current character is between '!' and '/'
195  //possible tokens:
196  // , . .+ .- .* ./ .. ( ) ' # + - -> * / % !=
197  //unused characters:
198  // ! " $ &
199  // ! is used only as part of !=
200  // Note: " should never be used since it plays poorly with WML
201  } else if ( *i1 == ',' ) {
202  return token( it, ++i1, TOKEN_COMMA);
203 
204  } else if ( *i1 == '.' ) {
205  ++i1;
206 
207  if( i1 != i2 ) {
208  if( *i1 == '+' || *i1 == '-' || *i1 == '*' || *i1 == '/' || *i1 == '.')
209  return token( it, ++i1, TOKEN_OPERATOR );
210  else
211  return token( it, i1, TOKEN_OPERATOR );
212  } else {
213  return token( it, i1, TOKEN_OPERATOR);
214  }
215 
216  } else if ( *i1 == '(' ) {
217  return token( it, ++i1, TOKEN_LPARENS);
218 
219  } else if ( *i1 == ')' ) {
220  return token( it, ++i1, TOKEN_RPARENS);
221 
222  } else if ( *i1 == '\'' ) {
223  int bracket_depth = 0;
224  ++i1;
225  while (i1 != i2) {
226  if (*i1 == '[') {
227  bracket_depth++;
228  } else if(bracket_depth > 0 && *i1 == ']') {
229  bracket_depth--;
230  } else if(bracket_depth == 0 && *i1 == '\'') {
231  break;
232  }
233  ++i1;
234  }
235 
236  if( i1 != i2 ) {
237  return token( it, ++i1, TOKEN_STRING_LITERAL );
238  } else {
239  raise_exception(it, i2, "Missing closing ' for formula string");
240  }
241 
242  } else if ( *i1 == '#' ) {
243  ++i1;
244  while( i1 != i2 && *i1 != '#' )
245  ++i1;
246 
247  if( i1 != i2 ) {
248  return token( it, ++i1, TOKEN_COMMENT );
249  } else {
250  raise_exception(it, i2, "Missing closing # for formula comment");
251  }
252 
253  } else if ( *i1 == '+' ) {
254  return token( it, ++i1, TOKEN_OPERATOR);
255 
256  } else if ( *i1 == '-' ) {
257  ++i1;
258 
259  if( i1 != i2 ) {
260  if( *i1 == '>' )
261  return token( it, ++i1, TOKEN_POINTER );
262  else
263  return token( it, i1, TOKEN_OPERATOR );
264  } else {
265  return token( it, i1, TOKEN_OPERATOR);
266  }
267 
268  } else if ( *i1 == '*' ) {
269  return token( it, ++i1, TOKEN_OPERATOR);
270 
271  } else if ( *i1 == '/' ) {
272  return token( it, ++i1, TOKEN_OPERATOR);
273 
274  } else if ( *i1 == '%' ) {
275  return token( it, ++i1, TOKEN_OPERATOR);
276 
277  } else if ( *i1 == '!' ) {
278  ++i1;
279  if( *i1 == '=' )
280  return token( it, ++i1, TOKEN_OPERATOR);
281  else
282  raise_exception(it, i2, std::string() );
283  }
284  }
285  raise_exception(it, i2, std::string() );
286  return token();
287 }
288 
289 }
290 
291 }
TOKEN_TYPE
TOKEN_TYPE is already defined in a Winnt.h (a windows header which is included under some conditions...
Definition: tokenizer.hpp:27
std::vector< char_t > string
Abstract baseclass for the tokenizer.
Definition: tokenizer.hpp:55
token get_token(iterator &i1, const iterator i2)
Definition: tokenizer.cpp:42
static void expr(LexState *ls, expdesc *v)
Definition: lparser.cpp:1078
static map_location::DIRECTION s
double t
Definition: astarsearch.cpp:64
Definition: contexts.hpp:42
std::string::const_iterator iterator
Definition: tokenizer.hpp:24