The Battle for Wesnoth  1.19.13+dev
tokenizer.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 - 2025
3  by David White <dave@whitevine.net>
4  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
5 
6  This program is free software; you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation; either version 2 of the License, or
9  (at your option) any later version.
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY.
12 
13  See the COPYING file for more details.
14 */
15 
16 #include "formula/tokenizer.hpp"
17 
18 #include <locale>
19 #include <sstream>
20 
21 namespace wfl
22 {
23 namespace tokenizer
24 {
25 
26 namespace {
27 
28 [[noreturn]] void raise_exception(iterator& i1, iterator i2, const std::string& str) {
29  std::ostringstream expr;
30  while( (i1 != i2) && (*i1 != '\n') ) {
31  if( (*i1 != '\t') )
32  expr << *i1;
33  ++i1;
34  }
35 
36  if( str.empty() )
37  throw token_error("Unrecognized token", expr.str() );
38  else
39  throw token_error(str, expr.str() );
40 }
41 
42 }
43 
44 token get_token(iterator& i1, const iterator i2) {
45 
46  iterator it = i1;
47  if( *i1 >= 'A' ) {
48  //current character is >= 'A', limit search to the upper-half of the ASCII table
49 
50  // check if we parse now token_type::identifier or token_type::operator_token/keyword based on string
51  if( *i1 <= 'Z' || ( *i1 >= 'a' && *it <= 'z' ) || *i1 == '_' ) {
52 
53  while(i1 != i2 && (std::isalpha(*i1, std::locale::classic()) || *i1 == '_'))
54  ++i1;
55 
56  int diff = i1 - it;
58 
59  //check if this string matches any keyword or an operator
60  //possible operators and keywords:
61  // d, or, in, def, and, not, wfl, where, wflend, functions
62  if( diff == 1 ) {
63  if( *it == 'd' )
65  } else if( diff == 2 ) {
66  if( *it == 'o' && *(it+1) == 'r' )
68  else if( *it == 'i' && *(it+1) == 'n' )
70  } else if( diff == 3 ) {
71  if( *it == 'd' ) { //def
72  if( *(it+1) == 'e' && *(it+2) == 'f' )
74  } else if( *it == 'a' ) { //and
75  if( *(it+1) == 'n' && *(it+2) == 'd' )
77  } else if( *it == 'n' ) { //not
78  if( *(it+1) == 'o' && *(it+2) == 't' )
80  } else if( *it == 'w' ) { //wfl
81  if( *(it+1) == 'f' && *(it+2) == 'l' )
83  }
84  } else if( diff == 5 ) {
85  std::string s(it, i1);
86  if( s == "where" )
88  } else if( diff == 6 ) {
89  std::string s(it, i1);
90  if( s == "wflend" )
92  } else if( diff == 9 ) {
93  std::string s(it, i1);
94  if( s == "functions" )
96  }
97 
98  return token( it, i1, t);
99  } else {
100  //at this point only 3 chars left to check:
101  if( *i1 == '[' )
102  return token( it, ++i1, token_type::lsquare );
103 
104  if( *i1 == ']' )
105  return token( it, ++i1, token_type::rsquare );
106 
107  if( *i1 == '^' )
108  return token( it, ++i1, token_type::operator_token );
109 
110  if( *i1 == '~' )
111  return token( it, ++i1, token_type::operator_token );
112 
113  //unused characters in this range:
114  // \ ` { | }
115  // Note: {} should never be used since they play poorly with WML preprocessor
116  }
117  } else {
118  //limit search to the lower-half of the ASCII table
119  //start by checking for whitespaces/end of line char
120  if( *i1 <= ' ' ) {
121  if( *i1 == '\n' ) {
122  return token( it, ++i1, token_type::eol);
123  } else {
124 
125  while( i1 != i2 && *i1 <= ' ' && *i1 != '\n' )
126  ++i1;
127 
128  return token( it, i1, token_type::whitespace );
129  }
130  //try to further limit number of characters that we need to check:
131  } else if ( *i1 >= '0' ){
132  //current character is between '0' and '@'
133  if( *i1 <= '9' ) {
134  //we parse integer or decimal number
135  ++i1;
136  bool dot = false;
137 
138  while( i1 != i2 ) {
139  if( *i1 >= '0' && *i1 <= '9' ) {
140  //do nothing
141  } else {
142  //look for '.' in case of decimal number
143  if( *i1 == '.' ) {
144  //allow only one dot in such expression
145  if( !dot )
146  dot = true;
147  else
148  raise_exception(it, i2, "Multiple dots near decimal expression");
149  } else
150  break;
151  }
152  ++i1;
153  }
154 
155  if( dot )
156  return token( it, i1, token_type::decimal );
157  else
158  return token( it, i1, token_type::integer );
159 
160  } else {
161  //current character is between ':' and '@'
162  //possible tokens at this point that we are interested in:
163  // ; < = > <= >=
164  //unused characters in this range:
165  // : ? @
166 
167  if( *i1 == ';' ) {
168  return token( it, ++i1, token_type::semicolon);
169  } else if( *i1 == '=' ) {
170  return token( it, ++i1, token_type::operator_token);
171  } else if( *i1 == '<' ) {
172  ++i1;
173  if( i1 != i2 ) {
174  if( *i1 == '=' )
175  return token( it, ++i1, token_type::operator_token);
176  else
177  return token( it, i1, token_type::operator_token);
178  } else
179  return token( it, i1, token_type::operator_token);
180  } else if( *i1 == '>' ) {
181  ++i1;
182  if( i1 != i2 ) {
183  if( *i1 == '=' )
184  return token( it, ++i1, token_type::operator_token);
185  else
186  return token( it, i1, token_type::operator_token);
187  } else
188  return token( it, i1, token_type::operator_token);
189  }
190  }
191  //current character is between '!' and '/'
192  //possible tokens:
193  // , . .+ .- .* ./ .. ( ) ' # + - -> * / % !=
194  //unused characters:
195  // ! " $ &
196  // ! is used only as part of !=
197  // Note: " should never be used since it plays poorly with WML
198  } else if ( *i1 == ',' ) {
199  return token( it, ++i1, token_type::comma);
200 
201  } else if ( *i1 == '.' ) {
202  ++i1;
203 
204  if( i1 != i2 ) {
205  if( *i1 == '+' || *i1 == '-' || *i1 == '*' || *i1 == '/' || *i1 == '.')
206  return token( it, ++i1, token_type::operator_token );
207  else
208  return token( it, i1, token_type::operator_token );
209  } else {
210  return token( it, i1, token_type::operator_token);
211  }
212 
213  } else if ( *i1 == '(' ) {
214  return token( it, ++i1, token_type::lparens);
215 
216  } else if ( *i1 == ')' ) {
217  return token( it, ++i1, token_type::rparens);
218 
219  } else if ( *i1 == '\'' ) {
220  int bracket_depth = 0;
221  ++i1;
222  while (i1 != i2) {
223  if (*i1 == '[') {
224  bracket_depth++;
225  } else if(bracket_depth > 0 && *i1 == ']') {
226  bracket_depth--;
227  } else if(bracket_depth == 0 && *i1 == '\'') {
228  break;
229  }
230  ++i1;
231  }
232 
233  if( i1 != i2 ) {
234  return token( it, ++i1, token_type::string_literal );
235  } else {
236  raise_exception(it, i2, "Missing closing ' for formula string");
237  }
238 
239  } else if ( *i1 == '#' ) {
240  ++i1;
241  while( i1 != i2 && *i1 != '#' )
242  ++i1;
243 
244  if( i1 != i2 ) {
245  return token( it, ++i1, token_type::comment );
246  } else {
247  raise_exception(it, i2, "Missing closing # for formula comment");
248  }
249 
250  } else if ( *i1 == '+' ) {
251  return token( it, ++i1, token_type::operator_token);
252 
253  } else if ( *i1 == '-' ) {
254  ++i1;
255 
256  if( i1 != i2 ) {
257  if( *i1 == '>' )
258  return token( it, ++i1, token_type::pointer );
259  else
260  return token( it, i1, token_type::operator_token );
261  } else {
262  return token( it, i1, token_type::operator_token);
263  }
264 
265  } else if ( *i1 == '*' ) {
266  return token( it, ++i1, token_type::operator_token);
267 
268  } else if ( *i1 == '/' ) {
269  return token( it, ++i1, token_type::operator_token);
270 
271  } else if ( *i1 == '%' ) {
272  return token( it, ++i1, token_type::operator_token);
273 
274  } else if ( *i1 == '!' ) {
275  ++i1;
276  if( *i1 == '=' )
277  return token( it, ++i1, token_type::operator_token);
278  else
279  raise_exception(it, i2, std::string() );
280  }
281  }
282  raise_exception(it, i2, std::string());
283 }
284 
285 }
286 
287 }
double t
Definition: astarsearch.cpp:63
class responsible for parsing the provided text into tokens and tracking information about the curren...
Definition: tokenizer.hpp:99
boost::variant< constant, n_var, boost::recursive_wrapper< not_op >, boost::recursive_wrapper< ternary_op > > expr
token get_token(iterator &i1, const iterator i2)
Definition: tokenizer.cpp:44
std::string::const_iterator iterator
Definition: tokenizer.hpp:25
Definition: callable.hpp:26
static map_location::direction s