The Battle for Wesnoth  1.19.0-dev
tokenizer.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 - 2024
3  by David White <dave@whitevine.net>
4  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
5 
6  This program is free software; you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation; either version 2 of the License, or
9  (at your option) any later version.
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY.
12 
13  See the COPYING file for more details.
14 */
15 
16 #include "formula/tokenizer.hpp"
17 
18 #include <locale>
19 #include <sstream>
20 
21 namespace wfl
22 {
23 namespace tokenizer
24 {
25 
26 namespace {
27 
28 [[noreturn]] void raise_exception(iterator& i1, iterator i2, std::string str) {
29  std::ostringstream expr;
30  while( (i1 != i2) && (*i1 != '\n') ) {
31  if( (*i1 != '\t') )
32  expr << *i1;
33  ++i1;
34  }
35 
36  if( str.empty() )
37  throw token_error("Unrecognized token", expr.str() );
38  else
39  throw token_error(str, expr.str() );
40 }
41 
42 }
43 
44 token get_token(iterator& i1, const iterator i2) {
45 
46  iterator it = i1;
47  if( *i1 >= 'A' ) {
48  //current character is >= 'A', limit search to the upper-half of the ASCII table
49 
50  // check if we parse now token_type::identifier or token_type::operator_token/keyword based on string
51  if( *i1 <= 'Z' || ( *i1 >= 'a' && *it <= 'z' ) || *i1 == '_' ) {
52 
53  while(i1 != i2 && (std::isalpha(*i1, std::locale::classic()) || *i1 == '_'))
54  ++i1;
55 
56  int diff = i1 - it;
58 
59  //check if this string matches any keyword or an operator
60  //possible operators and keywords:
61  // d, or, in, def, and, not, wfl, where, wflend, functions
62  if( diff == 1 ) {
63  if( *it == 'd' )
65  } else if( diff == 2 ) {
66  if( *it == 'o' && *(it+1) == 'r' )
68  else if( *it == 'i' && *(it+1) == 'n' )
70  } else if( diff == 3 ) {
71  if( *it == 'd' ) { //def
72  if( *(it+1) == 'e' && *(it+2) == 'f' )
74  } else if( *it == 'a' ) { //and
75  if( *(it+1) == 'n' && *(it+2) == 'd' )
77  } else if( *it == 'n' ) { //not
78  if( *(it+1) == 'o' && *(it+2) == 't' )
80  } else if( *it == 'f' ) { //fai
81  if( *(it+1) == 'a' && *(it+2) == 'i' )
83  } else if( *it == 'w' ) { //wfl
84  if( *(it+1) == 'f' && *(it+2) == 'l' )
86  }
87  } else if( diff == 5 ) {
88  std::string s(it, i1);
89  if( s == "where" )
91  } else if( diff == 6 ) {
92  std::string s(it, i1);
93  if( s == "faiend" )
95  else if( s == "wflend" )
97  } else if( diff == 9 ) {
98  std::string s(it, i1);
99  if( s == "functions" )
101  }
102 
103  return token( it, i1, t);
104  } else {
105  //at this point only 3 chars left to check:
106  if( *i1 == '[' )
107  return token( it, ++i1, token_type::lsquare );
108 
109  if( *i1 == ']' )
110  return token( it, ++i1, token_type::rsquare );
111 
112  if( *i1 == '^' )
113  return token( it, ++i1, token_type::operator_token );
114 
115  if( *i1 == '~' )
116  return token( it, ++i1, token_type::operator_token );
117 
118  //unused characters in this range:
119  // \ ` { | }
120  // Note: {} should never be used since they play poorly with WML preprocessor
121  }
122  } else {
123  //limit search to the lower-half of the ASCII table
124  //start by checking for whitespaces/end of line char
125  if( *i1 <= ' ' ) {
126  if( *i1 == '\n' ) {
127  return token( it, ++i1, token_type::eol);
128  } else {
129 
130  while( i1 != i2 && *i1 <= ' ' && *i1 != '\n' )
131  ++i1;
132 
133  return token( it, i1, token_type::whitespace );
134  }
135  //try to further limit number of characters that we need to check:
136  } else if ( *i1 >= '0' ){
137  //current character is between '0' and '@'
138  if( *i1 <= '9' ) {
139  //we parse integer or decimal number
140  ++i1;
141  bool dot = false;
142 
143  while( i1 != i2 ) {
144  if( *i1 >= '0' && *i1 <= '9' ) {
145  //do nothing
146  } else {
147  //look for '.' in case of decimal number
148  if( *i1 == '.' ) {
149  //allow only one dot in such expression
150  if( !dot )
151  dot = true;
152  else
153  raise_exception(it, i2, "Multiple dots near decimal expression");
154  } else
155  break;
156  }
157  ++i1;
158  }
159 
160  if( dot )
161  return token( it, i1, token_type::decimal );
162  else
163  return token( it, i1, token_type::integer );
164 
165  } else {
166  //current character is between ':' and '@'
167  //possible tokens at this point that we are interested in:
168  // ; < = > <= >=
169  //unused characters in this range:
170  // : ? @
171 
172  if( *i1 == ';' ) {
173  return token( it, ++i1, token_type::semicolon);
174  } else if( *i1 == '=' ) {
175  return token( it, ++i1, token_type::operator_token);
176  } else if( *i1 == '<' ) {
177  ++i1;
178  if( i1 != i2 ) {
179  if( *i1 == '=' )
180  return token( it, ++i1, token_type::operator_token);
181  else
182  return token( it, i1, token_type::operator_token);
183  } else
184  return token( it, i1, token_type::operator_token);
185  } else if( *i1 == '>' ) {
186  ++i1;
187  if( i1 != i2 ) {
188  if( *i1 == '=' )
189  return token( it, ++i1, token_type::operator_token);
190  else
191  return token( it, i1, token_type::operator_token);
192  } else
193  return token( it, i1, token_type::operator_token);
194  }
195  }
196  //current character is between '!' and '/'
197  //possible tokens:
198  // , . .+ .- .* ./ .. ( ) ' # + - -> * / % !=
199  //unused characters:
200  // ! " $ &
201  // ! is used only as part of !=
202  // Note: " should never be used since it plays poorly with WML
203  } else if ( *i1 == ',' ) {
204  return token( it, ++i1, token_type::comma);
205 
206  } else if ( *i1 == '.' ) {
207  ++i1;
208 
209  if( i1 != i2 ) {
210  if( *i1 == '+' || *i1 == '-' || *i1 == '*' || *i1 == '/' || *i1 == '.')
211  return token( it, ++i1, token_type::operator_token );
212  else
213  return token( it, i1, token_type::operator_token );
214  } else {
215  return token( it, i1, token_type::operator_token);
216  }
217 
218  } else if ( *i1 == '(' ) {
219  return token( it, ++i1, token_type::lparens);
220 
221  } else if ( *i1 == ')' ) {
222  return token( it, ++i1, token_type::rparens);
223 
224  } else if ( *i1 == '\'' ) {
225  int bracket_depth = 0;
226  ++i1;
227  while (i1 != i2) {
228  if (*i1 == '[') {
229  bracket_depth++;
230  } else if(bracket_depth > 0 && *i1 == ']') {
231  bracket_depth--;
232  } else if(bracket_depth == 0 && *i1 == '\'') {
233  break;
234  }
235  ++i1;
236  }
237 
238  if( i1 != i2 ) {
239  return token( it, ++i1, token_type::string_literal );
240  } else {
241  raise_exception(it, i2, "Missing closing ' for formula string");
242  }
243 
244  } else if ( *i1 == '#' ) {
245  ++i1;
246  while( i1 != i2 && *i1 != '#' )
247  ++i1;
248 
249  if( i1 != i2 ) {
250  return token( it, ++i1, token_type::comment );
251  } else {
252  raise_exception(it, i2, "Missing closing # for formula comment");
253  }
254 
255  } else if ( *i1 == '+' ) {
256  return token( it, ++i1, token_type::operator_token);
257 
258  } else if ( *i1 == '-' ) {
259  ++i1;
260 
261  if( i1 != i2 ) {
262  if( *i1 == '>' )
263  return token( it, ++i1, token_type::pointer );
264  else
265  return token( it, i1, token_type::operator_token );
266  } else {
267  return token( it, i1, token_type::operator_token);
268  }
269 
270  } else if ( *i1 == '*' ) {
271  return token( it, ++i1, token_type::operator_token);
272 
273  } else if ( *i1 == '/' ) {
274  return token( it, ++i1, token_type::operator_token);
275 
276  } else if ( *i1 == '%' ) {
277  return token( it, ++i1, token_type::operator_token);
278 
279  } else if ( *i1 == '!' ) {
280  ++i1;
281  if( *i1 == '=' )
282  return token( it, ++i1, token_type::operator_token);
283  else
284  raise_exception(it, i2, std::string() );
285  }
286  }
287  raise_exception(it, i2, std::string());
288 }
289 
290 }
291 
292 }
double t
Definition: astarsearch.cpp:63
Abstract baseclass for the tokenizer.
Definition: tokenizer.hpp:57
boost::variant< constant, n_var, boost::recursive_wrapper< not_op >, boost::recursive_wrapper< ternary_op > > expr
token get_token(iterator &i1, const iterator i2)
Definition: tokenizer.cpp:44
std::string::const_iterator iterator
Definition: tokenizer.hpp:25
Definition: contexts.hpp:43
static map_location::DIRECTION s