The Battle for Wesnoth  1.19.5+dev
tokenizer.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2010 - 2024
3  by Guillaume Melquiond <guillaume.melquiond@gmail.com>
4  Copyright (C) 2004 - 2009 by Philippe Plantier <ayin@anathas.org>
5  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
6 
7  This program is free software; you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation; either version 2 of the License, or
10  (at your option) any later version.
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY.
13 
14  See the COPYING file for more details.
15 */
16 
18 #include "wesconfig.h"
19 
// Constructs a tokenizer that reads from the stream `in`.
// The line counter starts at 1, the start-line counter at 0, the textdomain
// defaults to PACKAGE, and the file name and current token start out empty.
// The body fills char_types_, the per-character classification table, and
// switches the underlying stream to throw on badbit.
20 tokenizer::tokenizer(std::istream& in) :
21  current_(EOF),
22  lineno_(1),
23  startlineno_(0),
24  textdomain_(PACKAGE),
25  file_(),
26  token_(),
27  in_(in)
28 {
    // Classify every code below END_STANDARD_ASCII once, up front.
29  for (int c = 0; c < END_STANDARD_ASCII; ++c)
30  {
    // NOTE(review): the declaration and initial value of t are not visible
    // in this listing. Codes that match none of the branches below keep
    // whatever t held on entry - presumably TOK_NONE, reset on every
    // iteration; confirm against the real source.
    // Letters and the underscore are TOK_ALPHA, decimal digits are
    // TOK_NUMERIC, space and tab are TOK_SPACE.
32  if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
33  t = TOK_ALPHA;
34  } else if (c >= '0' && c <= '9') {
35  t = TOK_NUMERIC;
36  } else if (c == ' ' || c == '\t') {
37  t = TOK_SPACE;
38  }
39  char_types_[c] = t;
40  }
    // Ask the wrapped stream to throw when badbit gets set, instead of
    // failing silently.
41  in_.stream().exceptions(std::ios_base::badbit);
43 }
44 
46 {
    // NOTE(review): the signature line of this definition is not visible in
    // this listing; presumably this is the destructor, undoing the exception
    // mask installed by the constructor - confirm.
    // Reset the stream state to goodbit, then set the exception mask to
    // goodbit so that no state flag triggers an exception any more.
47  in_.stream().clear(std::ios_base::goodbit);
48  in_.stream().exceptions(std::ios_base::goodbit);
49 }
50 
52 {
    // Reads characters from in_ and returns a reference to token_, filled in
    // with the type and text of the next token.
    // NOTE(review): the signature line and a number of statements of this
    // function are not visible in this listing (the embedded line numbers
    // have gaps). The comments added here describe only what is shown.
    // With DEBUG_TOKENIZER defined, keep a copy of the token produced by the
    // previous call before it gets overwritten.
53 #ifdef DEBUG_TOKENIZER
54  previous_token_ = token_;
55 #endif
    // token_ is reused across calls, so start from an empty value.
56  token_.value.clear();
57 
58  // Dump spaces and inlined comments
59  while(true)
60  {
61  while (is_space(current_)) {
63  }
65  break;
66  skip_comment();
67  // skip the line end
69  }
70 
71  // skip comments on their own line
72  if (current_ == token::POUND)
73  skip_comment();
74 
75  // set the line number the next token will start on
77 
    // Dispatch on the first character of the token.
78  switch(current_) {
79  // we reached the end of the file being read
80  case EOF:
82  break;
83 
84  // handle open/closed angle brackets
85  // most commonly used for enclosing lua code
86  // more generally is used to indicate the preprocessor should skip over a particular block of text
88  // if there aren't double left angle brackets, there is no extra handling needed - this is just a regular left angle bracket
92  break;
93  }
94 
95  // else, treat this like a quoted string
98 
99  // keep getting characters and appending them to the current token's value until either the file ends or double right angle brackets are found
100  // finding the end of the file first is an error since double left angle brackets must always be closed by double right angle brackets
101  for (;;) {
102  next_char();
103  if (current_ == EOF) {
105  break;
108  break;
109  }
110  token_.value += current_;
111  }
112  break;
113 
114  // very similar to the double left+right angle bracket handling
115  // the main difference is the need to handle INLINED_PREPROCESS_DIRECTIVE_CHAR since double quotes don't affect the preprocessor
116  case token::DOUBLE_QUOTE:
118 
119  for (;;) {
120  next_char();
121  if (current_ == EOF) {
123  break;
124  } else if (current_ == token::DOUBLE_QUOTE) {
    // A double quote ends the string only when the character after it
    // is not another double quote; peek_char() looks ahead without
    // consuming. What is done with a doubled quote is not visible here.
125  if (peek_char() != token::DOUBLE_QUOTE) {
126  break;
127  } else {
129  }
130  }
131 
132  // ignore this line and decrement the current line number
134  skip_comment();
135  --lineno_;
136  continue;
137  }
138 
139  token_.value += current_;
140  }
141  break;
142 
143  // tag name delimiters
144  case token::OPEN_BRACKET:
146  // closing tag
147  case token::SLASH:
148  case token::NEWLINE:
149  case token::EQUALS:
150  // handles multiple attributes on the same line
151  // ie: x,y = 5,5
152  case token::COMMA:
153  // tag merge aka node append, or string concatenation
154  case token::PLUS:
    // For these single-character tokens the character code itself is cast
    // to the token type.
155  token_.type = static_cast<token::token_type>(current_);
157  break;
158 
159  // when in front of a QSTRING, indicates that the string is translatable
160  case token::UNDERSCORE:
161  // this check seems off - there are certainly other non-alphanumeric characters that shouldn't mean anything - but it looks like the parser handles those cases
162  if (!is_alnum(peek_char())) {
165  break;
166  }
    // An underscore followed by an alphanumeric character is handled by
    // the default case below.
167  [[fallthrough]];
168 
169  // everything else
170  default:
171  // if alphanumeric (regular text) or the dollar sign (variable)
172  // not quite sure how this works with non-ascii text particularly since the parser doesn't reference token_type::MISC
173  // but maybe the default handling does what's needed
176 
    // Accumulate characters for as long as they are alphanumeric or a
    // dollar sign.
177  do {
178  token_.value += current_;
180 
182  skip_comment();
184  }
185  } while (is_alnum(current_) || current_ == token::DOLLAR);
186  } else {
188  token_.value += current_;
189  next_char();
190  }
    // This case has already advanced past the token, so it returns here
    // and bypasses the shared next_char() after the switch.
191  return token_;
192  }
193 
194  // if this isn't the end of the file, get the next character in preparation for the next call to this method
195  if (current_ != EOF) {
196  next_char();
197  }
198 
199  return token_;
200 }
201 
// Checks whether the input continues with the text in cmd followed by a
// space character.
// cmd: NUL-terminated text to compare against the input. The callers in this
//      file pass the command name without its first letter (extdomain, ine),
//      having already tested that letter themselves.
// Returns true only when every character of cmd matched and is_space()
// accepts the character after it; returns false at the first mismatch.
// NOTE(review): the statements that advance current_ are not visible in this
// listing (embedded lines 206, 213 and 218 are absent), so how much input has
// been consumed when false is returned cannot be stated from here.
202 bool tokenizer::skip_command(char const *cmd)
203 {
204  // check that the characters match the provided text, else return false
205  for (; *cmd; ++cmd) {
207  if (current_ != *cmd) {
208  return false;
209  }
210  }
211 
212  // check that it's followed by a space, else return false
214  if (!is_space(current_)) {
215  return false;
216  }
217 
219  return true;
220 }
221 
223 {
    // Skips a comment and recognises two special comment commands:
    // textdomain, whose argument is stored in textdomain_, and line, which
    // sets lineno_ and stores its argument in file_.
    // NOTE(review): the signature line and the statements that advance
    // current_ are not visible in this listing; the comments added here
    // describe only what is shown.
224  // nothing to do if the line ends or the file ends
226  if (current_ == token::NEWLINE || current_ == EOF) {
227  return;
228  }
229 
230  // used to point to either textdomain_ or file_, and populate that field with the value following the respective command
231  std::string *dst = nullptr;
232 
233  // if this is a #textdomain, point to textdomain_
234  if (current_ == 't')
235  {
    // The leading t was matched above, so only the remainder is checked.
236  if (!skip_command("extdomain")) {
237  goto not_a_command;
238  }
239  dst = &textdomain_;
240  }
241  // else if this is a #line, determine the line number and then point to file_
242  else if (current_ == 'l')
243  {
244  if (!skip_command("ine")) {
245  goto not_a_command;
246  }
247 
    // Parse the decimal line number digit by digit.
    // NOTE(review): lineno_ is overwritten here before the command is
    // fully validated. If the digits are not followed by a space, control
    // jumps to not_a_command with lineno_ already replaced - confirm this
    // is intended.
248  lineno_ = 0;
249  while (is_num(current_)) {
250  // ie if the line number is 587
251  // (0 * 10) + 5 = 5
252  // (5 * 10) + 8 = 58
253  // (58 * 10) + 7 = 587
254  lineno_ = lineno_ * 10 + (current_ - '0');
256  }
257 
258  if (!is_space(current_)) {
259  goto not_a_command;
260  }
261 
263  dst = &file_;
264  }
265  // else this turned out to not be a #textdomain or a #line, then this is a normal comment so just read off characters until finding the next line or the end of the file
266  else
267  {
    // Reached either by taking this else branch or by goto from the two
    // branches above when command matching fails.
268  not_a_command:
269  while (current_ != token::NEWLINE && current_ != EOF) {
271  }
    // Ordinary comment: return without touching dst.
272  return;
273  }
274 
    // Only reached with dst pointing at textdomain_ or file_; every other
    // path returned above, so the dereference below never sees nullptr.
275  // clear the current value of either textdomain_ or file_ and populate it with the new value
276  dst->clear();
277  while (current_ != token::NEWLINE && current_ != EOF) {
278  *dst += current_;
280  }
281 }
double t
Definition: astarsearch.cpp:63
std::istream & stream()
Returns the owned stream.
bool is_space(int c) const
Definition: tokenizer.hpp:191
std::string file_
Definition: tokenizer.hpp:218
int peek_char()
return the next character without incrementing the current position in the istream
Definition: tokenizer.hpp:169
std::array< character_type, END_STANDARD_ASCII > char_types_
Definition: tokenizer.hpp:224
buffered_istream in_
Definition: tokenizer.hpp:223
bool is_num(int c) const
Definition: tokenizer.hpp:196
void skip_comment()
handles skipping over comments (inline and on a separate line) as well as the special processing need...
Definition: tokenizer.cpp:222
const token & next_token()
Reads characters off of in_ to return the next token type and its value.
Definition: tokenizer.cpp:51
bool skip_command(char const *cmd)
Returns true if the next characters are the ones from cmd followed by a space.
Definition: tokenizer.cpp:202
int current_
Definition: tokenizer.hpp:138
bool is_alnum(int c) const
Definition: tokenizer.hpp:201
std::string textdomain_
Definition: tokenizer.hpp:217
void next_char()
increments the line number if the current character is a newline set current_ to the next character t...
Definition: tokenizer.hpp:146
void next_char_skip_cr()
set current_ to the next character skip the \r in the \r\n Windows-style line endings the test_cvs_20...
Definition: tokenizer.hpp:158
token token_
Definition: tokenizer.hpp:219
int startlineno_
Definition: tokenizer.hpp:140
character_type
the different types of characters while parsing TOK_NONE is also the default for anything beyond stan...
Definition: tokenizer.hpp:179
unsigned in
If equal to search_counter, the node is off the list.
constexpr unsigned char INLINED_PREPROCESS_DIRECTIVE_CHAR
Definition: tokenizer.hpp:29
constexpr int END_STANDARD_ASCII
Definition: tokenizer.hpp:33
rect dst
Location on the final composed sheet.
contains the current text being parsed as well as the token_type of what's being parsed.
Definition: tokenizer.hpp:41
token_type
used for a token's type field
Definition: tokenizer.hpp:51
@ SLASH
Definition: tokenizer.hpp:67
@ QSTRING
quoted string, contained within double quotes or by less than/greater than symbols
Definition: tokenizer.hpp:56
@ COMMA
Definition: tokenizer.hpp:65
@ PLUS
Definition: tokenizer.hpp:66
@ MISC
any characters that don't have special meaning
Definition: tokenizer.hpp:60
@ CLOSE_BRACKET
Definition: tokenizer.hpp:69
@ UNTERMINATED_QSTRING
reached end of file without finding the closing character for a QSTRING
Definition: tokenizer.hpp:58
@ NEWLINE
Definition: tokenizer.hpp:63
@ EQUALS
Definition: tokenizer.hpp:64
@ UNDERSCORE
Definition: tokenizer.hpp:70
@ OPEN_BRACKET
Definition: tokenizer.hpp:68
@ END
set when EOF is returned by the input stream
Definition: tokenizer.hpp:73
@ STRING
unquoted text
Definition: tokenizer.hpp:54
@ DOUBLE_QUOTE
Definition: tokenizer.hpp:84
@ DOLLAR
Definition: tokenizer.hpp:85
@ LEFT_ANGLE_BRACKET
Definition: tokenizer.hpp:82
@ RIGHT_ANGLE_BRACKET
Definition: tokenizer.hpp:83
@ POUND
Definition: tokenizer.hpp:81
token_type type
Definition: tokenizer.hpp:88
std::string value
the token's value, can be either a single character or multiple characters
Definition: tokenizer.hpp:90
mock_char c
Some defines: VERSION, PACKAGE, MIN_SAVEGAME_VERSION.
#define PACKAGE
Definition: wesconfig.h:23