The Battle for Wesnoth  1.19.15+dev
tokenizer.hpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2010 - 2025
3  by Guillaume Melquiond <guillaume.melquiond@gmail.com>
4  Copyright (C) 2004 - 2009 by Philippe Plantier <ayin@anathas.org>
5  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
6 
7  This program is free software; you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation; either version 2 of the License, or
10  (at your option) any later version.
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY.
13 
14  See the COPYING file for more details.
15 */
16 
17 #pragma once
18 
19 //#define DEBUG_TOKENIZER
20 
21 #include "buffered_istream.hpp"
22 
23 #include <array>
24 #include <istream>
25 #include <string>
26 
27 // Marker byte that the preprocessor adds so that #line and #textdomain commands can receive special handling.
28 // The value is deliberately an illegal UTF-8 character; that choice was introduced in a76be7ef1e921dabacd99f16ef440bf9673b8d98.
29 constexpr unsigned char INLINED_PREPROCESS_DIRECTIVE_CHAR = 254;
30 
31 // Standard ASCII spans 0-127, so 128 is the first value past it.
32 // Extended ASCII (128-255) never needs any special handling.
33 constexpr int END_STANDARD_ASCII = 128;
34 
35 /**
36  * Holds the text currently being parsed together with the token_type that classifies it.
37  * For multi-character token types, value is a string with zero or more characters in it.
38  * Single-character token types each stand for one character that has special meaning for a config.
39  */
40 struct token
41 {
42  /**
43  * Values used for a token's type field.
44  */
46  {
47  // multi-character token types
48  /** unquoted text */
50  /** quoted string, enclosed either in double quotes or in less-than/greater-than symbols */
52  /** end of file was reached before the closing character of a QSTRING was found */
54  /** any characters that don't have special meaning */
56 
57  // single-character token types: each enumerator's value is the character itself
58  NEWLINE = '\n',
59  EQUALS = '=',
60  COMMA = ',',
61  PLUS = '+',
62  SLASH = '/',
63  OPEN_BRACKET = '[',
65  UNDERSCORE = '_',
66 
67  /** set when EOF is returned by the input stream; 256 is larger than any single-character enumerator above */
68  END = 256
69  };
70 
71  /**
72  * Named characters that are not used for a token's type field.
73  */
75  {
76  POUND = '#',
79  DOUBLE_QUOTE = '"',
80  DOLLAR = '$',
81  };
82 
84  /** the token's text; can be either a single character or multiple characters */
85  std::string value;
86 };
87 
88 /**
89  * Parses the provided text into tokens and tracks information about the current token.
90  * When built with the DEBUG_TOKENIZER compiler define it can also report the previous token.
91  * Apart from that, no history of the processing is kept.
92  */
93 class tokenizer
94 {
95 public:
96  tokenizer(std::istream& in);
97  ~tokenizer();
98 
99  /**
100  * Reads characters off of @a in_ and returns the next token (its type and its value).
101  */
102  const token &next_token();
103 
104  const token &current_token() const // read-only access to token_; consumes no input
105  {
106  return token_;
107  }
108 
109 #ifdef DEBUG_TOKENIZER
110  const token &previous_token() const // only available in DEBUG_TOKENIZER builds
111  {
112  return previous_token_;
113  }
114 #endif
115 
116  const std::string &textdomain() const // read-only access to textdomain_
117  {
118  return textdomain_;
119  }
120 
121  const std::string &get_file() const // read-only access to file_
122  {
123  return file_;
124  }
125 
126  int get_start_line() const // returns startlineno_
127  {
128  return startlineno_;
129  }
130 
131 private:
133  int current_;
134  int lineno_;
136 
137  /**
138  * Increments the line number if the current character is a newline,
139  * then sets current_ to the next character that's not `\r`.
140  */
141  void next_char()
142  {
143  if (current_ == token::NEWLINE)
144  ++lineno_;
146  }
147 
148  /**
149  * Sets current_ to the next character read from in_.
150  * Skips the `\r` in `\r\n` Windows-style line endings by reading one more character; only a single `\r` is skipped, and the character that follows is taken as-is.
151  * The test_cvs_2018_1999023_2.cfg file also uses `\r\n` line endings for some reason - otherwise that check isn't needed on non-Windows platforms since `\r` characters are removed from cfg files on upload.
152  */
154  {
155  current_ = in_.get();
156  if(current_ == '\r') {
157  current_ = in_.get();
158  }
159  }
160 
161  /**
162  * Returns the next character without advancing the current position in the istream.
163  */
164  int peek_char()
165  {
166  return in_.peek();
167  }
168 
169  /**
170  * The different types of characters distinguished while parsing.
171  * TOK_NONE is also the result for anything beyond standard ASCII.
172  */
174  {
175  TOK_NONE = 0,
178  TOK_ALPHA = 3
179  };
180 
181  character_type char_type(unsigned c) const // c is unsigned, so a negative int passed by the helpers below converts to a large value and yields TOK_NONE
182  {
183  return c < END_STANDARD_ASCII ? char_types_[c] : TOK_NONE; // bounds check keeps the index inside char_types_
184  }
185 
186  bool is_space(int c) const
187  {
188  return char_type(c) == TOK_SPACE;
189  }
190 
191  bool is_num(int c) const
192  {
193  return char_type(c) == TOK_NUMERIC;
194  }
195 
196  bool is_alnum(int c) const
197  {
198  return char_type(c) > TOK_SPACE; // relies on the numeric ordering of character_type: every type ordered after TOK_SPACE counts
199  }
200 
201  /**
202  * Skips over comments (inline and on a separate line) and performs the special processing needed for \#textdomain and \#line.
203  */
204  void skip_comment();
205 
206  /**
207  * Returns true if the next characters are the ones from @a cmd followed by a space. Skips all the matching characters.
208  * Currently only used by \#textdomain (specified by the WML) and \#line (added by the preprocessor).
209  */
210  bool skip_command(char const *cmd);
211 
212  std::string textdomain_;
213  std::string file_;
215 #ifdef DEBUG_TOKENIZER
216  token previous_token_;
217 #endif
219  std::array<character_type, END_STANDARD_ASCII> char_types_; // lookup table indexed by character values below END_STANDARD_ASCII
220 };
unsigned in
If equal to search_counter, the node is off the list.
Definition: astarsearch.cpp:70
Helper class for buffering a std::istream.
Helper class for buffering a std::istream.
int get()
Gets and consumes a character from the buffer.
int peek()
Gets a character from the buffer.
class responsible for parsing the provided text into tokens and tracking information about the current token.
Definition: tokenizer.hpp:94
bool is_space(int c) const
Definition: tokenizer.hpp:186
std::string file_
Definition: tokenizer.hpp:213
int peek_char()
return the next character without incrementing the current position in the istream
Definition: tokenizer.hpp:164
std::array< character_type, END_STANDARD_ASCII > char_types_
Definition: tokenizer.hpp:219
buffered_istream in_
Definition: tokenizer.hpp:218
bool is_num(int c) const
Definition: tokenizer.hpp:191
void skip_comment()
handles skipping over comments (inline and on a separate line) as well as the special processing needed for #textdomain and #line
Definition: tokenizer.cpp:222
const std::string & textdomain() const
Definition: tokenizer.hpp:116
const token & next_token()
Reads characters off of in_ to return the next token type and its value.
Definition: tokenizer.cpp:51
bool skip_command(char const *cmd)
Returns true if the next characters are the ones from cmd followed by a space.
Definition: tokenizer.cpp:202
int current_
Definition: tokenizer.hpp:133
bool is_alnum(int c) const
Definition: tokenizer.hpp:196
const token & current_token() const
Definition: tokenizer.hpp:104
std::string textdomain_
Definition: tokenizer.hpp:212
void next_char()
increments the line number if the current character is a newline, then sets current_ to the next character that's not \r
Definition: tokenizer.hpp:141
void next_char_skip_cr()
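sets current_ to the next character, skipping the \r in the \r\n Windows-style line endings; the test_cvs_2018_1999023_2.cfg file also uses \r\n line endings for some reason - otherwise that check isn't needed on non-Windows platforms since \r characters are removed from cfg files on upload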
Definition: tokenizer.hpp:153
token token_
Definition: tokenizer.hpp:214
int startlineno_
Definition: tokenizer.hpp:135
const std::string & get_file() const
Definition: tokenizer.hpp:121
character_type
the different types of characters while parsing; TOK_NONE is also the default for anything beyond standard ascii
Definition: tokenizer.hpp:174
character_type char_type(unsigned c) const
Definition: tokenizer.hpp:181
int get_start_line() const
Definition: tokenizer.hpp:126
constexpr unsigned char INLINED_PREPROCESS_DIRECTIVE_CHAR
Definition: tokenizer.hpp:29
constexpr int END_STANDARD_ASCII
Definition: tokenizer.hpp:33
contains the current text being parsed as well as the token_type of what's being parsed.
Definition: tokenizer.hpp:41
token_type
used for a token's type field
Definition: tokenizer.hpp:46
@ SLASH
Definition: tokenizer.hpp:62
@ QSTRING
quoted string, contained within double quotes or by less than/greater than symbols
Definition: tokenizer.hpp:51
@ COMMA
Definition: tokenizer.hpp:60
@ PLUS
Definition: tokenizer.hpp:61
@ MISC
any characters that don't have special meaning
Definition: tokenizer.hpp:55
@ CLOSE_BRACKET
Definition: tokenizer.hpp:64
@ UNTERMINATED_QSTRING
reached end of file without finding the closing character for a QSTRING
Definition: tokenizer.hpp:53
@ NEWLINE
Definition: tokenizer.hpp:58
@ EQUALS
Definition: tokenizer.hpp:59
@ UNDERSCORE
Definition: tokenizer.hpp:65
@ OPEN_BRACKET
Definition: tokenizer.hpp:63
@ END
set when EOF is returned by the input stream
Definition: tokenizer.hpp:68
@ STRING
unquoted text
Definition: tokenizer.hpp:49
source_chars
not used for a token's type field
Definition: tokenizer.hpp:75
@ DOUBLE_QUOTE
Definition: tokenizer.hpp:79
@ DOLLAR
Definition: tokenizer.hpp:80
@ LEFT_ANGLE_BRACKET
Definition: tokenizer.hpp:77
@ RIGHT_ANGLE_BRACKET
Definition: tokenizer.hpp:78
@ POUND
Definition: tokenizer.hpp:76
token_type type
Definition: tokenizer.hpp:83
std::string value
the token's value, can be either a single character or multiple characters
Definition: tokenizer.hpp:85
mock_char c