The Battle for Wesnoth  1.19.5+dev
tokenizer.hpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2010 - 2024
3  by Guillaume Melquiond <guillaume.melquiond@gmail.com>
4  Copyright (C) 2004 - 2009 by Philippe Plantier <ayin@anathas.org>
5  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
6 
7  This program is free software; you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation; either version 2 of the License, or
10  (at your option) any later version.
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY.
13 
14  See the COPYING file for more details.
15 */
16 
17 #pragma once
18 
19 //#define DEBUG_TOKENIZER
20 
21 #include "buffered_istream.hpp"
22 
23 #include <array>
24 #include <istream>
25 #include <string>
26 
/**
 * Marker byte added by the preprocessor to allow special handling for the
 * \#line and \#textdomain commands.
 *
 * The value 254 (0xFE) is deliberately a byte that is illegal in UTF-8, so it
 * cannot be confused with regular text. Its use for this purpose was added in
 * commit a76be7ef1e921dabacd99f16ef440bf9673b8d98.
 */
constexpr unsigned char INLINED_PREPROCESS_DIRECTIVE_CHAR = 254;
30 
/**
 * One past the last standard ASCII code point.
 *
 * Normal ASCII is 0-127. Extended ASCII is 128-255, none of which need any
 * special handling.
 */
constexpr int END_STANDARD_ASCII = 128;
34 
/**
 * Contains the current text being parsed as well as the token_type of what's being parsed.
 * Multi-character token types will have a value that's a string with zero or more characters in it.
 * Single character token types are a single character with special meaning for a config.
 */
struct token
{
	/** Creates an END token with an empty value. */
	token() :
		type(END),
		value()
	{}

	/**
	 * Used for a token's type field.
	 */
	enum token_type
	{
		// multi-character
		// NOTE(review): the listing does not show explicit values for these four
		// enumerators; implicit numbering (0-3) is assumed - confirm against upstream.
		/** unquoted text */
		STRING,
		/** quoted string, contained within double quotes or by less than/greater than symbols */
		QSTRING,
		/** reached end of file without finding the closing character for a QSTRING */
		UNTERMINATED_QSTRING,
		/** any characters that don't have special meaning */
		MISC,

		// single characters: the enumerator value is the character itself
		NEWLINE = '\n',
		EQUALS = '=',
		COMMA = ',',
		PLUS = '+',
		SLASH = '/',
		OPEN_BRACKET = '[',
		// NOTE(review): value reconstructed as the counterpart of OPEN_BRACKET - confirm.
		CLOSE_BRACKET = ']',
		UNDERSCORE = '_',

		/** set when EOF is returned by the input stream */
		END = 256
	};

	/**
	 * Not used for a token's type field.
	 */
	enum source_chars
	{
		POUND = '#',
		// NOTE(review): the two angle-bracket values are reconstructed from the
		// enumerator names - confirm against upstream.
		LEFT_ANGLE_BRACKET = '<',
		RIGHT_ANGLE_BRACKET = '>',
		DOUBLE_QUOTE = '"',
		DOLLAR = '$',
	};

	/** the kind of token this is */
	token_type type;
	/** the token's value, can be either a single character or multiple characters */
	std::string value;
};
92 
93 /**
94  * class responsible for parsing the provided text into tokens and tracking information about the current token.
95  * can also track the previous token when built with the DEBUG_TOKENIZER compiler define.
96  * does not otherwise keep track of the processing history.
97  */
98 class tokenizer
99 {
100 public:
101  tokenizer(std::istream& in);
102  ~tokenizer();
103 
104  /**
105  * Reads characters off of @a in_ to return the next token type and its value.
106  */
107  const token &next_token();
108 
109  const token &current_token() const
110  {
111  return token_;
112  }
113 
114 #ifdef DEBUG_TOKENIZER
115  const token &previous_token() const
116  {
117  return previous_token_;
118  }
119 #endif
120 
121  const std::string &textdomain() const
122  {
123  return textdomain_;
124  }
125 
126  const std::string &get_file() const
127  {
128  return file_;
129  }
130 
131  int get_start_line() const
132  {
133  return startlineno_;
134  }
135 
136 private:
138  int current_;
139  int lineno_;
141 
142  /**
143  * increments the line number if the current character is a newline
144  * set current_ to the next character that's not `\r`
145  */
146  void next_char()
147  {
148  if (current_ == token::NEWLINE)
149  ++lineno_;
151  }
152 
153  /**
154  * set current_ to the next character
155  * skip the `\r` in the `\r\n` Windows-style line endings
156  * the test_cvs_2018_1999023_2.cfg file also uses `\r\n` line endings for some reason - otherwise that check isn't needed on non-Windows platforms since `\r` characters are removed from cfg files on upload
157  */
159  {
160  current_ = in_.get();
161  if(current_ == '\r') {
162  current_ = in_.get();
163  }
164  }
165 
166  /**
167  * return the next character without incrementing the current position in the istream
168  */
169  int peek_char()
170  {
171  return in_.peek();
172  }
173 
174  /**
175  * the different types of characters while parsing
176  * TOK_NONE is also the default for anything beyond standard ascii
177  */
179  {
180  TOK_NONE = 0,
183  TOK_ALPHA = 3
184  };
185 
186  character_type char_type(unsigned c) const
187  {
188  return c < END_STANDARD_ASCII ? char_types_[c] : TOK_NONE;
189  }
190 
191  bool is_space(int c) const
192  {
193  return char_type(c) == TOK_SPACE;
194  }
195 
196  bool is_num(int c) const
197  {
198  return char_type(c) == TOK_NUMERIC;
199  }
200 
201  bool is_alnum(int c) const
202  {
203  return char_type(c) > TOK_SPACE;
204  }
205 
206  /**
207  * handles skipping over comments (inline and on a separate line) as well as the special processing needed for \#textdomain and \#line
208  */
209  void skip_comment();
210 
211  /**
212  * Returns true if the next characters are the one from @a cmd followed by a space. Skips all the matching characters.
213  * Currently only used by \#textdomain (specified by the WML) and \#line (added by the preprocessor)
214  */
215  bool skip_command(char const *cmd);
216 
217  std::string textdomain_;
218  std::string file_;
220 #ifdef DEBUG_TOKENIZER
221  token previous_token_;
222 #endif
224  std::array<character_type, END_STANDARD_ASCII> char_types_;
225 };
Helper class for buffering a std::istream.
Helper class for buffering a std::istream.
int get()
Gets and consumes a character from the buffer.
int peek()
Gets a character from the buffer.
class responsible for parsing the provided text into tokens and tracking information about the curren...
Definition: tokenizer.hpp:99
bool is_space(int c) const
Definition: tokenizer.hpp:191
std::string file_
Definition: tokenizer.hpp:218
int peek_char()
return the next character without incrementing the current position in the istream
Definition: tokenizer.hpp:169
std::array< character_type, END_STANDARD_ASCII > char_types_
Definition: tokenizer.hpp:224
buffered_istream in_
Definition: tokenizer.hpp:223
bool is_num(int c) const
Definition: tokenizer.hpp:196
void skip_comment()
handles skipping over comments (inline and on a separate line) as well as the special processing need...
Definition: tokenizer.cpp:222
const std::string & textdomain() const
Definition: tokenizer.hpp:121
const token & next_token()
Reads characters off of in_ to return the next token type and its value.
Definition: tokenizer.cpp:51
bool skip_command(char const *cmd)
Returns true if the next characters are the one from cmd followed by a space.
Definition: tokenizer.cpp:202
int current_
Definition: tokenizer.hpp:138
bool is_alnum(int c) const
Definition: tokenizer.hpp:201
const token & current_token() const
Definition: tokenizer.hpp:109
std::string textdomain_
Definition: tokenizer.hpp:217
void next_char()
increments the line number if the current character is a newline set current_ to the next character t...
Definition: tokenizer.hpp:146
void next_char_skip_cr()
set current_ to the next character skip the \r in the \r\n Windows-style line endings the test_cvs_20...
Definition: tokenizer.hpp:158
token token_
Definition: tokenizer.hpp:219
int startlineno_
Definition: tokenizer.hpp:140
const std::string & get_file() const
Definition: tokenizer.hpp:126
character_type
the different types of characters while parsing TOK_NONE is also the default for anything beyond stan...
Definition: tokenizer.hpp:179
character_type char_type(unsigned c) const
Definition: tokenizer.hpp:186
int get_start_line() const
Definition: tokenizer.hpp:131
unsigned in
If equal to search_counter, the node is off the list.
constexpr unsigned char INLINED_PREPROCESS_DIRECTIVE_CHAR
Definition: tokenizer.hpp:29
constexpr int END_STANDARD_ASCII
Definition: tokenizer.hpp:33
contains the current text being parsed as well as the token_type of what's being parsed.
Definition: tokenizer.hpp:41
token()
Definition: tokenizer.hpp:42
token_type
used for a token's type field
Definition: tokenizer.hpp:51
@ SLASH
Definition: tokenizer.hpp:67
@ QSTRING
quoted string, contained within double quotes or by less than/greater than symbols
Definition: tokenizer.hpp:56
@ COMMA
Definition: tokenizer.hpp:65
@ PLUS
Definition: tokenizer.hpp:66
@ MISC
any characters that don't have special meaning
Definition: tokenizer.hpp:60
@ CLOSE_BRACKET
Definition: tokenizer.hpp:69
@ UNTERMINATED_QSTRING
reached end of file without finding the closing character for a QSTRING
Definition: tokenizer.hpp:58
@ NEWLINE
Definition: tokenizer.hpp:63
@ EQUALS
Definition: tokenizer.hpp:64
@ UNDERSCORE
Definition: tokenizer.hpp:70
@ OPEN_BRACKET
Definition: tokenizer.hpp:68
@ END
set when EOF is returned by the input stream
Definition: tokenizer.hpp:73
@ STRING
unquoted text
Definition: tokenizer.hpp:54
source_chars
not used for a token's type field
Definition: tokenizer.hpp:80
@ DOUBLE_QUOTE
Definition: tokenizer.hpp:84
@ DOLLAR
Definition: tokenizer.hpp:85
@ LEFT_ANGLE_BRACKET
Definition: tokenizer.hpp:82
@ RIGHT_ANGLE_BRACKET
Definition: tokenizer.hpp:83
@ POUND
Definition: tokenizer.hpp:81
token_type type
Definition: tokenizer.hpp:88
std::string value
the token's value, can be either a single character or multiple characters
Definition: tokenizer.hpp:90
mock_char c