serialization_2tokenizer_8hpp_source.html

 /*

     Copyright (C) 2010 - 2025

     by Guillaume Melquiond <guillaume.melquiond@gmail.com>

     Copyright (C) 2004 - 2009 by Philippe Plantier <ayin@anathas.org>

     Part of the Battle for Wesnoth Project https://www.wesnoth.org/


     This program is free software; you can redistribute it and/or modify

     it under the terms of the GNU General Public License as published by

     the Free Software Foundation; either version 2 of the License, or

     (at your option) any later version.

     This program is distributed in the hope that it will be useful,

     but WITHOUT ANY WARRANTY.


     See the COPYING file for more details.

 */


 #pragma once


 //#define DEBUG_TOKENIZER


 #include "buffered_istream.hpp"


 #include <array>

 #include <istream>

 #include <string>


 // Marker byte added by the preprocessor to allow special handling for
 // #line and #textdomain commands.
 // 0xFE (254) is an illegal utf8 character; its use for this purpose was
 // introduced in commit a76be7ef1e921dabacd99f16ef440bf9673b8d98.
 constexpr unsigned char INLINED_PREPROCESS_DIRECTIVE_CHAR = 0xFE;

 // One past the last standard ascii code: normal ascii is 0-127 (0x00-0x7F).
 // Extended ascii covers 128-255, none of which need any special handling.
 constexpr int END_STANDARD_ASCII = 0x80;


 /**
  * A single lexical unit: the kind of token (@ref type) plus the text it carries (@ref value).
  *
  * Multi-character token types have a value that's a string with zero or more characters in it.
  * Single character token types are a single character with special meaning for a config.
  */
 struct token
 {
     /**
      * Values used for a token's type field.
      */
     enum token_type
     {
         // ---- multi-character tokens ----

         /** unquoted text */
         STRING,
         /** quoted string, contained within double quotes or by less than/greater than symbols */
         QSTRING,
         /** reached end of file without finding the closing character for a QSTRING */
         UNTERMINATED_QSTRING,
         /** any characters that don't have special meaning */
         MISC,

         // ---- single-character tokens: the enumerator equals the character code ----

         NEWLINE = '\n',
         EQUALS = '=',
         COMMA = ',',
         PLUS = '+',
         SLASH = '/',
         OPEN_BRACKET = '[',
         CLOSE_BRACKET = ']',
         UNDERSCORE = '_',

         /** set when EOF is returned by the input stream */
         END = 256
     };

     /**
      * Characters that matter in the source text but are not used for a token's type field.
      */
     enum source_chars
     {
         POUND = '#',
         LEFT_ANGLE_BRACKET = '<',
         RIGHT_ANGLE_BRACKET = '>',
         DOUBLE_QUOTE = '"',
         DOLLAR = '$',
     };

     /**
      * Creates a token of type END with an empty value.
      * The constructor stays user-provided; the initial state comes from the
      * member initializers below.
      */
     token()
     {
     }

     /** the kind of token; END until set otherwise */
     token_type type = END;

     /** the token's value, can be either a single character or multiple characters */
     std::string value;
 };


 /**
  * Turns the provided text into tokens and tracks information about the current token.
  *
  * When built with the DEBUG_TOKENIZER compiler define it can also report the previous token;
  * it does not otherwise keep track of the processing history.
  */
 class tokenizer
 {
 public:
     /**
      * @param in the stream to read characters from
      *           (presumably wrapped by @a in_; the definition lives outside this header).
      */
     tokenizer(std::istream& in);

     ~tokenizer();

     /**
      * Reads characters off of @a in_ to return the next token type and its value.
      */
     const token& next_token();

     /** Read-only access to the token held in @a token_. */
     const token& current_token() const
     {
         return token_;
     }

 #ifdef DEBUG_TOKENIZER
     /** Read-only access to @a previous_token_; only available in DEBUG_TOKENIZER builds. */
     const token& previous_token() const
     {
         return previous_token_;
     }
 #endif

     /** Read-only access to the stored textdomain string. */
     const std::string& textdomain() const
     {
         return textdomain_;
     }

     /** Read-only access to the stored file string. */
     const std::string& get_file() const
     {
         return file_;
     }

     /** Returns the stored start line number. */
     int get_start_line() const
     {
         return startlineno_;
     }

 private:
     /** Declared private; no definition is visible in this header. */
     tokenizer();

     /** the character most recently read from @a in_ (an int, as returned by buffered_istream::get) */
     int current_;

     /** line counter, bumped by next_char() each time it moves past a newline */
     int lineno_;

     /** value reported by get_start_line(); it is assigned outside this header */
     int startlineno_;

     /**
      * Advances to the next character, counting lines on the way.
      *
      * If the character being left behind is a newline the line number is incremented first;
      * the actual read (including the `\r` handling) is delegated to next_char_skip_cr().
      */
     void next_char()
     {
         if(current_ == token::NEWLINE) {
             ++lineno_;
         }

         next_char_skip_cr();
     }

     /**
      * Reads the next character into current_ without touching the line counter.
      *
      * If the character read is `\r` it is dropped and exactly one more character is read, which
      * skips the `\r` in the `\r\n` Windows-style line endings.
      * the test_cvs_2018_1999023_2.cfg file also uses `\r\n` line endings for some reason - otherwise
      * that check isn't needed on non-Windows platforms since `\r` characters are removed from cfg
      * files on upload
      */
     void next_char_skip_cr()
     {
         current_ = in_.get();
         if(current_ != '\r') {
             return;
         }

         // Drop the carriage return: replace it with whatever follows.
         current_ = in_.get();
     }

     /**
      * Returns the next character without incrementing the current position in the istream.
      */
     int peek_char()
     {
         return in_.peek();
     }

     /**
      * Character classes used while parsing.
      * TOK_NONE is also the default for anything beyond standard ascii.
      */
     enum character_type
     {
         TOK_NONE = 0,
         TOK_SPACE = 1,
         TOK_NUMERIC = 2,
         TOK_ALPHA = 3
     };

     /**
      * Looks up the class of character code @a c.
      *
      * Codes at or above END_STANDARD_ASCII never index the table and are reported as TOK_NONE.
      * Negative ints passed by the is_*() helpers convert to large unsigned values and therefore
      * take that same path.
      */
     character_type char_type(unsigned c) const
     {
         if(c >= END_STANDARD_ASCII) {
             return TOK_NONE;
         }

         return char_types_[c];
     }

     /** True when @a c is classified as TOK_SPACE. */
     bool is_space(int c) const
     {
         return TOK_SPACE == char_type(c);
     }

     /** True when @a c is classified as TOK_NUMERIC. */
     bool is_num(int c) const
     {
         return TOK_NUMERIC == char_type(c);
     }

     /** True when the class of @a c ranks above TOK_SPACE, i.e. TOK_NUMERIC or TOK_ALPHA. */
     bool is_alnum(int c) const
     {
         return TOK_SPACE < char_type(c);
     }

     /**
      * handles skipping over comments (inline and on a separate line) as well as the special
      * processing needed for \#textdomain and \#line
      */
     void skip_comment();

     /**
      * Returns true if the next characters are the one from @a cmd followed by a space.
      * Skips all the matching characters.
      * Currently only used by \#textdomain (specified by the WML) and \#line (added by the preprocessor)
      */
     bool skip_command(char const* cmd);

     /** string exposed through textdomain() */
     std::string textdomain_;

     /** string exposed through get_file() */
     std::string file_;

     /** token exposed through current_token() */
     token token_;

 #ifdef DEBUG_TOKENIZER
     /** token exposed through previous_token() */
     token previous_token_;
 #endif

     /** buffered input the characters are read from */
     buffered_istream in_;

     /** class of each standard ascii code, indexed by character code; not filled in by this header */
     std::array<character_type, END_STANDARD_ASCII> char_types_;
 };

buffered_istream.hpp
Helper class for buffering a std::istream.

buffered_istream
Helper class for buffering a std::istream.
Definition: buffered_istream.hpp:42

buffered_istream::get
int get()
Gets and consumes a character from the buffer.
Definition: buffered_istream.hpp:60

buffered_istream::peek
int peek()
Gets a character from the buffer.
Definition: buffered_istream.hpp:87

tokenizer
class responsible for parsing the provided text into tokens and tracking information about the current token.
Definition: tokenizer.hpp:99

tokenizer::is_space
bool is_space(int c) const
Definition: tokenizer.hpp:191

tokenizer::file_
std::string file_
Definition: tokenizer.hpp:218

tokenizer::peek_char
int peek_char()
return the next character without incrementing the current position in the istream
Definition: tokenizer.hpp:169

tokenizer::tokenizer
tokenizer()

tokenizer::char_types_
std::array< character_type, END_STANDARD_ASCII > char_types_
Definition: tokenizer.hpp:224

tokenizer::in_
buffered_istream in_
Definition: tokenizer.hpp:223

tokenizer::is_num
bool is_num(int c) const
Definition: tokenizer.hpp:196

tokenizer::skip_comment
void skip_comment()
handles skipping over comments (inline and on a separate line) as well as the special processing needed for #textdomain and #line
Definition: tokenizer.cpp:222

tokenizer::~tokenizer
~tokenizer()
Definition: tokenizer.cpp:45

tokenizer::textdomain
const std::string & textdomain() const
Definition: tokenizer.hpp:121

tokenizer::next_token
const token & next_token()
Reads characters off of in_ to return the next token type and its value.
Definition: tokenizer.cpp:51

tokenizer::lineno_
int lineno_
Definition: tokenizer.hpp:139

tokenizer::skip_command
bool skip_command(char const *cmd)
Returns true if the next characters are the one from cmd followed by a space.
Definition: tokenizer.cpp:202

tokenizer::current_
int current_
Definition: tokenizer.hpp:138

tokenizer::is_alnum
bool is_alnum(int c) const
Definition: tokenizer.hpp:201

tokenizer::current_token
const token & current_token() const
Definition: tokenizer.hpp:109

tokenizer::textdomain_
std::string textdomain_
Definition: tokenizer.hpp:217

tokenizer::next_char
void next_char()
increments the line number if the current character is a newline, then sets current_ to the next character, skipping a single \r if one is read
Definition: tokenizer.hpp:146

tokenizer::next_char_skip_cr
void next_char_skip_cr()
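set current_ to the next character; skip the \r in the \r\n Windows-style line endings; the test_cvs_2018_1999023_2.cfg file also uses \r\n line endings for some reason - otherwise that check isn't needed on non-Windows platforms since \r characters are removed from cfg files on upload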
Definition: tokenizer.hpp:158

tokenizer::token_
token token_
Definition: tokenizer.hpp:219

tokenizer::startlineno_
int startlineno_
Definition: tokenizer.hpp:140

tokenizer::get_file
const std::string & get_file() const
Definition: tokenizer.hpp:126

tokenizer::character_type
character_type
the different types of characters while parsing; TOK_NONE is also the default for anything beyond standard ascii
Definition: tokenizer.hpp:179

tokenizer::TOK_NUMERIC
@ TOK_NUMERIC
Definition: tokenizer.hpp:182

tokenizer::TOK_SPACE
@ TOK_SPACE
Definition: tokenizer.hpp:181

tokenizer::TOK_NONE
@ TOK_NONE
Definition: tokenizer.hpp:180

tokenizer::TOK_ALPHA
@ TOK_ALPHA
Definition: tokenizer.hpp:183

tokenizer::char_type
character_type char_type(unsigned c) const
Definition: tokenizer.hpp:186

tokenizer::get_start_line
int get_start_line() const
Definition: tokenizer.hpp:131

in
unsigned in
If equal to search_counter, the node is off the list.
Definition: function_table.cpp:160

INLINED_PREPROCESS_DIRECTIVE_CHAR
constexpr unsigned char INLINED_PREPROCESS_DIRECTIVE_CHAR
Definition: tokenizer.hpp:29

END_STANDARD_ASCII
constexpr int END_STANDARD_ASCII
Definition: tokenizer.hpp:33

token
contains the current text being parsed as well as the token_type of what's being parsed.
Definition: tokenizer.hpp:41

token::token
token()
Definition: tokenizer.hpp:42

token::token_type
token_type
used for a token's type field
Definition: tokenizer.hpp:51

token::SLASH
@ SLASH
Definition: tokenizer.hpp:67

token::QSTRING
@ QSTRING
quoted string, contained within double quotes or by less than/greater than symbols
Definition: tokenizer.hpp:56

token::COMMA
@ COMMA
Definition: tokenizer.hpp:65

token::PLUS
@ PLUS
Definition: tokenizer.hpp:66

token::MISC
@ MISC
any characters that don't have special meaning
Definition: tokenizer.hpp:60

token::CLOSE_BRACKET
@ CLOSE_BRACKET
Definition: tokenizer.hpp:69

token::UNTERMINATED_QSTRING
@ UNTERMINATED_QSTRING
reached end of file without finding the closing character for a QSTRING
Definition: tokenizer.hpp:58

token::NEWLINE
@ NEWLINE
Definition: tokenizer.hpp:63

token::EQUALS
@ EQUALS
Definition: tokenizer.hpp:64

token::UNDERSCORE
@ UNDERSCORE
Definition: tokenizer.hpp:70

token::OPEN_BRACKET
@ OPEN_BRACKET
Definition: tokenizer.hpp:68

token::END
@ END
set when EOF is returned by the input stream
Definition: tokenizer.hpp:73

token::STRING
@ STRING
unquoted text
Definition: tokenizer.hpp:54

token::source_chars
source_chars
not used for a token's type field
Definition: tokenizer.hpp:80

token::DOUBLE_QUOTE
@ DOUBLE_QUOTE
Definition: tokenizer.hpp:84

token::DOLLAR
@ DOLLAR
Definition: tokenizer.hpp:85

token::LEFT_ANGLE_BRACKET
@ LEFT_ANGLE_BRACKET
Definition: tokenizer.hpp:82

token::RIGHT_ANGLE_BRACKET
@ RIGHT_ANGLE_BRACKET
Definition: tokenizer.hpp:83

token::POUND
@ POUND
Definition: tokenizer.hpp:81

token::type
token_type type
Definition: tokenizer.hpp:88

token::value
std::string value
the token's value, can be either a single character or multiple characters
Definition: tokenizer.hpp:90

c
mock_char c
Definition: test_formula_core.cpp:67