serialization_2tokenizer_8cpp_source.html

 /*

     Copyright (C) 2010 - 2025

     by Guillaume Melquiond <guillaume.melquiond@gmail.com>

     Copyright (C) 2004 - 2009 by Philippe Plantier <ayin@anathas.org>

     Part of the Battle for Wesnoth Project https://www.wesnoth.org/


     This program is free software; you can redistribute it and/or modify

     it under the terms of the GNU General Public License as published by

     the Free Software Foundation; either version 2 of the License, or

     (at your option) any later version.

     This program is distributed in the hope that it will be useful,

     but WITHOUT ANY WARRANTY.


     See the COPYING file for more details.

 */


 #include "serialization/tokenizer.hpp"

 #include "wesconfig.h"


 // Builds a tokenizer on top of the given input stream.
 // Fills the character classification table for the standard ASCII range,
 // asks the wrapped stream to throw on badbit, and reads the first character
 // into current_ so next_token() can start working right away.
 tokenizer::tokenizer(std::istream& in)
     : current_(EOF)
     , lineno_(1)
     , startlineno_(0)
     , textdomain_(PACKAGE)
     , file_()
     , token_()
     , in_(in)
 {
     // Maps one character code to its class; anything unlisted is TOK_NONE.
     const auto classify = [](int ch) -> character_type {
         const bool is_letter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
         if (is_letter || ch == '_') {
             return TOK_ALPHA;
         }
         if (ch >= '0' && ch <= '9') {
             return TOK_NUMERIC;
         }
         if (ch == ' ' || ch == '\t') {
             return TOK_SPACE;
         }
         return TOK_NONE;
     };

     for (int code = 0; code < END_STANDARD_ASCII; ++code) {
         char_types_[code] = classify(code);
     }

     in_.stream().exceptions(std::ios_base::badbit);
     next_char_skip_cr();
 }


 // Leaves the wrapped stream in a neutral state: the state flags are reset
 // first, then the exception mask is set back to goodbit (no exceptions).
 tokenizer::~tokenizer()
 {
     std::istream& underlying = in_.stream();
     underlying.clear(std::ios_base::goodbit);
     underlying.exceptions(std::ios_base::goodbit);
 }


 // Reads characters from the input and returns the next token (type + value).
 // The returned reference points at token_, which is overwritten by every call.
 // startlineno_ is set to the line on which the returned token begins.
 const token &tokenizer::next_token()
 {
 #ifdef DEBUG_TOKENIZER
     previous_token_ = token_;
 #endif
     token_.value.clear();

     // Advances past any run of blank characters.
     const auto drop_blanks = [this] {
         while (is_space(current_)) {
             next_char_skip_cr();
         }
     };
     // True for characters that may be part of an unquoted string.
     const auto is_word_char = [this](int ch) {
         return is_alnum(ch) || ch == token::DOLLAR;
     };

     // Discard leading blanks and any inlined preprocessor directive lines.
     drop_blanks();
     while (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
         skip_comment();
         // step over the line end left behind by the directive
         next_char_skip_cr();
         drop_blanks();
     }

     // A comment occupying the rest of the line is dropped as well.
     if (current_ == token::POUND) {
         skip_comment();
     }

     // Remember on which line the token about to be read starts.
     startlineno_ = lineno_;

     switch (current_) {
     // Nothing left to read.
     case EOF:
         token_.type = token::END;
         break;

     // "<<" opens a block that is read verbatim up to the matching ">>"
     // (typically Lua code); a lone '<' is just a miscellaneous character.
     case token::LEFT_ANGLE_BRACKET:
         if (peek_char() == token::LEFT_ANGLE_BRACKET) {
             token_.type = token::QSTRING;
             // move onto the second '<'
             next_char_skip_cr();

             // Collect everything up to ">>"; hitting EOF first is an error.
             while (true) {
                 next_char();
                 if (current_ == EOF) {
                     token_.type = token::UNTERMINATED_QSTRING;
                     break;
                 }
                 if (current_ == token::RIGHT_ANGLE_BRACKET && peek_char() == token::RIGHT_ANGLE_BRACKET) {
                     // move onto the second '>'; it is consumed after the switch
                     next_char_skip_cr();
                     break;
                 }
                 token_.value += current_;
             }
         } else {
             token_.type = token::MISC;
             token_.value += current_;
         }
         break;

     // Double-quoted string. Unlike the "<<...>>" form, inlined preprocessor
     // directive lines can appear inside and have to be filtered out.
     case token::DOUBLE_QUOTE:
         token_.type = token::QSTRING;

         while (true) {
             next_char();
             if (current_ == EOF) {
                 token_.type = token::UNTERMINATED_QSTRING;
                 break;
             }
             if (current_ == token::DOUBLE_QUOTE) {
                 if (peek_char() != token::DOUBLE_QUOTE) {
                     // closing quote
                     break;
                 }
                 // doubled quote: step onto the second one, which is kept as a literal quote
                 next_char_skip_cr();
             }

             if (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
                 // drop the directive line and take one off the line counter
                 skip_comment();
                 --lineno_;
             } else {
                 token_.value += current_;
             }
         }
         break;

     // Single-character tokens whose type is the character itself:
     // tag delimiters, closing-tag slash, line end, assignment,
     // attribute list separator (x,y = 5,5) and the append/concatenate sign.
     case token::OPEN_BRACKET:
     case token::CLOSE_BRACKET:
     case token::SLASH:
     case token::NEWLINE:
     case token::EQUALS:
     case token::COMMA:
     case token::PLUS:
         token_.type = static_cast<token::token_type>(current_);
         token_.value = current_;
         break;

     // An underscore not followed by an alphanumeric character is its own
     // token (translatable-string marker); otherwise it starts a normal string.
     case token::UNDERSCORE:
         if (!is_alnum(peek_char())) {
             token_.type = token::UNDERSCORE;
             token_.value = current_;
             break;
         }
         [[fallthrough]];

     // Everything else: either an unquoted string or one miscellaneous character.
     default:
         if (!is_word_char(current_)) {
             token_.type = token::MISC;
             token_.value += current_;
             next_char();
             return token_;
         }

         token_.type = token::STRING;
         do {
             token_.value += current_;
             next_char_skip_cr();

             // inlined directive lines may interrupt the string; skip them
             while (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
                 skip_comment();
                 next_char_skip_cr();
             }
         } while (is_word_char(current_));
         return token_;
     }

     // Unless the input is exhausted, move to the character the next call starts from.
     if (current_ != EOF) {
         next_char();
     }

     return token_;
 }


 // Tries to consume the text `cmd` followed by one blank character.
 // Returns true, with current_ on the character after the blank, when the
 // input matches. Returns false at the first mismatch, leaving current_ on
 // the offending character.
 bool tokenizer::skip_command(char const *cmd)
 {
     // every character of cmd has to appear in the input, in order
     for (char const *expected = cmd; *expected != '\0'; ++expected) {
         next_char_skip_cr();
         if (current_ != *expected) {
             return false;
         }
     }

     // the command name must be followed by a blank
     next_char_skip_cr();
     const bool followed_by_blank = is_space(current_);
     if (followed_by_blank) {
         next_char_skip_cr();
     }
     return followed_by_blank;
 }


 // Consumes a comment up to (but not including) the line end or the end of
 // the input. Two directives are recognised right after the comment character:
 //   textdomain <name>     stores <name> in textdomain_
 //   line <number> <file>  stores <number> in lineno_ and <file> in file_
 // Any other text is discarded. On return current_ is the newline or EOF.
 void tokenizer::skip_comment()
 {
     // nothing to do if the comment is empty
     next_char_skip_cr();
     if (current_ == token::NEWLINE || current_ == EOF) {
         return;
     }

     // Receives the directive argument; stays null for an ordinary comment.
     std::string *dst = nullptr;

     if (current_ == 't') {
         if (skip_command("extdomain")) {
             dst = &textdomain_;
         }
     } else if (current_ == 'l') {
         if (skip_command("ine")) {
             // Accumulate the decimal number in a local so that an ordinary
             // comment that merely starts with the word "line" does not wipe
             // the current line counter.
             // e.g. for 587: (0 * 10) + 5 = 5, (5 * 10) + 8 = 58, (58 * 10) + 7 = 587
             // NOTE(review): a very long digit run would overflow int; not guarded here.
             int parsed_line = 0;
             bool has_digits = false;
             while (is_num(current_)) {
                 parsed_line = parsed_line * 10 + (current_ - '0');
                 has_digits = true;
                 next_char_skip_cr();
             }

             // The number must be followed by a blank for this to be a real directive.
             const bool well_formed = is_space(current_);

             // Commit the number whenever digits were read or the directive is
             // well formed (both as before); leave lineno_ alone otherwise.
             if (has_digits || well_formed) {
                 lineno_ = parsed_line;
             }

             if (well_formed) {
                 next_char_skip_cr();
                 dst = &file_;
             }
         }
     }

     // Ordinary comment, or a directive that turned out to be malformed:
     // drop the rest of the line.
     if (dst == nullptr) {
         while (current_ != token::NEWLINE && current_ != EOF) {
             next_char_skip_cr();
         }
         return;
     }

     // Replace the old textdomain_ / file_ value with the rest of the line.
     dst->clear();
     while (current_ != token::NEWLINE && current_ != EOF) {
         *dst += current_;
         next_char_skip_cr();
     }
 }

t
double t
Definition: astarsearch.cpp:63

buffered_istream::stream
std::istream & stream()
Returns the owned stream.
Definition: buffered_istream.hpp:106

tokenizer::is_space
bool is_space(int c) const
Definition: tokenizer.hpp:191

tokenizer::file_
std::string file_
Definition: tokenizer.hpp:218

tokenizer::peek_char
int peek_char()
return the next character without incrementing the current position in the istream
Definition: tokenizer.hpp:169

tokenizer::tokenizer
tokenizer()

tokenizer::char_types_
std::array< character_type, END_STANDARD_ASCII > char_types_
Definition: tokenizer.hpp:224

tokenizer::in_
buffered_istream in_
Definition: tokenizer.hpp:223

tokenizer::is_num
bool is_num(int c) const
Definition: tokenizer.hpp:196

tokenizer::skip_comment
void skip_comment()
handles skipping over comments (inline and on a separate line) as well as the special processing need...
Definition: tokenizer.cpp:222

tokenizer::~tokenizer
~tokenizer()
Definition: tokenizer.cpp:45

tokenizer::next_token
const token & next_token()
Reads characters off of in_ to return the next token type and its value.
Definition: tokenizer.cpp:51

tokenizer::lineno_
int lineno_
Definition: tokenizer.hpp:139

tokenizer::skip_command
bool skip_command(char const *cmd)
Returns true if the next characters are the ones from cmd followed by a space.
Definition: tokenizer.cpp:202

tokenizer::current_
int current_
Definition: tokenizer.hpp:138

tokenizer::is_alnum
bool is_alnum(int c) const
Definition: tokenizer.hpp:201

tokenizer::textdomain_
std::string textdomain_
Definition: tokenizer.hpp:217

tokenizer::next_char
void next_char()
increments the line number if the current character is a newline set current_ to the next character t...
Definition: tokenizer.hpp:146

tokenizer::next_char_skip_cr
void next_char_skip_cr()
set current_ to the next character skip the \r in the \r\n Windows-style line endings the test_cvs_20...
Definition: tokenizer.hpp:158

tokenizer::token_
token token_
Definition: tokenizer.hpp:219

tokenizer::startlineno_
int startlineno_
Definition: tokenizer.hpp:140

tokenizer::character_type
character_type
the different types of characters while parsing TOK_NONE is also the default for anything beyond stan...
Definition: tokenizer.hpp:179

tokenizer::TOK_NUMERIC
@ TOK_NUMERIC
Definition: tokenizer.hpp:182

tokenizer::TOK_SPACE
@ TOK_SPACE
Definition: tokenizer.hpp:181

tokenizer::TOK_NONE
@ TOK_NONE
Definition: tokenizer.hpp:180

tokenizer::TOK_ALPHA
@ TOK_ALPHA
Definition: tokenizer.hpp:183

in
unsigned in
If equal to search_counter, the node is off the list.
Definition: function_table.cpp:160

tokenizer.hpp

INLINED_PREPROCESS_DIRECTIVE_CHAR
constexpr unsigned char INLINED_PREPROCESS_DIRECTIVE_CHAR
Definition: tokenizer.hpp:29

END_STANDARD_ASCII
constexpr int END_STANDARD_ASCII
Definition: tokenizer.hpp:33

dst
rect dst
Location on the final composed sheet.
Definition: spritesheet_generator.cpp:73

token
contains the current text being parsed as well as the token_type of what's being parsed.
Definition: tokenizer.hpp:41

token::token_type
token_type
used for a token's type field
Definition: tokenizer.hpp:51

token::SLASH
@ SLASH
Definition: tokenizer.hpp:67

token::QSTRING
@ QSTRING
quoted string, contained within double quotes or by less than/greater than symbols
Definition: tokenizer.hpp:56

token::COMMA
@ COMMA
Definition: tokenizer.hpp:65

token::PLUS
@ PLUS
Definition: tokenizer.hpp:66

token::MISC
@ MISC
any characters that don't have special meaning
Definition: tokenizer.hpp:60

token::CLOSE_BRACKET
@ CLOSE_BRACKET
Definition: tokenizer.hpp:69

token::UNTERMINATED_QSTRING
@ UNTERMINATED_QSTRING
reached end of file without finding the closing character for a QSTRING
Definition: tokenizer.hpp:58

token::NEWLINE
@ NEWLINE
Definition: tokenizer.hpp:63

token::EQUALS
@ EQUALS
Definition: tokenizer.hpp:64

token::UNDERSCORE
@ UNDERSCORE
Definition: tokenizer.hpp:70

token::OPEN_BRACKET
@ OPEN_BRACKET
Definition: tokenizer.hpp:68

token::END
@ END
set when EOF is returned by the input stream
Definition: tokenizer.hpp:73

token::STRING
@ STRING
unquoted text
Definition: tokenizer.hpp:54

token::DOUBLE_QUOTE
@ DOUBLE_QUOTE
Definition: tokenizer.hpp:84

token::DOLLAR
@ DOLLAR
Definition: tokenizer.hpp:85

token::LEFT_ANGLE_BRACKET
@ LEFT_ANGLE_BRACKET
Definition: tokenizer.hpp:82

token::RIGHT_ANGLE_BRACKET
@ RIGHT_ANGLE_BRACKET
Definition: tokenizer.hpp:83

token::POUND
@ POUND
Definition: tokenizer.hpp:81

token::type
token_type type
Definition: tokenizer.hpp:88

token::value
std::string value
the token's value, can be either a single character or multiple characters
Definition: tokenizer.hpp:90

c
mock_char c
Definition: test_formula_core.cpp:68

wesconfig.h
Some defines: VERSION, PACKAGE, MIN_SAVEGAME_VERSION.

PACKAGE
#define PACKAGE
Definition: wesconfig.h:23