The Battle for Wesnoth  1.19.17+dev
markup.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2024 - 2025
3  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 2 of the License, or
8  (at your option) any later version.
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY.
11 
12  See the COPYING file for more details.
13 */
14 
15 #include "serialization/markup.hpp"
16 
17 #include "config.hpp"
18 #include "formatter.hpp"
19 #include "game_config.hpp"
20 #include "gettext.hpp"
23 #include "serialization/unicode_cast.hpp" // for unicode_cast
24 #include "utils/general.hpp"
25 
26 #include <algorithm>
27 
28 namespace markup {
29 
30 std::string make_link(const std::string& text, const std::string& dst)
31 {
32  // some sorting done on list of links may rely on the fact that text is first
33  return formatter() << "<ref dst='" << dst << "'>" << text << "</ref>";
34 }
35 
36 std::string img(const std::string& src, const std::string& align, bool floating)
37 {
38  return formatter()
39  << "<img src='" << src << "' "
40  << "float='" << std::boolalpha << floating << "' "
41  << "align='" << align << "' "
42  << "/>";
43 }
44 
45 //
46 // Markup Parser
47 //
48 
49 /*
50 
51 Here's a little mini-grammar of the markup language:
52 
53 DOCUMENT ::= (TEXT | TAG)*
54 TEXT ::= ([^<&\] | ENTITY | ESCAPE)*
55 ESCAPE ::= '\' [:unicode-char:]
56 ENTITY ::= '&' '#' [0-9]+ ';'
57 ENTITY ::= '&' '#' 'x' [0-9a-fA-F]+ ';'
58 ENTITY ::= '&' NAME ';'
59 TAG ::= '<' NAME ATTRIBUTE* '/' '>'
60 TAG ::= '<' NAME ATTRIBUTE* '>' DOCUMENT '<' '/' NAME '>' ## NB: the names must match!
61 TAG ::= '<' NAME '>' ATTRIBUTE* TEXT? '<' '/' NAME '>' ## NB: the names must match!
62 ATTRIBUTE ::= NAME
63 ATTRIBUTE ::= NAME '=' [^'" ]*
64 ATTRIBUTE ::= NAME '=' "'" TEXT "'"
65 ATTRIBUTE ::= NAME '=' '"' TEXT '"'
66 NAME ::= [_0-9a-zA-Z]+
67 
68 Notes:
69 * Entities and the first two tag formats are Pango-style. The tags can be nested inside each other.
70 * Escapes and the third tag format are for compatibility with the old help markup. Tags cannot be nested.
71 * This mostly doesn't attempt to define the meaning of specific tags or entity names. It does however substitute numeric entities, as well as some very basic named entities: lt, gt, amp, quot, apos.
72 * The definition of TEXT is left a bit nebulous, but just think of it as "non-greedy"
73 * Attributes without a value are only supported in Pango-style tags
74 * Some restrictions may apply beyond what the grammar specifies. For example, arbitrary named entities are not supported in attribute values (numeric ones and the 5 special ones work though).
75 
76 ------
77 
78 The result of the parsing is represented in the format of a WML config.
79 Text spans are represented as a [text] tag, and character entities as a [character_entity] tag.
80 All other tags are represented by a tag of the same name.
81 Any attributes on a tag become key-value pairs within the tag.
82 Old-style help markup tags with text at the end put the text in a "text" key in the tag.
83 The same approach is used for new-style Pango tags, but only if there are no nested tags or entities.
84 If there ARE nested tags or entities, the contents of the tag is broken down into spans as subtags of the parent tag.
85 Thus, a tag with content has EITHER a text attribute OR some subtags.
86 
87 Note: Only unrecognized named entities count for the above purposes!
88 Numerical entities and the special five lt, gt, amp, apos, quot are directly substituted in-place.
89 
90 Also, text spans will be broken up on paragraph breaks (double newlines).
91 This means that adjacent [text] tags should be rendered with a paragraph break between them.
92 However, no paragraph break should be used when [text] is followed by something else.
93 It is possible to have empty text spans in some cases, for example given a run of more than 2 newlines,
94 or a character entity directly followed by a paragraph break.
95 
96 */
97 
namespace
{
using namespace std::string_literals;

// Tag names for which the parser looks for old-style (help markup)
// attributes placed after the opening tag.
const std::array old_style_tags{
	"bold"s,
	"italic"s,
	"header"s,
	"format"s,
	"img"s,
	"ref"s,
	"jump"s,
};

// Attribute names accepted inside old-style tags; anything else is treated
// as the start of the tag's text.
const std::array old_style_attr{
	// ref
	"dst"s,
	"text"s,
	"force"s,
	// jump
	"to"s,
	"amount"s,
	// img
	"src"s,
	"align"s,
	"float"s,
	// format
	"bold"s,
	"italic"s,
	"color"s,
	"font_size"s,
};
}
104 
105 static std::string position_info(const std::string::const_iterator& text_start, const std::string::const_iterator& error_position)
106 {
107  // line numbers start from 1
108  int lines = std::count(text_start, error_position, '\n') + 1;
109  // Find the start position of the line where the current position is.
110  // We do this by searching in reverse from cursor_position toward text_start.
111  auto pos = error_position;
112  for(; pos != text_start && *pos != '\n'; pos--);
113  return formatter()
114  << "line " << lines
115  << ", character " << utf8::size(pos, error_position);
116 }
117 
118 static config parse_entity(std::string::const_iterator& beg, std::string::const_iterator end)
119 {
120  config entity;
121  std::stringstream s;
122  enum { UNKNOWN, NAMED, HEX, DECIMAL } type = UNKNOWN;
123  assert(*beg == '&');
124  ++beg;
125  for(; beg != end && *beg != ';'; ++beg) {
126  switch(type) {
127  case UNKNOWN:
128  if(*beg == '#') {
129  type = DECIMAL;
130  } else if(isalnum(*beg) || *beg == '_') {
131  type = NAMED;
132  s << *beg;
133  } else {
134  throw parse_error(beg, "invalid entity: unexpected characters after '&', alphanumeric characters, '#' or '_' expected.");
135  }
136  break;
137  case NAMED:
138  if(!isalnum(*beg)) {
139  throw parse_error(beg, "invalid entity: non-alphanumeric characters after '&'.");
140  }
141  s << *beg;
142  break;
143  case DECIMAL:
144  if(*beg == 'x') {
145  type = HEX;
146  } else if(isdigit(*beg)) {
147  s << *beg;
148  } else {
149  throw parse_error(beg, "invalid entity: unexpected characters after '&#', numbers or 'x' expected.");
150  }
151  break;
152  case HEX:
153  if(isxdigit(*beg)) {
154  s << *beg;
155  } else {
156  throw parse_error(beg, "invalid entity: unexpected characters after '&#x', hexadecimal digits expected.");
157  }
158  break;
159  }
160  }
161  if(type == NAMED) {
162  std::string name = s.str();
163  entity["name"] = name;
164  if(name == "lt") {
165  entity["code_point"] = '<';
166  } else if(name == "gt") {
167  entity["code_point"] = '>';
168  } else if(name == "apos") {
169  entity["code_point"] = '\'';
170  } else if(name == "quot") {
171  entity["code_point"] = '"';
172  } else if(name == "amp") {
173  entity["code_point"] = '&';
174  }
175  } else {
176  s.seekg(0);
177  if(type == HEX) {
178  s >> std::hex;
179  }
180  int n;
181  s >> n;
182  entity["code_point"] = n;
183  }
184  return entity;
185 }
186 
// Resolves a backslash escape. beg must point at the backslash.
//
// If another character follows, beg is advanced onto it and that character
// is returned as a literal. A backslash that is the very last character of
// the stream is returned as-is and beg is left untouched.
static char parse_escape(std::string::const_iterator& beg, std::string::const_iterator end)
{
	assert(*beg == '\\');
	const auto following = beg + 1;
	if(following != end) {
		beg = following;
	}
	return *beg;
}
197 
198 static config parse_text_until(std::string::const_iterator& beg, std::string::const_iterator end, char close)
199 {
200  // In practice, close will be one of < ' "
201  // Parsing will go until either close or end of stream, and will emit one or more text and character_entity tags.
202  // However, recognized character entities will be collapsed into the text tags.
203  std::ostringstream s;
204  bool saw_newline = false;
205  config res;
206  for(; beg != end && *beg != close; ++beg) {
207  if(*beg == '&') {
208  auto entity = parse_entity(beg, end);
209  if(beg == end) {
210  throw parse_error(beg, "unexpected end of stream after entity");
211  }
212  if(entity.has_attribute("code_point")) {
213  s << unicode_cast<std::string>(entity["code_point"].to_int());
214  } else {
215  // TODO: Adding the text here seems wrong in the case that the stream BEGINS with an entity...
216  res.add_child("text", config("text", s.str()));
217  res.add_child("character_entity", entity);
218  s.str("");
219  }
220  } else if(*beg == '\\') {
221  s << parse_escape(beg, end);
222  } else if(*beg == '\n') {
223  if(saw_newline) {
224  res.add_child("text", config("text", s.str()));
225  s.str("");
226  } else {
227  saw_newline = true;
228  continue;
229  }
230  } else {
231  if(saw_newline) {
232  s << '\n';
233  }
234  s << *beg;
235  }
236  saw_newline = false;
237  }
238  // If the span ended in a newline, preserve it
239  if(saw_newline) {
240  s << '\n';
241  }
242  res.add_child("text", config("text", s.str()));
243  assert(beg == end || *beg == close);
244  return res;
245 }
246 
// Reads a NAME token ([_0-9a-zA-Z]+ in the grammar) starting at beg.
//
// beg is advanced past the name; the name is returned and is empty if beg
// did not point at a name character (or was already at end).
static std::string parse_name(std::string::const_iterator& beg, std::string::const_iterator end)
{
	const auto name_start = beg;
	beg = std::find_if_not(beg, end, [](char c) {
		// isalnum requires a value representable as unsigned char; plain
		// char may be negative for UTF-8 bytes.
		return isalnum(static_cast<unsigned char>(c)) || c == '_';
	});
	return std::string(name_start, beg);
}
255 
256 static std::pair<std::string, std::string> parse_attribute(std::string::const_iterator& beg, std::string::const_iterator end, bool old_style)
257 {
258  std::string attr = parse_name(beg, end);
259  if(attr.empty()) {
260  throw parse_error(beg, "missing attribute name");
261  }
262  if(old_style && !utils::contains(old_style_attr, attr)) {
263  throw parse_error(beg, "dummy error: not an old-style attribute name"); // old-style=true caller ignores parse errors
264  }
265 
266  while(isspace(*beg)) ++beg;
267 
268  if(*beg != '=') {
269  if(old_style) {
270  throw parse_error(beg, "attribute missing value in old-style tag");
271  } else {
272  // The caller expects beg to point to the last character of the attribute upon return.
273  // But in this path, we're now pointing to the character AFTER that.
274  --beg;
275  return {attr, ""};
276  }
277  }
278  ++beg;
279  while(isspace(*beg)) ++beg;
280 
281  std::string value;
282  if(*beg == '\'' || *beg == '"') {
283  config res = parse_text_until(beg, end, *beg++);
284  if(res.has_child("character_entity")) {
285  throw parse_error(beg, "unsupported entity in attribute value");
286  } else if(res.all_children_count() > 1) {
287  throw parse_error(beg, "paragraph break in attribute value");
288  }
289  if(auto t = res.optional_child("text")) {
290  value = t["text"].str();
291  }
292  } else {
293  std::ostringstream s;
294  bool found_slash = false;
295  for(; beg != end && *beg != '>' && *beg != '<' && !isspace(*beg); ++beg) {
296  if(*beg == '&') {
297  auto entity = parse_entity(beg, end);
298  if(beg == end) {
299  throw parse_error(beg, "unexpected end of stream after entity");
300  }
301  if(entity.has_attribute("code_point")) {
302  s << unicode_cast<std::string>(entity["code_point"].to_int());
303  } else {
304  throw parse_error(beg, "unsupported entity in attribute value");
305  }
306  } else if(*beg == '\\') {
307  s << parse_escape(beg, end);
308  } else if(*beg == '/') {
309  found_slash = true;
310  } else {
311  if(found_slash) {
312  s << '/';
313  found_slash = false;
314  }
315  s << *beg;
316  }
317  }
318  value = s.str();
319  // The caller expects beg to point to the last character of the attribute upon return.
320  // But in this path, we're now pointing to the character AFTER that.
321  --beg;
322  if(found_slash) --beg;
323  }
324  return {attr, value};
325 }
326 
327 static void check_closing_tag(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view tag_name)
328 {
329  std::size_t remaining = end - beg;
330  assert(remaining >= 2 && *beg == '<' && *(beg + 1) == '/');
331  if(remaining < tag_name.size() + 3) {
332  throw parse_error(beg, "Unexpected end of stream in closing tag");
333  }
334  beg += 2;
335  if(!std::equal(tag_name.begin(), tag_name.end(), beg)) {
336  throw parse_error(beg, "Mismatched closing tag " + std::string(tag_name));
337  }
338  beg += tag_name.size();
339  if(*beg != '>') {
340  throw parse_error(beg, "Unterminated closing tag " + std::string(tag_name));
341  }
342  ++beg;
343 }
344 
345 static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end);
346 static config parse_tag_contents(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view tag_name, bool check_for_attributes)
347 {
348  assert(*beg == '>');
349  ++beg;
350 
351  if(!utils::contains(old_style_tags, tag_name)) {
352  check_for_attributes = false;
353  }
354 
355  // This also parses the matching closing tag!
356  config res;
357  for(; check_for_attributes && beg != end && *beg != '<'; ++beg) {
358  if(isspace(*beg)) continue;
359  auto save_beg = beg;
360  try {
361  auto [key, val] = parse_attribute(beg, end, true);
362  res[key] = val;
363  } catch(parse_error&) {
364  beg = save_beg;
365  while(beg != end && isspace(*beg)) ++beg;
366  break;
367  }
368  }
369  if(res.has_attribute("text")) {
370  if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
371  throw parse_error(beg, "Extra text at the end of old-style tag with explicit 'text' attribute");
372  }
373  check_closing_tag(beg, end, tag_name);
374  return res;
375  } else if(res.attribute_count() > 0) {
376  config text = parse_text_until(beg, end, '<');
377  if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
378  throw parse_error(beg, "Extra text at the end of old-style tag with explicit 'text' attribute");
379  }
380  if(text.all_children_count() == 1 && text.has_child("text")) {
381  res["text"] = text.mandatory_child("text")["text"];
382  } else {
383  res.append_children(text);
384  }
385  check_closing_tag(beg, end, tag_name);
386  return res;
387  }
388  while(true) {
389  config text = parse_text_until(beg, end, '<');
390  if(beg == end || beg + 1 == end) {
391  throw parse_error(beg, "Missing closing tag for " + std::string(tag_name));
392  }
393  res.append_children(text);
394  if(*(beg + 1) == '/') {
395  check_closing_tag(beg, end, tag_name);
396  break;
397  }
398  auto [tag, contents] = parse_tag(beg, end);
399  res.add_child(tag, contents);
400  }
401  if(res.all_children_count() == 1 && res.has_child("text")) {
402  return res.mandatory_child("text");
403  }
404  return res;
405 }
406 
407 static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end)
408 {
409  assert(*beg == '<');
410  ++beg;
411  std::string tag_name = parse_name(beg, end);
412  if(tag_name.empty()) {
413  throw parse_error(beg, "missing tag name");
414  }
415  bool auto_closed = false;
416  config elem;
417  for(; beg != end && *beg != '>'; ++beg) {
418  if(isspace(*beg)) continue;
419  if(*beg == '/' && (beg + 1) != end && *(beg + 1) == '>') {
420  auto_closed = true;
421  } else if(isalnum(*beg) || *beg == '_') {
422  const auto& [key, value] = parse_attribute(beg, end, false);
423  if(beg == end) {
424  throw parse_error(beg, "unexpected end of stream following attribute");
425  }
426  elem[key] = value;
427  }
428  }
429  if(auto_closed) {
430  assert(*beg == '>');
431  ++beg;
432  } else {
433  config contents = parse_tag_contents(beg, end, tag_name, elem.attribute_count() == 0);
434  if(contents.all_children_count() == 0 && contents.attribute_count() == 1 && contents.has_attribute("text")) {
435  elem["text"] = contents["text"];
436  } else {
437  elem.append(contents);
438  }
439  }
440  return {tag_name, elem};
441 }
442 
443 config parse_text(const std::string& text)
444 {
445  config res;
446  auto beg = text.begin(), end = text.end();
447  try {
448  while(beg != end) {
449  if(*beg == '<') {
450  auto [tag, contents] = parse_tag(beg, end);
451  res.add_child(tag, contents);
452  } else {
453  config text = parse_text_until(beg, end, '<');
454  res.append_children(text);
455  }
456  }
457  } catch(parse_error& e) {
458  // NOTE: The text.begin() itor is in scope here, so we add the error location info
459  // to the error message here and rethrow. Both itors used in the call to
460  // position_info below can go out of scope otherwise.
461  e.message = position_info(text.begin(), e.error_location()) + ": " + e.message;
462  throw e;
463  }
464  return res;
465 }
466 
467 }
double t
Definition: astarsearch.cpp:63
A config object defines a single node in a WML file, with access to child nodes.
Definition: config.hpp:157
config & add_child(std::string_view key)
Definition: config.cpp:435
void append(const config &cfg)
Append data from another config object to this one.
Definition: config.cpp:187
std::size_t attribute_count() const
Count the number of non-blank attributes.
Definition: config.cpp:306
optional_config_impl< config > optional_child(std::string_view key, int n=0)
Equivalent to mandatory_child, but returns an empty optional if the nth child was not found.
Definition: config.cpp:379
std::size_t all_children_count() const
Definition: config.cpp:301
bool has_attribute(std::string_view key) const
Definition: config.cpp:156
bool has_child(std::string_view key) const
Determine whether a config has a child or not.
Definition: config.cpp:311
void append_children(const config &cfg)
Adds children from cfg.
Definition: config.cpp:166
config & mandatory_child(std::string_view key, int n=0)
Returns the nth child with the given key, or throws an error if there is none.
Definition: config.cpp:361
std::ostringstream wrapper.
Definition: formatter.hpp:40
Definitions for the interface to Wesnoth Markup Language (WML).
static std::string parse_name(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:247
std::string img(const std::string &src, const std::string &align, bool floating)
Generates a Help markup tag corresponding to an image.
Definition: markup.cpp:36
std::string make_link(const std::string &text, const std::string &dst)
Generates a Help markup tag corresponding to a reference or link.
Definition: markup.cpp:30
static std::string position_info(const std::string::const_iterator &text_start, const std::string::const_iterator &error_position)
Definition: markup.cpp:105
static config parse_tag_contents(std::string::const_iterator &beg, std::string::const_iterator end, std::string_view tag_name, bool check_for_attributes)
Definition: markup.cpp:346
static void check_closing_tag(std::string::const_iterator &beg, std::string::const_iterator end, std::string_view tag_name)
Definition: markup.cpp:327
static std::pair< std::string, config > parse_tag(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:407
static char parse_escape(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:187
static config parse_text_until(std::string::const_iterator &beg, std::string::const_iterator end, char close)
Definition: markup.cpp:198
std::string tag(std::string_view tag, Args &&... data)
Wraps the given data in the specified tag.
Definition: markup.hpp:45
static config parse_entity(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:118
static std::pair< std::string, std::string > parse_attribute(std::string::const_iterator &beg, std::string::const_iterator end, bool old_style)
Definition: markup.cpp:256
config parse_text(const std::string &text)
Parse a xml style marked up text string.
Definition: markup.cpp:443
std::size_t size(std::string_view str)
Length in characters of a UTF-8 string.
Definition: unicode.cpp:81
bool contains(const Container &container, const Value &value)
Returns true iff value is found in container.
Definition: general.hpp:87
rect dst
Location on the final composed sheet.
rect src
Non-transparent portion of the surface to compose.
Thrown when the help system fails to parse something.
Definition: markup.hpp:213
static map_location::direction n
static map_location::direction s
#define e