The Battle for Wesnoth  1.19.16+dev
markup.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2024 - 2025
3  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 2 of the License, or
8  (at your option) any later version.
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY.
11 
12  See the COPYING file for more details.
13 */
14 
15 #include "serialization/markup.hpp"
16 
17 #include "config.hpp"
18 #include "formatter.hpp"
19 #include "game_config.hpp"
20 #include "gettext.hpp"
23 #include "serialization/unicode_cast.hpp" // for unicode_cast
24 
25 #include <algorithm>
26 
27 namespace markup {
28 
29 std::string make_link(const std::string& text, const std::string& dst)
30 {
31  // some sorting done on list of links may rely on the fact that text is first
32  return formatter() << "<ref dst='" << dst << "'>" << text << "</ref>";
33 }
34 
35 std::string img(const std::string& src, const std::string& align, bool floating)
36 {
37  return formatter()
38  << "<img src='" << src << "' "
39  << "float='" << std::boolalpha << floating << "' "
40  << "align='" << align << "' "
41  << "/>";
42 }
43 
44 //
45 // Markup Parser
46 //
47 
48 /*
49 
50 Here's a little mini-grammar of the markup language:
51 
52 DOCUMENT ::= (TEXT | TAG)*
53 TEXT ::= ([^<&\] | ENTITY | ESCAPE)*
54 ESCAPE ::= '\' [:unicode-char:]
55 ENTITY ::= '&' '#' [0-9]+ ';'
56 ENTITY ::= '&' '#' 'x' [0-9a-fA-F]+ ';'
57 ENTITY ::= '&' NAME ';'
58 TAG ::= '<' NAME ATTRIBUTE* '/' '>'
59 TAG ::= '<' NAME ATTRIBUTE* '>' DOCUMENT '<' '/' NAME '>' ## NB: the names must match!
60 TAG ::= '<' NAME '>' ATTRIBUTE* TEXT? '<' '/' NAME '>' ## NB: the names must match!
61 ATTRIBUTE ::= NAME
62 ATTRIBUTE ::= NAME '=' [^'" ]*
63 ATTRIBUTE ::= NAME '=' "'" TEXT "'"
64 ATTRIBUTE ::= NAME '=' '"' TEXT '"'
65 NAME ::= [_0-9a-zA-Z]+
66 
67 Notes:
68 * Entities and the first two tag formats are Pango-style. The tags can be nested inside each other.
69 * Escapes and the third tag format are for compatibility with the old help markup. Tags cannot be nested.
70 * This mostly doesn't attempt to define the meaning of specific tags or entity names. It does however substitute numeric entities, as well as some very basic named entities: lt, gt, amp, quot, apos.
71 * The definition of TEXT is left a bit nebulous, but just think of it as "non-greedy"
72 * Attributes without a value are only supported in Pango-style tags
73 * Some restrictions may apply beyond what the grammar specifies. For example, arbitrary named entities are not supported in attribute values (numeric ones and the 5 special ones work though).
74 
75 ------
76 
77 The result of the parsing is represented in the format of a WML config.
78 Text spans are represented as a [text] tag, and character entities as a [character_entity] tag.
79 All other tags are represented by a tag of the same name.
80 Any attributes on a tag become key-value pairs within the tag.
81 Old-style help markup tags with text at the end put the text in a "text" key in the tag.
82 The same approach is used for new-style Pango tags, but only if there are no nested tags or entities.
83 If there ARE nested tags or entities, the contents of the tag are broken down into spans as subtags of the parent tag.
84 Thus, a tag with content has EITHER a text attribute OR some subtags.
85 
86 Note: Only unrecognized named entities count for the above purposes!
87 Numerical entities and the special five lt, gt, amp, apos, quot are directly substituted in-place.
88 
89 Also, text spans will be broken up on paragraph breaks (double newlines).
90 This means that adjacent [text] tags should be rendered with a paragraph break between them.
91 However, no paragraph break should be used when [text] is followed by something else.
92 It is possible to have empty text spans in some cases, for example given a run of more than 2 newlines,
93 or a character entity directly followed by a paragraph break.
94 
95 */
96 
97 static std::string position_info(const std::string::const_iterator& text_start, const std::string::const_iterator& error_position)
98 {
99  // line numbers start from 1
100  int lines = std::count(text_start, error_position, '\n') + 1;
101  // Find the start position of the line where the current position is.
102  // We do this by searching in reverse from cursor_position toward text_start.
103  auto pos = error_position;
104  for(; pos != text_start && *pos != '\n'; pos--);
105  return formatter()
106  << "line " << lines
107  << ", character " << utf8::size(pos, error_position);
108 }
109 
110 static config parse_entity(std::string::const_iterator& beg, std::string::const_iterator end)
111 {
112  config entity;
113  std::stringstream s;
114  enum { UNKNOWN, NAMED, HEX, DECIMAL } type = UNKNOWN;
115  assert(*beg == '&');
116  ++beg;
117  for(; beg != end && *beg != ';'; ++beg) {
118  switch(type) {
119  case UNKNOWN:
120  if(*beg == '#') {
121  type = DECIMAL;
122  } else if(isalnum(*beg) || *beg == '_') {
123  type = NAMED;
124  s << *beg;
125  } else {
126  throw parse_error(beg, "invalid entity: unexpected characters after '&', alphanumeric characters, '#' or '_' expected.");
127  }
128  break;
129  case NAMED:
130  if(!isalnum(*beg)) {
131  throw parse_error(beg, "invalid entity: non-alphanumeric characters after '&'.");
132  }
133  s << *beg;
134  break;
135  case DECIMAL:
136  if(*beg == 'x') {
137  type = HEX;
138  } else if(isdigit(*beg)) {
139  s << *beg;
140  } else {
141  throw parse_error(beg, "invalid entity: unexpected characters after '&#', numbers or 'x' expected.");
142  }
143  break;
144  case HEX:
145  if(isxdigit(*beg)) {
146  s << *beg;
147  } else {
148  throw parse_error(beg, "invalid entity: unexpected characters after '&#x', hexadecimal digits expected.");
149  }
150  break;
151  }
152  }
153  if(type == NAMED) {
154  std::string name = s.str();
155  entity["name"] = name;
156  if(name == "lt") {
157  entity["code_point"] = '<';
158  } else if(name == "gt") {
159  entity["code_point"] = '>';
160  } else if(name == "apos") {
161  entity["code_point"] = '\'';
162  } else if(name == "quot") {
163  entity["code_point"] = '"';
164  } else if(name == "amp") {
165  entity["code_point"] = '&';
166  }
167  } else {
168  s.seekg(0);
169  if(type == HEX) {
170  s >> std::hex;
171  }
172  int n;
173  s >> n;
174  entity["code_point"] = n;
175  }
176  return entity;
177 }
178 
/**
 * Resolves a backslash escape.
 *
 * @param beg On entry points at the backslash. On return points at the escaped
 *            character, or is left on the backslash if it was the last
 *            character of the stream.
 * @param end End of the text being parsed.
 * @return The character following the backslash, taken literally; or the
 *         backslash itself when nothing follows it.
 */
static char parse_escape(std::string::const_iterator& beg, std::string::const_iterator end)
{
	assert(*beg == '\\');
	// A lone backslash at the very end of the stream stands for itself.
	const bool backslash_is_last = (end - beg) == 1;
	if(!backslash_is_last) {
		// Skip the backslash; whatever comes next is a literal.
		++beg;
	}
	return *beg;
}
189 
190 static config parse_text_until(std::string::const_iterator& beg, std::string::const_iterator end, char close)
191 {
192  // In practice, close will be one of < ' "
193  // Parsing will go until either close or end of stream, and will emit one or more text and character_entity tags.
194  // However, recognized character entities will be collapsed into the text tags.
195  std::ostringstream s;
196  bool saw_newline = false;
197  config res;
198  for(; beg != end && *beg != close; ++beg) {
199  if(*beg == '&') {
200  auto entity = parse_entity(beg, end);
201  if(beg == end) {
202  throw parse_error(beg, "unexpected end of stream after entity");
203  }
204  if(entity.has_attribute("code_point")) {
205  s << unicode_cast<std::string>(entity["code_point"].to_int());
206  } else {
207  // TODO: Adding the text here seems wrong in the case that the stream BEGINS with an entity...
208  res.add_child("text", config("text", s.str()));
209  res.add_child("character_entity", entity);
210  s.str("");
211  }
212  } else if(*beg == '\\') {
213  s << parse_escape(beg, end);
214  } else if(*beg == '\n') {
215  if(saw_newline) {
216  res.add_child("text", config("text", s.str()));
217  s.str("");
218  } else {
219  saw_newline = true;
220  continue;
221  }
222  } else {
223  if(saw_newline) {
224  s << '\n';
225  }
226  s << *beg;
227  }
228  saw_newline = false;
229  }
230  // If the span ended in a newline, preserve it
231  if(saw_newline) {
232  s << '\n';
233  }
234  res.add_child("text", config("text", s.str()));
235  assert(beg == end || *beg == close);
236  return res;
237 }
238 
/**
 * Consumes a NAME token: the longest run of ASCII letters, digits and
 * underscores starting at @a beg.
 *
 * @param beg Start of the candidate name; on return points at the first
 *            character that is not part of the name (or equals @a end).
 * @param end End of the text being parsed.
 * @return The name, which is empty if no name character was found at @a beg.
 */
static std::string parse_name(std::string::const_iterator& beg, std::string::const_iterator end)
{
	const auto name_start = beg;
	beg = std::find_if_not(beg, end, [](char c) {
		// isalnum is undefined for negative values, which plain char can hold
		// for UTF-8 bytes, so classify the unsigned value.
		return isalnum(static_cast<unsigned char>(c)) || c == '_';
	});
	return std::string(name_start, beg);
}
247 
248 static std::pair<std::string, std::string> parse_attribute(std::string::const_iterator& beg, std::string::const_iterator end, bool allow_empty)
249 {
250  std::string attr = parse_name(beg, end);
251  if(attr.empty()) {
252  throw parse_error(beg, "missing attribute name");
253  }
254  while(isspace(*beg)) ++beg;
255 
256  if(*beg != '=') {
257  if(allow_empty) {
258  // The caller expects beg to point to the last character of the attribute upon return.
259  // But in this path, we're now pointing to the character AFTER that.
260  --beg;
261  return {attr, ""};
262  } else throw parse_error(beg, "attribute missing value in old-style tag");
263  }
264  ++beg;
265  while(isspace(*beg)) ++beg;
266 
267  std::string value;
268  if(*beg == '\'' || *beg == '"') {
269  config res = parse_text_until(beg, end, *beg++);
270  if(res.has_child("character_entity")) {
271  throw parse_error(beg, "unsupported entity in attribute value");
272  } else if(res.all_children_count() > 1) {
273  throw parse_error(beg, "paragraph break in attribute value");
274  }
275  if(auto t = res.optional_child("text")) {
276  value = t["text"].str();
277  }
278  } else {
279  std::ostringstream s;
280  bool found_slash = false;
281  for(; beg != end && *beg != '>' && *beg != '<' && !isspace(*beg); ++beg) {
282  if(*beg == '&') {
283  auto entity = parse_entity(beg, end);
284  if(beg == end) {
285  throw parse_error(beg, "unexpected end of stream after entity");
286  }
287  if(entity.has_attribute("code_point")) {
288  s << unicode_cast<std::string>(entity["code_point"].to_int());
289  } else {
290  throw parse_error(beg, "unsupported entity in attribute value");
291  }
292  } else if(*beg == '\\') {
293  s << parse_escape(beg, end);
294  } else if(*beg == '/') {
295  found_slash = true;
296  } else {
297  if(found_slash) {
298  s << '/';
299  found_slash = false;
300  }
301  s << *beg;
302  }
303  }
304  value = s.str();
305  // The caller expects beg to point to the last character of the attribute upon return.
306  // But in this path, we're now pointing to the character AFTER that.
307  --beg;
308  if(found_slash) --beg;
309  }
310  return {attr, value};
311 }
312 
313 static void check_closing_tag(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view match)
314 {
315  std::size_t remaining = end - beg;
316  assert(remaining >= 2 && *beg == '<' && *(beg + 1) == '/');
317  if(remaining < match.size() + 3) {
318  throw parse_error(beg, "Unexpected end of stream in closing tag");
319  }
320  beg += 2;
321  if(!std::equal(match.begin(), match.end(), beg)) {
322  throw parse_error(beg, "Mismatched closing tag " + std::string(match));
323  }
324  beg += match.size();
325  if(*beg != '>') {
326  throw parse_error(beg, "Unterminated closing tag " + std::string(match));
327  }
328  ++beg;
329 }
330 
331 static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end);
332 static config parse_tag_contents(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view match, bool check_for_attributes)
333 {
334  assert(*beg == '>');
335  ++beg;
336  // This also parses the matching closing tag!
337  config res;
338  for(; check_for_attributes && beg != end && *beg != '<'; ++beg) {
339  if(isspace(*beg)) continue;
340  auto save_beg = beg;
341  try {
342  auto [key, val] = parse_attribute(beg, end, false);
343  res[key] = val;
344  } catch(parse_error&) {
345  beg = save_beg;
346  while(beg != end && isspace(*beg)) ++beg;
347  break;
348  }
349  }
350  if(res.has_attribute("text")) {
351  if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
352  throw parse_error(beg, "Extra text at the end of old-style tag with explicit 'text' attribute");
353  }
354  check_closing_tag(beg, end, match);
355  return res;
356  } else if(res.attribute_count() > 0) {
357  config text = parse_text_until(beg, end, '<');
358  if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
359  throw parse_error(beg, "Extra text at the end of old-style tag with explicit 'text' attribute");
360  }
361  if(text.all_children_count() == 1 && text.has_child("text")) {
362  res["text"] = text.mandatory_child("text")["text"];
363  } else {
364  res.append_children(text);
365  }
366  check_closing_tag(beg, end, match);
367  return res;
368  }
369  while(true) {
370  config text = parse_text_until(beg, end, '<');
371  if(beg == end || beg + 1 == end) {
372  throw parse_error(beg, "Missing closing tag for " + std::string(match));
373  }
374  res.append_children(text);
375  if(*(beg + 1) == '/') {
376  check_closing_tag(beg, end, match);
377  break;
378  }
379  auto [tag, contents] = parse_tag(beg, end);
380  res.add_child(tag, contents);
381  }
382  if(res.all_children_count() == 1 && res.has_child("text")) {
383  return res.mandatory_child("text");
384  }
385  return res;
386 }
387 
388 static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end)
389 {
390  assert(*beg == '<');
391  ++beg;
392  std::string tag_name = parse_name(beg, end);
393  if(tag_name.empty()) {
394  throw parse_error(beg, "missing tag name");
395  }
396  bool auto_closed = false;
397  config elem;
398  for(; beg != end && *beg != '>'; ++beg) {
399  if(isspace(*beg)) continue;
400  if(*beg == '/' && (beg + 1) != end && *(beg + 1) == '>') {
401  auto_closed = true;
402  } else if(isalnum(*beg) || *beg == '_') {
403  const auto& [key, value] = parse_attribute(beg, end, true);
404  if(beg == end) {
405  throw parse_error(beg, "unexpected end of stream following attribute");
406  }
407  elem[key] = value;
408  }
409  }
410  if(auto_closed) {
411  assert(*beg == '>');
412  ++beg;
413  } else {
414  config contents = parse_tag_contents(beg, end, tag_name, elem.attribute_count() == 0);
415  if(contents.all_children_count() == 0 && contents.attribute_count() == 1 && contents.has_attribute("text")) {
416  elem["text"] = contents["text"];
417  } else {
418  elem.append(contents);
419  }
420  }
421  return {tag_name, elem};
422 }
423 
424 config parse_text(const std::string& text)
425 {
426  config res;
427  auto beg = text.begin(), end = text.end();
428  try {
429  while(beg != end) {
430  if(*beg == '<') {
431  auto [tag, contents] = parse_tag(beg, end);
432  res.add_child(tag, contents);
433  } else {
434  config text = parse_text_until(beg, end, '<');
435  res.append_children(text);
436  }
437  }
438  } catch(parse_error& e) {
439  // NOTE: The text.begin() itor is in scope here, so we add the error location info
440  // to the error message here and rethrow. Both itors used in the call to
441  // position_info below can go out of scope otherwise.
442  e.message = position_info(text.begin(), e.error_location()) + ": " + e.message;
443  throw e;
444  }
445  return res;
446 }
447 
448 }
double t
Definition: astarsearch.cpp:63
A config object defines a single node in a WML file, with access to child nodes.
Definition: config.hpp:157
config & add_child(std::string_view key)
Definition: config.cpp:435
void append(const config &cfg)
Append data from another config object to this one.
Definition: config.cpp:187
std::size_t attribute_count() const
Count the number of non-blank attributes.
Definition: config.cpp:306
optional_config_impl< config > optional_child(std::string_view key, int n=0)
Equivalent to mandatory_child, but returns an empty optional if the nth child was not found.
Definition: config.cpp:379
std::size_t all_children_count() const
Definition: config.cpp:301
bool has_attribute(std::string_view key) const
Definition: config.cpp:156
bool has_child(std::string_view key) const
Determine whether a config has a child or not.
Definition: config.cpp:311
void append_children(const config &cfg)
Adds children from cfg.
Definition: config.cpp:166
config & mandatory_child(std::string_view key, int n=0)
Returns the nth child with the given key, or throws an error if there is none.
Definition: config.cpp:361
std::ostringstream wrapper.
Definition: formatter.hpp:40
Definitions for the interface to Wesnoth Markup Language (WML).
static std::string parse_name(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:239
static config parse_tag_contents(std::string::const_iterator &beg, std::string::const_iterator end, std::string_view match, bool check_for_attributes)
Definition: markup.cpp:332
std::string img(const std::string &src, const std::string &align, bool floating)
Generates a Help markup tag corresponding to an image.
Definition: markup.cpp:35
std::string make_link(const std::string &text, const std::string &dst)
Generates a Help markup tag corresponding to a reference or link.
Definition: markup.cpp:29
static std::string position_info(const std::string::const_iterator &text_start, const std::string::const_iterator &error_position)
Definition: markup.cpp:97
static std::pair< std::string, std::string > parse_attribute(std::string::const_iterator &beg, std::string::const_iterator end, bool allow_empty)
Definition: markup.cpp:248
static std::pair< std::string, config > parse_tag(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:388
static char parse_escape(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:179
static config parse_text_until(std::string::const_iterator &beg, std::string::const_iterator end, char close)
Definition: markup.cpp:190
std::string tag(std::string_view tag, Args &&... data)
Wraps the given data in the specified tag.
Definition: markup.hpp:45
static config parse_entity(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:110
static void check_closing_tag(std::string::const_iterator &beg, std::string::const_iterator end, std::string_view match)
Definition: markup.cpp:313
config parse_text(const std::string &text)
Parse a xml style marked up text string.
Definition: markup.cpp:424
std::size_t size(std::string_view str)
Length in characters of a UTF-8 string.
Definition: unicode.cpp:81
rect dst
Location on the final composed sheet.
rect src
Non-transparent portion of the surface to compose.
Thrown when the help system fails to parse something.
Definition: markup.hpp:213
static map_location::direction n
static map_location::direction s
#define e