The Battle for Wesnoth  1.19.15+dev
markup.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2024 - 2025
3  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 2 of the License, or
8  (at your option) any later version.
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY.
11 
12  See the COPYING file for more details.
13 */
14 
15 #include "serialization/markup.hpp"
16 
17 #include "config.hpp"
18 #include "formatter.hpp"
19 #include "game_config.hpp"
20 #include "gettext.hpp"
23 #include "serialization/unicode_cast.hpp" // for unicode_cast
24 
25 #include <algorithm>
26 
27 namespace markup {
28 
29 std::string make_link(const std::string& text, const std::string& dst)
30 {
31  // some sorting done on list of links may rely on the fact that text is first
32  return formatter() << "<ref dst='" << dst << "'>" << text << "</ref>";
33 }
34 
35 std::string img(const std::string& src, const std::string& align, bool floating)
36 {
37  return formatter()
38  << "<img src='" << src << "' "
39  << "float='" << std::boolalpha << floating << "' "
40  << "align='" << align << "' "
41  << "/>";
42 }
43 
44 //
45 // Markup Parser
46 //
47 
48 /*
49 
50 Here's a little mini-grammar of the markup language:
51 
52 DOCUMENT ::= (TEXT | TAG)*
53 TEXT ::= ([^<&\] | ENTITY | ESCAPE)*
54 ESCAPE ::= '\' [:unicode-char:]
55 ENTITY ::= '&' '#' [0-9]+ ';'
56 ENTITY ::= '&' '#' 'x' [0-9a-fA-F]+ ';'
57 ENTITY ::= '&' NAME ';'
58 TAG ::= '<' NAME ATTRIBUTE* '/' '>'
59 TAG ::= '<' NAME ATTRIBUTE* '>' DOCUMENT '<' '/' NAME '>' ## NB: the names must match!
60 TAG ::= '<' NAME '>' ATTRIBUTE* TEXT? '<' '/' NAME '>' ## NB: the names must match!
61 ATTRIBUTE ::= NAME
62 ATTRIBUTE ::= NAME '=' [^'" ]*
63 ATTRIBUTE ::= NAME '=' "'" TEXT "'"
64 ATTRIBUTE ::= NAME '=' '"' TEXT '"'
65 NAME ::= [_0-9a-zA-Z]+
66 
67 Notes:
68 * Entities and the first two tag formats are Pango-style. The tags can be nested inside each other.
69 * Escapes and the third tag format are for compatibility with the old help markup. Tags cannot be nested.
70 * This mostly doesn't attempt to define the meaning of specific tags or entity names. It does however substitute numeric entities, as well as some very basic named entities: lt, gt, amp, quot, apos.
71 * The definition of TEXT is left a bit nebulous, but just think of it as "non-greedy"
72 * Attributes without a value are only supported in Pango-style tags
73 * Some restrictions may apply beyond what the grammar specifies. For example, arbitrary named entities are not supported in attribute values (numeric ones and the 5 special ones work though).
74 
75 ------
76 
77 The result of the parsing is represented in the format of a WML config.
78 Text spans are represented as a [text] tag, and character entities as a [character_entity] tag.
79 All other tags are represented by a tag of the same name.
80 Any attributes on a tag become key-value pairs within the tag.
81 Old-style help markup tags with text at the end put the text in a "text" key in the tag.
82 The same approach is used for new-style Pango tags, but only if there are no nested tags or entities.
83 If there ARE nested tags or entities, the contents of the tag is broken down into spans as subtags of the parent tag.
84 Thus, a tag with content has EITHER a text attribute OR some subtags.
85 
86 Note: Only unrecognized named entities count for the above purposes!
87 Numerical entities and the special five lt, gt, amp, apos, quot are directly substituted in-place.
88 
89 Also, text spans will be broken up on paragraph breaks (double newlines).
90 This means that adjacent [text] tags should be rendered with a paragraph break between them.
91 However, no paragraph break should be used when [text] is followed by something else.
92 It is possible to have empty text spans in some cases, for example given a run of more than 2 newlines,
93 or a character entity directly followed by a paragraph break.
94 
95 */
96 
97 static std::string position_info(const std::string::const_iterator& text_start, const std::string::const_iterator& error_position)
98 {
99  // line numbers start from 1
100  int lines = std::count(text_start, error_position, '\n') + 1;
101  // Find the start position of the line where the current position is.
102  // We do this by searching in reverse from cursor_position toward text_start.
103  auto pos = error_position;
104  for(; pos != text_start && *pos != '\n'; pos--);
105  return formatter()
106  << "line " << lines
107  << ", character " << utf8::size(pos, error_position);
108 }
109 
110 static config parse_entity(std::string::const_iterator& beg, std::string::const_iterator end)
111 {
112  config entity;
113  std::stringstream s;
114  enum { UNKNOWN, NAMED, HEX, DECIMAL } type = UNKNOWN;
115  assert(*beg == '&');
116  ++beg;
117  for(; beg != end && *beg != ';'; ++beg) {
118  switch(type) {
119  case UNKNOWN:
120  if(*beg == '#') {
121  type = DECIMAL;
122  } else if(isalnum(*beg) || *beg == '_') {
123  type = NAMED;
124  s << *beg;
125  } else {
126  throw parse_error(beg, "invalid entity: unexpected characters after '&', alphanumeric characters, '#' or '_' expected.");
127  }
128  break;
129  case NAMED:
130  if(!isalnum(*beg)) {
131  throw parse_error(beg, "invalid entity: non-alphanumeric characters after '&'.");
132  }
133  s << *beg;
134  break;
135  case DECIMAL:
136  if(*beg == 'x') {
137  type = HEX;
138  } else if(isdigit(*beg)) {
139  s << *beg;
140  } else {
141  throw parse_error(beg, "invalid entity: unexpected characters after '&#', numbers or 'x' expected.");
142  }
143  break;
144  case HEX:
145  if(isxdigit(*beg)) {
146  s << *beg;
147  } else {
148  throw parse_error(beg, "invalid entity: unexpected characters after '&#x', hexadecimal digits expected.");
149  }
150  break;
151  }
152  }
153  if(type == NAMED) {
154  std::string name = s.str();
155  entity["name"] = name;
156  if(name == "lt") {
157  entity["code_point"] = '<';
158  } else if(name == "gt") {
159  entity["code_point"] = '>';
160  } else if(name == "apos") {
161  entity["code_point"] = '\'';
162  } else if(name == "quot") {
163  entity["code_point"] = '"';
164  } else if(name == "amp") {
165  entity["code_point"] = '&';
166  }
167  } else {
168  s.seekg(0);
169  if(type == HEX) {
170  s >> std::hex;
171  }
172  int n;
173  s >> n;
174  entity["code_point"] = n;
175  }
176  return entity;
177 }
178 
179 static char parse_escape(std::string::const_iterator& beg, std::string::const_iterator end)
180 {
181  assert(*beg == '\\');
182  // An escape at the end of stream is just treated as a literal.
183  // Otherwise, take the next character as a literal and be done with it.
184  if((beg + 1) != end) {
185  ++beg;
186  }
187  return *beg;
188 }
189 
190 static config parse_text_until(std::string::const_iterator& beg, std::string::const_iterator end, char close)
191 {
192  // In practice, close will be one of < ' "
193  // Parsing will go until either close or end of stream, and will emit one or more text and character_entity tags.
194  // However, recognized character entities will be collapsed into the text tags.
195  std::ostringstream s;
196  bool saw_newline = false;
197  config res;
198  for(; beg != end && *beg != close; ++beg) {
199  if(*beg == '&') {
200  auto entity = parse_entity(beg, end);
201  if(beg == end) {
202  throw parse_error(beg, "unexpected end of stream after entity");
203  }
204  if(entity.has_attribute("code_point")) {
205  s << unicode_cast<std::string>(entity["code_point"].to_int());
206  } else {
207  // TODO: Adding the text here seems wrong in the case that the stream BEGINS with an entity...
208  res.add_child("text", config("text", s.str()));
209  res.add_child("character_entity", entity);
210  s.str("");
211  }
212  } else if(*beg == '\\') {
213  s << parse_escape(beg, end);
214  } else if(*beg == '\n') {
215  if(saw_newline) {
216  res.add_child("text", config("text", s.str()));
217  s.str("");
218  } else {
219  saw_newline = true;
220  continue;
221  }
222  } else {
223  if(saw_newline) {
224  s << '\n';
225  }
226  s << *beg;
227  }
228  saw_newline = false;
229  }
230  // If the span ended in a newline, preserve it
231  if(saw_newline) {
232  s << '\n';
233  }
234  res.add_child("text", config("text", s.str()));
235  assert(beg == end || *beg == close);
236  return res;
237 }
238 
/**
 * Reads a NAME token: the longest run of ASCII letters, digits and underscores.
 *
 * @param beg Current position; on return points at the first character that is not
 *            part of the name (or equals @a end).
 * @param end End of the stream.
 * @return The name, which is empty if the first character cannot be part of one.
 */
static std::string parse_name(std::string::const_iterator& beg, std::string::const_iterator end)
{
	const auto name_start = beg;
	beg = std::find_if(beg, end, [](char c) {
		// isalnum is only defined for values representable as unsigned char, so the
		// bytes of multi-byte UTF-8 sequences must be converted before the call.
		return !(isalnum(static_cast<unsigned char>(c)) || c == '_');
	});
	return std::string(name_start, beg);
}
247 
248 static std::pair<std::string, std::string> parse_attribute(std::string::const_iterator& beg, std::string::const_iterator end, bool allow_empty)
249 {
250  std::string attr = parse_name(beg, end), value;
251  if(attr.empty()) {
252  throw parse_error(beg, "missing attribute name");
253  }
254  while(isspace(*beg)) ++beg;
255  if(*beg != '=') {
256  if(allow_empty) {
257  // The caller expects beg to point to the last character of the attribute upon return.
258  // But in this path, we're now pointing to the character AFTER that.
259  --beg;
260  return {attr, value};
261  } else throw parse_error(beg, "attribute missing value in old-style tag");
262  }
263  ++beg;
264  while(isspace(*beg)) ++beg;
265  if(*beg == '\'' || *beg == '"') {
266  config res = parse_text_until(beg, end, *beg++);
267  if(res.has_child("character_entity")) {
268  throw parse_error(beg, "unsupported entity in attribute value");
269  } else if(res.all_children_count() > 1) {
270  throw parse_error(beg, "paragraph break in attribute value");
271  }
272  if(auto t = res.optional_child("text")) {
273  value = t["text"].str();
274  }
275  } else {
276  std::ostringstream s;
277  bool found_slash = false;
278  for(; beg != end && *beg != '>' && *beg != '<' && !isspace(*beg); ++beg) {
279  if(*beg == '&') {
280  auto entity = parse_entity(beg, end);
281  if(beg == end) {
282  throw parse_error(beg, "unexpected end of stream after entity");
283  }
284  if(entity.has_attribute("code_point")) {
285  s << unicode_cast<std::string>(entity["code_point"].to_int());
286  } else {
287  throw parse_error(beg, "unsupported entity in attribute value");
288  }
289  } else if(*beg == '\\') {
290  s << parse_escape(beg, end);
291  } else if(*beg == '/') {
292  found_slash = true;
293  } else {
294  if(found_slash) {
295  s << '/';
296  found_slash = false;
297  }
298  s << *beg;
299  }
300  }
301  value = s.str();
302  // The caller expects beg to point to the last character of the attribute upon return.
303  // But in this path, we're now pointing to the character AFTER that.
304  --beg;
305  if(found_slash) --beg;
306  }
307  return {attr, value};
308 }
309 
310 static void check_closing_tag(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view match)
311 {
312  std::size_t remaining = end - beg;
313  assert(remaining >= 2 && *beg == '<' && *(beg + 1) == '/');
314  if(remaining < match.size() + 3) {
315  throw parse_error(beg, "Unexpected end of stream in closing tag");
316  }
317  beg += 2;
318  if(!std::equal(match.begin(), match.end(), beg)) {
319  throw parse_error(beg, "Mismatched closing tag " + std::string(match));
320  }
321  beg += match.size();
322  if(*beg != '>') {
323  throw parse_error(beg, "Unterminated closing tag " + std::string(match));
324  }
325  ++beg;
326 }
327 
328 static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end);
329 static config parse_tag_contents(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view match, bool check_for_attributes)
330 {
331  assert(*beg == '>');
332  ++beg;
333  // This also parses the matching closing tag!
334  config res;
335  for(; check_for_attributes && beg != end && *beg != '<'; ++beg) {
336  if(isspace(*beg)) continue;
337  auto save_beg = beg;
338  try {
339  auto [key, val] = parse_attribute(beg, end, false);
340  res[key] = val;
341  } catch(parse_error&) {
342  beg = save_beg;
343  while(beg != end && isspace(*beg)) ++beg;
344  break;
345  }
346  }
347  if(res.has_attribute("text")) {
348  if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
349  throw parse_error(beg, "Extra text at the end of old-style tag with explicit 'text' attribute");
350  }
351  check_closing_tag(beg, end, match);
352  return res;
353  } else if(res.attribute_count() > 0) {
354  config text = parse_text_until(beg, end, '<');
355  if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
356  throw parse_error(beg, "Extra text at the end of old-style tag with explicit 'text' attribute");
357  }
358  if(text.all_children_count() == 1 && text.has_child("text")) {
359  res["text"] = text.mandatory_child("text")["text"];
360  } else {
361  res.append_children(text);
362  }
363  check_closing_tag(beg, end, match);
364  return res;
365  }
366  while(true) {
367  config text = parse_text_until(beg, end, '<');
368  if(beg == end || beg + 1 == end) {
369  throw parse_error(beg, "Missing closing tag for " + std::string(match));
370  }
371  res.append_children(text);
372  if(*(beg + 1) == '/') {
373  check_closing_tag(beg, end, match);
374  break;
375  }
376  auto [tag, contents] = parse_tag(beg, end);
377  res.add_child(tag, contents);
378  }
379  if(res.all_children_count() == 1 && res.has_child("text")) {
380  return res.mandatory_child("text");
381  }
382  return res;
383 }
384 
385 static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end)
386 {
387  assert(*beg == '<');
388  ++beg;
389  std::string tag_name = parse_name(beg, end);
390  if(tag_name.empty()) {
391  throw parse_error(beg, "missing tag name");
392  }
393  bool auto_closed = false;
394  config elem;
395  for(; beg != end && *beg != '>'; ++beg) {
396  if(isspace(*beg)) continue;
397  if(*beg == '/' && (beg + 1) != end && *(beg + 1) == '>') {
398  auto_closed = true;
399  } else if(isalnum(*beg) || *beg == '_') {
400  const auto& [key, value] = parse_attribute(beg, end, true);
401  if(beg == end) {
402  throw parse_error(beg, "unexpected end of stream following attribute");
403  }
404  elem[key] = value;
405  }
406  }
407  if(auto_closed) {
408  assert(*beg == '>');
409  ++beg;
410  } else {
411  config contents = parse_tag_contents(beg, end, tag_name, elem.attribute_count() == 0);
412  if(contents.all_children_count() == 0 && contents.attribute_count() == 1 && contents.has_attribute("text")) {
413  elem["text"] = contents["text"];
414  } else {
415  elem.append(contents);
416  }
417  }
418  return {tag_name, elem};
419 }
420 
421 config parse_text(const std::string& text)
422 {
423  config res;
424  auto beg = text.begin(), end = text.end();
425  try {
426  while(beg != end) {
427  if(*beg == '<') {
428  auto [tag, contents] = parse_tag(beg, end);
429  res.add_child(tag, contents);
430  } else {
431  config text = parse_text_until(beg, end, '<');
432  res.append_children(text);
433  }
434  }
435  } catch(parse_error& e) {
436  // NOTE: The text.begin() itor is in scope here, so we add the error location info
437  // to the error message here and rethrow. Both itors used in the call to
438  // position_info below can go out of scope otherwise.
439  e.message = position_info(text.begin(), e.error_location()) + ": " + e.message;
440  throw e;
441  }
442  return res;
443 }
444 
445 }
double t
Definition: astarsearch.cpp:63
A config object defines a single node in a WML file, with access to child nodes.
Definition: config.hpp:158
void append(const config &cfg)
Append data from another config object to this one.
Definition: config.cpp:188
std::size_t attribute_count() const
Count the number of non-blank attributes.
Definition: config.cpp:307
config & mandatory_child(config_key_type key, int n=0)
Returns the nth child with the given key, or throws an error if there is none.
Definition: config.cpp:362
bool has_child(config_key_type key) const
Determine whether a config has a child or not.
Definition: config.cpp:312
bool has_attribute(config_key_type key) const
Definition: config.cpp:157
std::size_t all_children_count() const
Definition: config.cpp:302
void append_children(const config &cfg)
Adds children from cfg.
Definition: config.cpp:167
optional_config_impl< config > optional_child(config_key_type key, int n=0)
Equivalent to mandatory_child, but returns an empty optional if the nth child was not found.
Definition: config.cpp:380
config & add_child(config_key_type key)
Definition: config.cpp:436
std::ostringstream wrapper.
Definition: formatter.hpp:40
Definitions for the interface to Wesnoth Markup Language (WML).
static std::string parse_name(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:239
static config parse_tag_contents(std::string::const_iterator &beg, std::string::const_iterator end, std::string_view match, bool check_for_attributes)
Definition: markup.cpp:329
std::string img(const std::string &src, const std::string &align, bool floating)
Generates a Help markup tag corresponding to an image.
Definition: markup.cpp:35
std::string make_link(const std::string &text, const std::string &dst)
Generates a Help markup tag corresponding to a reference or link.
Definition: markup.cpp:29
static std::string position_info(const std::string::const_iterator &text_start, const std::string::const_iterator &error_position)
Definition: markup.cpp:97
static std::pair< std::string, std::string > parse_attribute(std::string::const_iterator &beg, std::string::const_iterator end, bool allow_empty)
Definition: markup.cpp:248
static std::pair< std::string, config > parse_tag(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:385
static char parse_escape(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:179
static config parse_text_until(std::string::const_iterator &beg, std::string::const_iterator end, char close)
Definition: markup.cpp:190
std::string tag(std::string_view tag, Args &&... data)
Wraps the given data in the specified tag.
Definition: markup.hpp:45
static config parse_entity(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:110
static void check_closing_tag(std::string::const_iterator &beg, std::string::const_iterator end, std::string_view match)
Definition: markup.cpp:310
config parse_text(const std::string &text)
Parse a xml style marked up text string.
Definition: markup.cpp:421
std::size_t size(std::string_view str)
Length in characters of a UTF-8 string.
Definition: unicode.cpp:81
rect dst
Location on the final composed sheet.
rect src
Non-transparent portion of the surface to compose.
Thrown when the help system fails to parse something.
Definition: markup.hpp:213
static map_location::direction n
static map_location::direction s
#define e