The Battle for Wesnoth  1.19.21+dev
markup.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2024 - 2025
3  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 2 of the License, or
8  (at your option) any later version.
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY.
11 
12  See the COPYING file for more details.
13 */
14 
15 #include "serialization/markup.hpp"
16 
17 #include "config.hpp"
18 #include "font/pango/escape.hpp"
19 #include "formatter.hpp"
20 #include "game_config.hpp"
21 #include "gettext.hpp"
24 #include "serialization/unicode_cast.hpp" // for unicode_cast
25 #include "utils/general.hpp"
26 
27 #include <algorithm>
28 
29 namespace markup {
30 
31 std::string make_link(const std::string& text, const std::string& dst)
32 {
33  // some sorting done on list of links may rely on the fact that text is first
34  return formatter() << "<ref dst='" << dst << "'>" << text << "</ref>";
35 }
36 
37 std::string img(const std::string& src, const std::string& align, bool floating)
38 {
39  return formatter()
40  << "<img src='" << src << "' "
41  << "float='" << std::boolalpha << floating << "' "
42  << "align='" << align << "' "
43  << "/>";
44 }
45 
46 //
47 // Markup Parser
48 //
49 
50 /*
51 
52 Here's a little mini-grammar of the markup language:
53 
54 DOCUMENT ::= (TEXT | TAG)*
55 TEXT ::= ([^<&\] | ENTITY | ESCAPE)*
56 ESCAPE ::= '\' [:unicode-char:]
57 ENTITY ::= '&' '#' [0-9]+ ';'
58 ENTITY ::= '&' '#' 'x' [0-9a-fA-F]+ ';'
59 ENTITY ::= '&' NAME ';'
60 TAG ::= '<' NAME ATTRIBUTE* '/' '>'
61 TAG ::= '<' NAME ATTRIBUTE* '>' DOCUMENT '<' '/' NAME '>' ## NB: the names must match!
62 TAG ::= '<' NAME '>' ATTRIBUTE* TEXT? '<' '/' NAME '>' ## NB: the names must match!
63 ATTRIBUTE ::= NAME
64 ATTRIBUTE ::= NAME '=' [^'" ]*
65 ATTRIBUTE ::= NAME '=' "'" TEXT "'"
66 ATTRIBUTE ::= NAME '=' '"' TEXT '"'
67 NAME ::= [_0-9a-zA-Z]+
68 
69 Notes:
70 * Entities and the first two tag formats are Pango-style. The tags can be nested inside each other.
71 * Escapes and the third tag format are for compatibility with the old help markup. Tags cannot be nested.
72 * This mostly doesn't attempt to define the meaning of specific tags or entity names. It does however substitute numeric entities, as well as some very basic named entities: lt, gt, amp, quot, apos.
73 * The definition of TEXT is left a bit nebulous, but just think of it as "non-greedy"
74 * Attributes without a value are only supported in Pango-style tags
75 * Some restrictions may apply beyond what the grammar specifies. For example, arbitrary named entities are not supported in attribute values (numeric ones and the 5 special ones work though).
76 
77 ------
78 
79 The result of the parsing is represented in the format of a WML config.
80 Text spans are represented as a [text] tag, and character entities as a [character_entity] tag.
81 All other tags are represented by a tag of the same name.
82 Any attributes on a tag become key-value pairs within the tag.
83 Old-style help markup tags with text at the end put the text in a "text" key in the tag.
84 The same approach is used for new-style Pango tags, but only if there are no nested tags or entities.
85 If there ARE nested tags or entities, the contents of the tag is broken down into spans as subtags of the parent tag.
86 Thus, a tag with content has EITHER a text attribute OR some subtags.
87 
88 Note: Only unrecognized named entities count for the above purposes!
89 Numerical entities and the special five lt, gt, amp, apos, quot are directly substituted in-place.
90 
91 Also, text spans will be broken up on paragraph breaks (double newlines).
92 This means that adjacent [text] tags should be rendered with a paragraph break between them.
93 However, no paragraph break should be used when [text] is followed by something else.
94 It is possible to have empty text spans in some cases, for example given a run of more than 2 newlines,
95 or a character entity directly followed by a paragraph break.
96 
97 */
98 
namespace
{
using namespace std::string_literals;

/**
 * Tag names for which the legacy help-markup form is recognised, i.e. attributes
 * written after the opening tag. parse_tag_contents() only looks for such
 * attributes when the tag name appears in this list.
 */
const std::array<std::string, 7> old_style_tags{
	"bold"s, "italic"s, "header"s, "format"s, "img"s, "ref"s, "jump"s
};

/**
 * Attribute names accepted inside legacy help-markup tags. parse_attribute()
 * rejects any other name when called in old-style mode.
 */
const std::array<std::string, 12> old_style_attr{
	/*ref*/ "dst"s, "text"s, "force"s,
	/*jump*/ "to"s, "amount"s,
	/*img*/ "src"s, "align"s, "float"s,
	/*format*/ "bold"s, "italic"s, "color"s, "font_size"s
};
}
105 
106 static std::string position_info(const std::string::const_iterator& text_start, const std::string::const_iterator& error_position)
107 {
108  // line numbers start from 1
109  int lines = std::count(text_start, error_position, '\n') + 1;
110  // Find the start position of the line where the current position is.
111  // We do this by searching in reverse from cursor_position toward text_start.
112  auto pos = error_position;
113  for(; pos != text_start && *pos != '\n'; pos--);
114  return formatter()
115  << "line " << lines
116  << ", character " << utf8::size(pos, error_position);
117 }
118 
119 static config parse_entity(std::string::const_iterator& beg, std::string::const_iterator end)
120 {
121  config entity;
122  std::stringstream s;
123  enum { UNKNOWN, NAMED, HEX, DECIMAL } type = UNKNOWN;
124  assert(*beg == '&');
125  ++beg;
126  for(; beg != end && *beg != ';'; ++beg) {
127  switch(type) {
128  case UNKNOWN:
129  if(*beg == '#') {
130  type = DECIMAL;
131  } else if(isalnum(*beg) || *beg == '_') {
132  type = NAMED;
133  s << *beg;
134  } else {
135  throw parse_error(beg, "invalid entity: unexpected characters after '&', alphanumeric characters, '#' or '_' expected.");
136  }
137  break;
138  case NAMED:
139  if(!isalnum(*beg)) {
140  throw parse_error(beg, "invalid entity: non-alphanumeric characters after '&'.");
141  }
142  s << *beg;
143  break;
144  case DECIMAL:
145  if(*beg == 'x') {
146  type = HEX;
147  } else if(isdigit(*beg)) {
148  s << *beg;
149  } else {
150  throw parse_error(beg, "invalid entity: unexpected characters after '&#', numbers or 'x' expected.");
151  }
152  break;
153  case HEX:
154  if(isxdigit(*beg)) {
155  s << *beg;
156  } else {
157  throw parse_error(beg, "invalid entity: unexpected characters after '&#x', hexadecimal digits expected.");
158  }
159  break;
160  }
161  }
162  if(type == NAMED) {
163  std::string name = s.str();
164  entity["name"] = name;
165  if(name == "lt") {
166  entity["code_point"] = '<';
167  } else if(name == "gt") {
168  entity["code_point"] = '>';
169  } else if(name == "apos") {
170  entity["code_point"] = '\'';
171  } else if(name == "quot") {
172  entity["code_point"] = '"';
173  } else if(name == "amp") {
174  entity["code_point"] = '&';
175  }
176  } else {
177  s.seekg(0);
178  if(type == HEX) {
179  s >> std::hex;
180  }
181  int n;
182  s >> n;
183  entity["code_point"] = n;
184  }
185  return entity;
186 }
187 
188 static char parse_escape(std::string::const_iterator& beg, std::string::const_iterator end)
189 {
190  assert(*beg == '\\');
191  // An escape at the end of stream is just treated as a literal.
192  // Otherwise, take the next character as a literal and be done with it.
193  if((beg + 1) != end) {
194  ++beg;
195  }
196  return *beg;
197 }
198 
199 static config parse_text_until(std::string::const_iterator& beg, std::string::const_iterator end, char close)
200 {
201  // In practice, close will be one of < ' "
202  // Parsing will go until either close or end of stream, and will emit one or more text and character_entity tags.
203  // However, recognized character entities will be collapsed into the text tags.
204  std::ostringstream s;
205  bool saw_newline = false;
206  config res;
207  for(; beg != end && *beg != close; ++beg) {
208  if(*beg == '&') {
209  auto entity = parse_entity(beg, end);
210  if(beg == end) {
211  throw parse_error(beg, "unexpected end of stream after entity");
212  }
213  if(entity.has_attribute("code_point")) {
214  s << unicode_cast<std::string>(entity["code_point"].to_int());
215  } else {
216  // TODO: Adding the text here seems wrong in the case that the stream BEGINS with an entity...
217  res.add_child("text", config("text", s.str()));
218  res.add_child("character_entity", entity);
219  s.str("");
220  }
221  } else if(*beg == '\\') {
222  s << parse_escape(beg, end);
223  } else if(*beg == '\n') {
224  if(saw_newline) {
225  res.add_child("text", config("text", s.str()));
226  s.str("");
227  } else {
228  saw_newline = true;
229  continue;
230  }
231  } else {
232  if(saw_newline) {
233  s << '\n';
234  }
235  s << *beg;
236  }
237  saw_newline = false;
238  }
239  // If the span ended in a newline, preserve it
240  if(saw_newline) {
241  s << '\n';
242  }
243  res.add_child("text", config("text", s.str()));
244  assert(beg == end || *beg == close);
245  return res;
246 }
247 
/**
 * Reads a NAME token: the longest run of ASCII letters, digits and underscores
 * starting at @a beg.
 *
 * @param beg  Current position; advanced past the name.
 * @param end  End of the stream.
 * @return     The name, which is empty if the first character is not a name character.
 */
static std::string parse_name(std::string::const_iterator& beg, std::string::const_iterator end)
{
	const auto name_start = beg;
	// isalnum() is only defined for values in the unsigned char range, so bytes of
	// multi-byte UTF-8 sequences must be converted before classification.
	while(beg != end && (isalnum(static_cast<unsigned char>(*beg)) || *beg == '_')) {
		++beg;
	}
	return std::string(name_start, beg);
}
256 
257 static std::pair<std::string, std::string> parse_attribute(std::string::const_iterator& beg, std::string::const_iterator end, bool old_style)
258 {
259  std::string attr = parse_name(beg, end);
260  if(attr.empty()) {
261  throw parse_error(beg, "missing attribute name");
262  }
263  if(old_style && !utils::contains(old_style_attr, attr)) {
264  throw parse_error(beg, "dummy error: not an old-style attribute name"); // old-style=true caller ignores parse errors
265  }
266 
267  while(isspace(*beg)) ++beg;
268 
269  if(*beg != '=') {
270  if(old_style) {
271  throw parse_error(beg, "attribute missing value in old-style tag");
272  } else {
273  // The caller expects beg to point to the last character of the attribute upon return.
274  // But in this path, we're now pointing to the character AFTER that.
275  --beg;
276  return {attr, ""};
277  }
278  }
279  ++beg;
280  while(isspace(*beg)) ++beg;
281 
282  std::string value;
283  if(*beg == '\'' || *beg == '"') {
284  config res = parse_text_until(beg, end, *beg++);
285  if(res.has_child("character_entity")) {
286  throw parse_error(beg, "unsupported entity in attribute value");
287  } else if(res.all_children_count() > 1) {
288  throw parse_error(beg, "paragraph break in attribute value");
289  }
290  if(auto t = res.optional_child("text")) {
291  value = t["text"].str();
292  }
293  } else {
294  std::ostringstream s;
295  bool found_slash = false;
296  for(; beg != end && *beg != '>' && *beg != '<' && !isspace(*beg); ++beg) {
297  if(*beg == '&') {
298  auto entity = parse_entity(beg, end);
299  if(beg == end) {
300  throw parse_error(beg, "unexpected end of stream after entity");
301  }
302  if(entity.has_attribute("code_point")) {
303  s << unicode_cast<std::string>(entity["code_point"].to_int());
304  } else {
305  throw parse_error(beg, "unsupported entity in attribute value");
306  }
307  } else if(*beg == '\\') {
308  s << parse_escape(beg, end);
309  } else if(*beg == '/') {
310  found_slash = true;
311  } else {
312  if(found_slash) {
313  s << '/';
314  found_slash = false;
315  }
316  s << *beg;
317  }
318  }
319  value = s.str();
320  // The caller expects beg to point to the last character of the attribute upon return.
321  // But in this path, we're now pointing to the character AFTER that.
322  --beg;
323  if(found_slash) --beg;
324  }
325  return {attr, value};
326 }
327 
328 static void check_closing_tag(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view tag_name)
329 {
330  std::size_t remaining = end - beg;
331  assert(remaining >= 2 && *beg == '<' && *(beg + 1) == '/');
332  if(remaining < tag_name.size() + 3) {
333  throw parse_error(beg, "Unexpected end of stream in closing tag");
334  }
335  beg += 2;
336  if(!std::equal(tag_name.begin(), tag_name.end(), beg)) {
337  throw parse_error(beg, "Mismatched closing tag " + std::string(tag_name));
338  }
339  beg += tag_name.size();
340  if(*beg != '>') {
341  throw parse_error(beg, "Unterminated closing tag " + std::string(tag_name));
342  }
343  ++beg;
344 }
345 
346 static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end);
347 static config parse_tag_contents(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view tag_name, bool check_for_attributes)
348 {
349  assert(*beg == '>');
350  ++beg;
351 
352  if(!utils::contains(old_style_tags, tag_name)) {
353  check_for_attributes = false;
354  }
355 
356  // This also parses the matching closing tag!
357  config res;
358  for(; check_for_attributes && beg != end && *beg != '<'; ++beg) {
359  if(isspace(*beg)) continue;
360  auto save_beg = beg;
361  try {
362  auto [key, val] = parse_attribute(beg, end, true);
363  res[key] = val;
364  } catch(parse_error&) {
365  beg = save_beg;
366  while(beg != end && isspace(*beg)) ++beg;
367  break;
368  }
369  }
370  if(res.has_attribute("text")) {
371  if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
372  throw parse_error(beg, "Extra text at the end of old-style tag with explicit 'text' attribute");
373  }
374  check_closing_tag(beg, end, tag_name);
375  return res;
376  } else if(res.attribute_count() > 0) {
377  config text = parse_text_until(beg, end, '<');
378  if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
379  throw parse_error(beg, "Extra text at the end of old-style tag with explicit 'text' attribute");
380  }
381  if(text.all_children_count() == 1 && text.has_child("text")) {
382  res["text"] = text.mandatory_child("text")["text"];
383  } else {
384  res.append_children(text);
385  }
386  check_closing_tag(beg, end, tag_name);
387  return res;
388  }
389  while(true) {
390  config text = parse_text_until(beg, end, '<');
391  if(beg == end || beg + 1 == end) {
392  throw parse_error(beg, "Missing closing tag for " + std::string(tag_name));
393  }
394  res.append_children(text);
395  if(*(beg + 1) == '/') {
396  check_closing_tag(beg, end, tag_name);
397  break;
398  }
399  auto [tag, contents] = parse_tag(beg, end);
400  res.add_child(tag, contents);
401  }
402  if(res.all_children_count() == 1 && res.has_child("text")) {
403  return res.mandatory_child("text");
404  }
405  return res;
406 }
407 
408 static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end)
409 {
410  assert(*beg == '<');
411  ++beg;
412  std::string tag_name = parse_name(beg, end);
413  if(tag_name.empty()) {
414  throw parse_error(beg, "missing tag name");
415  }
416  bool auto_closed = false;
417  config elem;
418  for(; beg != end && *beg != '>'; ++beg) {
419  if(isspace(*beg)) continue;
420  if(*beg == '/' && (beg + 1) != end && *(beg + 1) == '>') {
421  auto_closed = true;
422  } else if(isalnum(*beg) || *beg == '_') {
423  const auto& [key, value] = parse_attribute(beg, end, false);
424  if(beg == end) {
425  throw parse_error(beg, "unexpected end of stream following attribute");
426  }
427  elem[key] = value;
428  }
429  }
430  if(auto_closed) {
431  assert(*beg == '>');
432  ++beg;
433  } else {
434  config contents = parse_tag_contents(beg, end, tag_name, elem.attribute_count() == 0);
435  if(contents.all_children_count() == 0 && contents.attribute_count() == 1 && contents.has_attribute("text")) {
436  elem["text"] = contents["text"];
437  } else {
438  elem.append(contents);
439  }
440  }
441  return {tag_name, elem};
442 }
443 
444 config parse_text(const std::string& text)
445 {
446  config res;
447  auto beg = text.begin(), end = text.end();
448  try {
449  while(beg != end) {
450  if(*beg == '<') {
451  auto [tag, contents] = parse_tag(beg, end);
452  res.add_child(tag, contents);
453  } else {
454  config text = parse_text_until(beg, end, '<');
455  res.append_children(text);
456  }
457  }
458  } catch(parse_error& e) {
459  // NOTE: The text.begin() itor is in scope here, so we add the error location info
460  // to the error message here and rethrow. Both itors used in the call to
461  // position_info below can go out of scope otherwise.
462  e.message = position_info(text.begin(), e.error_location()) + ": " + e.message;
463  throw e;
464  }
465  return res;
466 }
467 
468 static std::string config_to_pango_markup(const std::string& orig_tagname, const config& cfg) {
469  std::stringstream text;
470 
471  std::string tagname;
472  if(orig_tagname == "bold" || orig_tagname == "b") {
473  tagname = "b";
474  } else if(orig_tagname == "italic" || orig_tagname == "i") {
475  tagname = "i";
476  } else if(orig_tagname == "underline" || orig_tagname == "u" || orig_tagname == "ref") {
477  tagname = "u"; // <ref> tags will be shown as pango underline
478  } else if(orig_tagname == "format" || orig_tagname == "header" || orig_tagname == "h") {
479  tagname = "span";
480  } else if(orig_tagname == "img") {
481  return ""; // ignore
482  } else if(orig_tagname == "character_entity") {
483  return font::escape_text("&" + cfg["name"].str() + ";");
484  } else if(orig_tagname == "table") {
485  text << "\n";
486  for (const auto& row : cfg.child_range("row")) {
487  for (const auto& cell : row.child_range("col")) {
488  text << config_to_pango_markup("col", cell) << " ";
489  }
490  text << "\n";
491  }
492  return text.str();
493  } else {
494  // Anything that does not match any preceding if blocks
495  // gets its inner text extracted and returned, no special handling.
496  // This also handles plain text [text] blocks.
497  return font::escape_text(cfg["text"].str());
498  }
499 
500  // Inner text content
501  if(cfg.has_attribute("text")) {
502  text << font::escape_text(cfg["text"].str());
503  }
504 
505  // Tag specific formatting attributes
506  tag_attributes attrs;
507  if(orig_tagname == "span" || orig_tagname == "format") {
508  for(const auto& [key, val] : cfg.attribute_range()) {
509  attrs.emplace_back(key, val.str());
510  }
511  } else if(orig_tagname == "header" || orig_tagname == "h") {
512  attrs.emplace_back("weight", "heavy");
513  attrs.emplace_back("color", "white");
514  attrs.emplace_back("size", "large");
515  }
516 
517  // Nested tags
518  for(const auto& [tagname, cfg] : cfg.all_children_view()) {
519  text << config_to_pango_markup(tagname, cfg);
520  }
521 
522  return markup::tag_attr(tagname, attrs, text.str());
523 }
524 
525 std::string help_to_pango_markup(const std::string& help_markup) {
526  try {
527  const config& help_cfg = parse_text(help_markup);
528  std::stringstream pango_text;
529  std::string prev_tagname = "";
530  for(const auto& [tagname, child_cfg] : help_cfg.all_children_view()) {
531  // Two consecutive [text] blocks mean two paragraphs of text.
532  // Reinserting the paragraph break.
533  if(prev_tagname == "text" && tagname == "text") {
534  pango_text << "\n";
535  }
536 
537  pango_text << config_to_pango_markup(tagname, child_cfg);
538 
539  prev_tagname = tagname;
540  }
541  return pango_text.str();
542  } catch(const parse_error& e) {
544  "Error:[" + e.message + "]\n"
545  + font::escape_text(help_markup));
546  }
547 }
548 
549 }
double t
Definition: astarsearch.cpp:63
A config object defines a single node in a WML file, with access to child nodes.
Definition: config.hpp:157
config & add_child(std::string_view key)
Definition: config.cpp:436
void append(const config &cfg)
Append data from another config object to this one.
Definition: config.cpp:188
std::size_t attribute_count() const
Count the number of non-blank attributes.
Definition: config.cpp:307
optional_config_impl< config > optional_child(std::string_view key, int n=0)
Equivalent to mandatory_child, but returns an empty optional if the nth child was not found.
Definition: config.cpp:380
const_attr_itors attribute_range() const
Definition: config.cpp:740
auto all_children_view() const
In-order iteration over all children.
Definition: config.hpp:795
child_itors child_range(std::string_view key)
Definition: config.cpp:268
std::size_t all_children_count() const
Definition: config.cpp:302
bool has_attribute(std::string_view key) const
Definition: config.cpp:157
bool has_child(std::string_view key) const
Determine whether a config has a child or not.
Definition: config.cpp:312
void append_children(const config &cfg)
Adds children from cfg.
Definition: config.cpp:167
config & mandatory_child(std::string_view key, int n=0)
Returns the nth child with the given key, or throws an error if there is none.
Definition: config.cpp:362
std::ostringstream wrapper.
Definition: formatter.hpp:40
Definitions for the interface to Wesnoth Markup Language (WML).
const config * cfg
const color_t BAD_COLOR
std::string escape_text(std::string_view text)
Escapes the pango markup characters in a text.
Definition: escape.hpp:33
std::vector< std::pair< std::string_view, std::string_view > > tag_attributes
Definition: markup.hpp:53
static std::string parse_name(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:248
std::string img(const std::string &src, const std::string &align, bool floating)
Generates a Help markup tag corresponding to an image.
Definition: markup.cpp:37
std::string make_link(const std::string &text, const std::string &dst)
Generates a Help markup tag corresponding to a reference or link.
Definition: markup.cpp:31
static std::string position_info(const std::string::const_iterator &text_start, const std::string::const_iterator &error_position)
Definition: markup.cpp:106
static std::string config_to_pango_markup(const std::string &orig_tagname, const config &cfg)
Definition: markup.cpp:468
static config parse_tag_contents(std::string::const_iterator &beg, std::string::const_iterator end, std::string_view tag_name, bool check_for_attributes)
Definition: markup.cpp:347
std::string span_color(const color_t &color, Args &&... data)
Applies Pango markup to the input specifying its display color.
Definition: markup.hpp:110
static void check_closing_tag(std::string::const_iterator &beg, std::string::const_iterator end, std::string_view tag_name)
Definition: markup.cpp:328
std::string tag_attr(std::string_view tag, const tag_attributes &attrs, Args &&... data)
Wraps the given data in the specified tag.
Definition: markup.hpp:69
static std::pair< std::string, config > parse_tag(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:408
static char parse_escape(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:188
static config parse_text_until(std::string::const_iterator &beg, std::string::const_iterator end, char close)
Definition: markup.cpp:199
std::string tag(std::string_view tag, Args &&... data)
Wraps the given data in the specified tag.
Definition: markup.hpp:45
std::string help_to_pango_markup(const std::string &help_markup)
Definition: markup.cpp:525
static config parse_entity(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:119
static std::pair< std::string, std::string > parse_attribute(std::string::const_iterator &beg, std::string::const_iterator end, bool old_style)
Definition: markup.cpp:257
config parse_text(const std::string &text)
Parse a xml style marked up text string.
Definition: markup.cpp:444
std::size_t size(std::string_view str)
Length in characters of a UTF-8 string.
Definition: unicode.cpp:81
bool contains(const Container &container, const Value &value)
Returns true iff value is found in container.
Definition: general.hpp:87
rect dst
Location on the final composed sheet.
rect src
Non-transparent portion of the surface to compose.
Thrown when the help system fails to parse something.
Definition: markup.hpp:213
static map_location::direction n
static map_location::direction s
#define e