The Battle for Wesnoth  1.19.5+dev
markup.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2024
3  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 2 of the License, or
8  (at your option) any later version.
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY.
11 
12  See the COPYING file for more details.
13 */
14 
15 
16 #include "game_config.hpp"
17 #include "gettext.hpp"
18 #include "serialization/markup.hpp"
19 #include "serialization/unicode_cast.hpp" // for unicode_cast
20 
21 namespace markup {
22 
23 std::string make_link(const std::string& text, const std::string& dst)
24 {
25  // some sorting done on list of links may rely on the fact that text is first
26  return "<ref dst='" + utils::escape(dst, "'\\") + "'>" + utils::escape(text, "'\\") + "</ref>";
27 }
28 
29 std::string img(const std::string& src, const std::string& align, const bool floating)
30 {
31  return formatter()
32  << "<img src='" << src << "' "
33  << "float='" << std::boolalpha << floating << "' "
34  << "align='" << align << "' "
35  << "/>";
36 }
37 
38 //
39 // Markup Parser
40 //
41 
42 /*
43 
44 Here's a little mini-grammar of the markup language:
45 
46 DOCUMENT ::= (TEXT | TAG)*
47 TEXT ::= ([^<&\] | ENTITY | ESCAPE)*
48 ESCAPE ::= '\' [:unicode-char:]
49 ENTITY ::= '&' '#' [0-9]+ ';'
50 ENTITY ::= '&' '#' 'x' [0-9a-fA-F]+ ';'
51 ENTITY ::= '&' NAME ';'
52 TAG ::= '<' NAME ATTRIBUTE* '/' '>'
53 TAG ::= '<' NAME ATTRIBUTE* '>' DOCUMENT '<' '/' NAME '>' ## NB: the names must match!
54 TAG ::= '<' NAME '>' ATTRIBUTE* TEXT? '<' '/' NAME '>' ## NB: the names must match!
55 ATTRIBUTE ::= NAME
56 ATTRIBUTE ::= NAME '=' [^'" ]*
57 ATTRIBUTE ::= NAME '=' "'" TEXT "'"
58 ATTRIBUTE ::= NAME '=' '"' TEXT '"'
59 NAME ::= [_0-9a-zA-Z]+
60 
61 Notes:
62 * Entities and the first two tag formats are Pango-style. The tags can be nested inside each other.
63 * Escapes and the third tag format are for compatibility with the old help markup. Tags cannot be nested.
64 * This mostly doesn't attempt to define the meaning of specific tags or entity names. It does however substitute numeric entities, as well as some very basic named entities: lt, gt, amp, quot, apos.
65 * The definition of TEXT is left a bit nebulous, but just think of it as "non-greedy"
66 * Attributes without a value are only supported in Pango-style tags
67 * Some restrictions may apply beyond what the grammar specifies. For example, arbitrary named entities are not supported in attribute values (numeric ones and the 5 special ones work though).
68 
69 ------
70 
71 The result of the parsing is represented in the format of a WML config.
72 Text spans are represented as a [text] tag, and character entities as a [character_entity] tag.
73 All other tags are represented by a tag of the same name.
74 Any attributes on a tag become key-value pairs within the tag.
75 Old-style help markup tags with text at the end put the text in a "text" key in the tag.
76 The same approach is used for new-style Pango tags, but only if there are no nested tags or entities.
77 If there ARE nested tags or entities, the contents of the tag is broken down into spans as subtags of the parent tag.
78 Thus, a tag with content has EITHER a text attribute OR some subtags.
79 
80 Note: Only unrecognized named entities count for the above purposes!
81 Numerical entities and the special five lt, gt, amp, apos, quot are directly substituted in-place.
82 
83 Also, text spans will be broken up on paragraph breaks (double newlines).
84 This means that adjacent [text] tags should be rendered with a paragraph break between them.
85 However, no paragraph break should be used when [text] is followed by something else.
86 It is possible to have empty text spans in some cases, for example given a run of more than 2 newlines,
87 or a character entity directly followed by a paragraph break.
88 
89 */
90 static config parse_entity(std::string::const_iterator& beg, std::string::const_iterator end)
91 {
92  config entity;
93  std::stringstream s;
94  enum { UNKNOWN, NAMED, HEX, DECIMAL } type = UNKNOWN;
95  assert(*beg == '&');
96  ++beg;
97  for(; beg != end && *beg != ';'; ++beg) {
98  switch(type) {
99  case UNKNOWN:
100  if(*beg == '#') {
101  type = DECIMAL;
102  } else if(isalnum(*beg) || *beg == '_') {
103  type = NAMED;
104  s << *beg;
105  } else {
106  throw parse_error("TODO");
107  }
108  break;
109  case NAMED:
110  if(!isalnum(*beg)) {
111  throw parse_error("TODO");
112  }
113  s << *beg;
114  break;
115  case DECIMAL:
116  if(*beg == 'x') {
117  type = HEX;
118  } else if(isdigit(*beg)) {
119  s << *beg;
120  } else {
121  throw parse_error("TODO");
122  }
123  break;
124  case HEX:
125  if(isxdigit(*beg)) {
126  s << *beg;
127  } else {
128  throw parse_error("TODO");
129  }
130  break;
131  }
132  }
133  if(type == NAMED) {
134  std::string name = s.str();
135  entity["name"] = name;
136  if(name == "lt") {
137  entity["code_point"] = '<';
138  } else if(name == "gt") {
139  entity["code_point"] = '>';
140  } else if(name == "apos") {
141  entity["code_point"] = '\'';
142  } else if(name == "quot") {
143  entity["code_point"] = '"';
144  } else if(name == "amp") {
145  entity["code_point"] = '&';
146  }
147  } else {
148  s.seekg(0);
149  if(type == HEX) {
150  s >> std::hex;
151  }
152  int n;
153  s >> n;
154  entity["code_point"] = n;
155  }
156  return entity;
157 }
158 
/**
 * Resolves a backslash escape.
 *
 * On entry @a beg must point at the backslash. If another character follows,
 * @a beg is advanced onto it and that character is returned as a literal.
 * A backslash that is the last character of the input is returned as-is and
 * @a beg is left untouched.
 */
static char parse_escape(std::string::const_iterator& beg, std::string::const_iterator end)
{
	assert(*beg == '\\');
	const auto following = beg + 1;
	if(following == end) {
		// Trailing backslash: treat it as a literal.
		return *beg;
	}
	beg = following;
	return *beg;
}
169 
170 static config parse_text_until(std::string::const_iterator& beg, std::string::const_iterator end, char close)
171 {
172  // In practice, close will be one of < ' "
173  // Parsing will go until either close or eos, and will emit one or more text and character_entity tags.
174  // However, recognized character entities will be collapsed into the text tags.
175  std::ostringstream s;
176  bool saw_newline = false;
177  config res;
178  for(; beg != end && *beg != close; ++beg) {
179  if(*beg == '&') {
180  auto entity = parse_entity(beg, end);
181  if(beg == end) {
182  throw parse_error("unexpected eos after entity");
183  }
184  if(entity.has_attribute("code_point")) {
185  s << unicode_cast<std::string>(entity["code_point"].to_int());
186  } else {
187  // TODO: Adding the text here seems wrong in the case that the stream BEGINS with an entity...
188  res.add_child("text", config("text", s.str()));
189  res.add_child("character_entity", entity);
190  s.str("");
191  }
192  } else if(*beg == '\\') {
193  s << parse_escape(beg, end);
194  } else if(*beg == '\n') {
195  if(saw_newline) {
196  res.add_child("text", config("text", s.str()));
197  s.str("");
198  } else {
199  saw_newline = true;
200  continue;
201  }
202  } else {
203  if(saw_newline) {
204  s << '\n';
205  }
206  s << *beg;
207  }
208  saw_newline = false;
209  }
210  // If the span ended in a newline, preserve it
211  if(saw_newline) {
212  s << '\n';
213  }
214  res.add_child("text", config("text", s.str()));
215  assert(beg == end || *beg == close);
216  return res;
217 }
218 
/**
 * Reads a NAME token: the longest run of ASCII letters, digits and
 * underscores starting at @a beg.
 *
 * On return @a beg points at the first character that is not part of the
 * name, or equals @a end.
 *
 * @return The name, which is empty if the first character does not qualify.
 */
static std::string parse_name(std::string::const_iterator& beg, std::string::const_iterator end)
{
	const auto name_start = beg;
	// isalnum is undefined for negative values, which plain char produces for
	// non-ASCII bytes, so classify as unsigned char.
	while(beg != end && (isalnum(static_cast<unsigned char>(*beg)) || *beg == '_')) {
		++beg;
	}
	return std::string(name_start, beg);
}
227 
228 static std::pair<std::string, std::string> parse_attribute(std::string::const_iterator& beg, std::string::const_iterator end, bool allow_empty)
229 {
230  std::string attr = parse_name(beg, end), value;
231  if(attr.empty()) {
232  throw parse_error("missing attribute name");
233  }
234  while(isspace(*beg)) ++beg;
235  if(*beg != '=') {
236  if(allow_empty) {
237  // The caller expects beg to point to the last character of the attribute upon return.
238  // But in this path, we're now pointing to the character AFTER that.
239  --beg;
240  return {attr, value};
241  } else throw parse_error("attribute missing value in old-style tag");
242  }
243  ++beg;
244  while(isspace(*beg)) ++beg;
245  if(*beg == '\'' || *beg == '"') {
246  config res = parse_text_until(beg, end, *beg++);
247  if(res.has_child("character_entity")) {
248  throw parse_error("unsupported entity in attribute value");
249  } else if(res.all_children_count() > 1) {
250  throw parse_error("paragraph break in attribute value");
251  }
252  if(auto t = res.optional_child("text")) {
253  value = t["text"].str();
254  }
255  } else {
256  std::ostringstream s;
257  bool found_slash = false;
258  for(; beg != end && *beg != '>' && *beg != '<' && !isspace(*beg); ++beg) {
259  if(*beg == '&') {
260  auto entity = parse_entity(beg, end);
261  if(beg == end) {
262  throw parse_error("unexpected eos after entity");
263  }
264  if(entity.has_attribute("code_point")) {
265  s << unicode_cast<std::string>(entity["code_point"].to_int());
266  } else {
267  throw parse_error("unsupported entity in attribute value");
268  }
269  } else if(*beg == '\\') {
270  s << parse_escape(beg, end);
271  } else if(*beg == '/') {
272  found_slash = true;
273  } else {
274  if(found_slash) {
275  s << '/';
276  found_slash = false;
277  }
278  s << *beg;
279  }
280  }
281  value = s.str();
282  // The caller expects beg to point to the last character of the attribute upon return.
283  // But in this path, we're now pointing to the character AFTER that.
284  --beg;
285  if(found_slash) --beg;
286  }
287  return {attr, value};
288 }
289 
290 static void check_closing_tag(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view match)
291 {
292  size_t remaining = end - beg;
293  assert(remaining >= 2 && *beg == '<' && *(beg + 1) == '/');
294  if(remaining < match.size() + 3) {
295  throw parse_error("Unexpected eos in closing tag");
296  }
297  beg += 2;
298  if(!std::equal(match.begin(), match.end(), beg)) {
299  throw parse_error("Mismatched closing tag");
300  }
301  beg += match.size();
302  if(*beg != '>') {
303  throw parse_error("Unterminated closing tag");
304  }
305  ++beg;
306 }
307 
308 static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end);
309 static config parse_tag_contents(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view match, bool check_for_attributes)
310 {
311  assert(*beg == '>');
312  ++beg;
313  // This also parses the matching closing tag!
314  config res;
315  for(; check_for_attributes && beg != end && *beg != '<'; ++beg) {
316  if(isspace(*beg)) continue;
317  auto save_beg = beg;
318  try {
319  auto [key, val] = parse_attribute(beg, end, false);
320  res[key] = val;
321  } catch(parse_error&) {
322  beg = save_beg;
323  while(beg != end && isspace(*beg)) ++beg;
324  break;
325  }
326  }
327  if(res.has_attribute("text")) {
328  if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
329  throw parse_error("Extra text at the end of old-style tag with explicit 'text' attribute");
330  }
331  check_closing_tag(beg, end, match);
332  return res;
333  } else if(res.attribute_count() > 0) {
334  config text = parse_text_until(beg, end, '<');
335  if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
336  throw parse_error("Extra text at the end of old-style tag with explicit 'text' attribute");
337  }
338  if(text.all_children_count() == 1 && text.has_child("text")) {
339  res["text"] = text.mandatory_child("text")["text"];
340  } else {
341  res.append_children(text);
342  }
343  check_closing_tag(beg, end, match);
344  return res;
345  }
346  while(true) {
347  config text = parse_text_until(beg, end, '<');
348  if(beg == end || beg + 1 == end) {
349  throw parse_error("Missing closing tag");
350  }
351  res.append_children(text);
352  if(*(beg + 1) == '/') {
353  check_closing_tag(beg, end, match);
354  break;
355  }
356  auto [tag, contents] = parse_tag(beg, end);
357  res.add_child(tag, contents);
358  }
359  if(res.all_children_count() == 1 && res.has_child("text")) {
360  return res.mandatory_child("text");
361  }
362  return res;
363 }
364 
365 static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end)
366 {
367  assert(*beg == '<');
368  ++beg;
369  std::string tag_name = parse_name(beg, end);
370  if(tag_name.empty()) {
371  throw parse_error("missing tag name");
372  }
373  bool auto_closed = false;
374  config elem;
375  for(; beg != end && *beg != '>'; ++beg) {
376  if(isspace(*beg)) continue;
377  if(*beg == '/' && (beg + 1) != end && *(beg + 1) == '>') {
378  auto_closed = true;
379  } else if(isalnum(*beg) || *beg == '_') {
380  const auto& [key, value] = parse_attribute(beg, end, true);
381  if(beg == end) {
382  throw parse_error("unexpected eos following attribute");
383  }
384  elem[key] = value;
385  }
386  }
387  if(auto_closed) {
388  assert(*beg == '>');
389  ++beg;
390  } else {
391  config contents = parse_tag_contents(beg, end, tag_name, elem.attribute_count() == 0);
392  if(contents.all_children_count() == 0 && contents.attribute_count() == 1 && contents.has_attribute("text")) {
393  elem["text"] = contents["text"];
394  } else {
395  elem.append(contents);
396  }
397  }
398  return {tag_name, elem};
399 }
400 
401 config parse_text(const std::string &text)
402 {
403  config res;
404  auto beg = text.begin(), end = text.end();
405  while(beg != end) {
406  if(*beg == '<') {
407  auto [tag, contents] = parse_tag(beg, end);
408  res.add_child(tag, contents);
409  } else {
410  config text = parse_text_until(beg, end, '<');
411  res.append_children(text);
412  }
413  }
414  return res;
415 }
416 
417 }
double t
Definition: astarsearch.cpp:63
A config object defines a single node in a WML file, with access to child nodes.
Definition: config.hpp:172
void append(const config &cfg)
Append data from another config object to this one.
Definition: config.cpp:202
std::size_t attribute_count() const
Count the number of non-blank attributes.
Definition: config.cpp:310
config & mandatory_child(config_key_type key, int n=0)
Returns the nth child with the given key, or throws an error if there is none.
Definition: config.cpp:365
bool has_child(config_key_type key) const
Determine whether a config has a child or not.
Definition: config.cpp:315
bool has_attribute(config_key_type key) const
Definition: config.cpp:156
std::size_t all_children_count() const
Definition: config.cpp:305
void append_children(const config &cfg)
Adds children from cfg.
Definition: config.cpp:166
optional_config_impl< config > optional_child(config_key_type key, int n=0)
Equivalent to mandatory_child, but returns an empty optional if the nth child was not found.
Definition: config.cpp:383
config & add_child(config_key_type key)
Definition: config.cpp:439
std::ostringstream wrapper.
Definition: formatter.hpp:40
static std::string parse_name(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:219
static config parse_tag_contents(std::string::const_iterator &beg, std::string::const_iterator end, std::string_view match, bool check_for_attributes)
Definition: markup.cpp:309
std::string make_link(const std::string &text, const std::string &dst)
Definition: markup.cpp:23
std::string img(const std::string &src, const std::string &align, const bool floating)
Definition: markup.cpp:29
std::string tag(const std::string &tag_name, Args &&... contents)
Returns the contents enclosed inside <tag_name> and </tag_name>
Definition: markup.hpp:35
static std::pair< std::string, std::string > parse_attribute(std::string::const_iterator &beg, std::string::const_iterator end, bool allow_empty)
Definition: markup.cpp:228
static std::pair< std::string, config > parse_tag(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:365
static char parse_escape(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:159
static config parse_text_until(std::string::const_iterator &beg, std::string::const_iterator end, char close)
Definition: markup.cpp:170
static config parse_entity(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:90
static void check_closing_tag(std::string::const_iterator &beg, std::string::const_iterator end, std::string_view match)
Definition: markup.cpp:290
config parse_text(const std::string &text)
Parse a xml style marked up text string.
Definition: markup.cpp:401
std::string escape(const std::string &str, const char *special_chars)
Prepends a configurable set of characters with a backslash.
rect dst
Location on the final composed sheet.
rect src
Non-transparent portion of the surface to compose.
Thrown when the help system fails to parse something.
Definition: markup.hpp:135
static map_location::direction n
static map_location::direction s