The Battle for Wesnoth  1.19.8+dev
markup.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2024
3  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 2 of the License, or
8  (at your option) any later version.
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY.
11 
12  See the COPYING file for more details.
13 */
14 
15 #include "serialization/markup.hpp"
16 
17 #include "config.hpp"
18 #include "game_config.hpp"
19 #include "gettext.hpp"
21 #include "serialization/unicode_cast.hpp" // for unicode_cast
22 
23 namespace markup {
24 
25 std::string make_link(const std::string& text, const std::string& dst)
26 {
27  // some sorting done on list of links may rely on the fact that text is first
28  return formatter() << "<ref dst='" << dst << "'>" << text << "</ref>";
29 }
30 
31 std::string img(const std::string& src, const std::string& align, bool floating)
32 {
33  return formatter()
34  << "<img src='" << src << "' "
35  << "float='" << std::boolalpha << floating << "' "
36  << "align='" << align << "' "
37  << "/>";
38 }
39 
40 //
41 // Markup Parser
42 //
43 
44 /*
45 
46 Here's a little mini-grammar of the markup language:
47 
48 DOCUMENT ::= (TEXT | TAG)*
49 TEXT ::= ([^<&\] | ENTITY | ESCAPE)*
50 ESCAPE ::= '\' [:unicode-char:]
51 ENTITY ::= '&' '#' [0-9]+ ';'
52 ENTITY ::= '&' '#' 'x' [0-9a-fA-F]+ ';'
53 ENTITY ::= '&' NAME ';'
54 TAG ::= '<' NAME ATTRIBUTE* '/' '>'
55 TAG ::= '<' NAME ATTRIBUTE* '>' DOCUMENT '<' '/' NAME '>' ## NB: the names must match!
56 TAG ::= '<' NAME '>' ATTRIBUTE* TEXT? '<' '/' NAME '>' ## NB: the names must match!
57 ATTRIBUTE ::= NAME
58 ATTRIBUTE ::= NAME '=' [^'" ]*
59 ATTRIBUTE ::= NAME '=' "'" TEXT "'"
60 ATTRIBUTE ::= NAME '=' '"' TEXT '"'
61 NAME ::= [_0-9a-zA-Z]+
62 
63 Notes:
64 * Entities and the first two tag formats are Pango-style. The tags can be nested inside each other.
65 * Escapes and the third tag format are for compatibility with the old help markup. Tags cannot be nested.
66 * This mostly doesn't attempt to define the meaning of specific tags or entity names. It does however substitute numeric entities, as well as some very basic named entities: lt, gt, amp, quot, apos.
67 * The definition of TEXT is left a bit nebulous, but just think of it as "non-greedy"
68 * Attributes without a value are only supported in Pango-style tags
69 * Some restrictions may apply beyond what the grammar specifies. For example, arbitrary named entities are not supported in attribute values (numeric ones and the 5 special ones work though).
70 
71 ------
72 
73 The result of the parsing is represented in the format of a WML config.
74 Text spans are represented as a [text] tag, and character entities as a [character_entity] tag.
75 All other tags are represented by a tag of the same name.
76 Any attributes on a tag become key-value pairs within the tag.
77 Old-style help markup tags with text at the end put the text in a "text" key in the tag.
78 The same approach is used for new-style Pango tags, but only if there are no nested tags or entities.
79 If there ARE nested tags or entities, the contents of the tag are broken down into spans as subtags of the parent tag.
80 Thus, a tag with content has EITHER a text attribute OR some subtags.
81 
82 Note: Only unrecognized named entities count for the above purposes!
83 Numerical entities and the special five lt, gt, amp, apos, quot are directly substituted in-place.
84 
85 Also, text spans will be broken up on paragraph breaks (double newlines).
86 This means that adjacent [text] tags should be rendered with a paragraph break between them.
87 However, no paragraph break should be used when [text] is followed by something else.
88 It is possible to have empty text spans in some cases, for example given a run of more than 2 newlines,
89 or a character entity directly followed by a paragraph break.
90 
91 */
92 static config parse_entity(std::string::const_iterator& beg, std::string::const_iterator end)
93 {
94  config entity;
95  std::stringstream s;
96  enum { UNKNOWN, NAMED, HEX, DECIMAL } type = UNKNOWN;
97  assert(*beg == '&');
98  ++beg;
99  for(; beg != end && *beg != ';'; ++beg) {
100  switch(type) {
101  case UNKNOWN:
102  if(*beg == '#') {
103  type = DECIMAL;
104  } else if(isalnum(*beg) || *beg == '_') {
105  type = NAMED;
106  s << *beg;
107  } else {
108  throw parse_error("TODO");
109  }
110  break;
111  case NAMED:
112  if(!isalnum(*beg)) {
113  throw parse_error("TODO");
114  }
115  s << *beg;
116  break;
117  case DECIMAL:
118  if(*beg == 'x') {
119  type = HEX;
120  } else if(isdigit(*beg)) {
121  s << *beg;
122  } else {
123  throw parse_error("TODO");
124  }
125  break;
126  case HEX:
127  if(isxdigit(*beg)) {
128  s << *beg;
129  } else {
130  throw parse_error("TODO");
131  }
132  break;
133  }
134  }
135  if(type == NAMED) {
136  std::string name = s.str();
137  entity["name"] = name;
138  if(name == "lt") {
139  entity["code_point"] = '<';
140  } else if(name == "gt") {
141  entity["code_point"] = '>';
142  } else if(name == "apos") {
143  entity["code_point"] = '\'';
144  } else if(name == "quot") {
145  entity["code_point"] = '"';
146  } else if(name == "amp") {
147  entity["code_point"] = '&';
148  }
149  } else {
150  s.seekg(0);
151  if(type == HEX) {
152  s >> std::hex;
153  }
154  int n;
155  s >> n;
156  entity["code_point"] = n;
157  }
158  return entity;
159 }
160 
/**
 * Resolves a backslash escape.
 *
 * On entry @a beg must point at the backslash. If another character follows, @a beg is
 * advanced onto it and that character is returned as a literal. A backslash that is the
 * last character of the input is returned as itself and @a beg is left unchanged.
 */
static char parse_escape(std::string::const_iterator& beg, std::string::const_iterator end)
{
	assert(*beg == '\\');
	const auto escaped = beg + 1;
	if(escaped == end) {
		// Nothing left to escape: the trailing backslash stands for itself.
		return *beg;
	}
	beg = escaped;
	return *beg;
}
171 
172 static config parse_text_until(std::string::const_iterator& beg, std::string::const_iterator end, char close)
173 {
174  // In practice, close will be one of < ' "
175  // Parsing will go until either close or eos, and will emit one or more text and character_entity tags.
176  // However, recognized character entities will be collapsed into the text tags.
177  std::ostringstream s;
178  bool saw_newline = false;
179  config res;
180  for(; beg != end && *beg != close; ++beg) {
181  if(*beg == '&') {
182  auto entity = parse_entity(beg, end);
183  if(beg == end) {
184  throw parse_error("unexpected eos after entity");
185  }
186  if(entity.has_attribute("code_point")) {
187  s << unicode_cast<std::string>(entity["code_point"].to_int());
188  } else {
189  // TODO: Adding the text here seems wrong in the case that the stream BEGINS with an entity...
190  res.add_child("text", config("text", s.str()));
191  res.add_child("character_entity", entity);
192  s.str("");
193  }
194  } else if(*beg == '\\') {
195  s << parse_escape(beg, end);
196  } else if(*beg == '\n') {
197  if(saw_newline) {
198  res.add_child("text", config("text", s.str()));
199  s.str("");
200  } else {
201  saw_newline = true;
202  continue;
203  }
204  } else {
205  if(saw_newline) {
206  s << '\n';
207  }
208  s << *beg;
209  }
210  saw_newline = false;
211  }
212  // If the span ended in a newline, preserve it
213  if(saw_newline) {
214  s << '\n';
215  }
216  res.add_child("text", config("text", s.str()));
217  assert(beg == end || *beg == close);
218  return res;
219 }
220 
/**
 * Consumes the longest run of name characters (alphanumerics and '_') starting at @a beg.
 *
 * On return @a beg points at the first character that is not part of the name, or equals
 * @a end. The returned name is empty if the very first character is not a name character.
 */
static std::string parse_name(std::string::const_iterator& beg, std::string::const_iterator end)
{
	const auto is_name_char = [](char c) {
		// isalnum() is undefined for negative values, which plain char takes for
		// bytes >= 0x80 (e.g. UTF-8 sequences) on platforms where char is signed.
		return c == '_' || isalnum(static_cast<unsigned char>(c)) != 0;
	};

	const auto name_begin = beg;
	while(beg != end && is_name_char(*beg)) {
		++beg;
	}
	return std::string(name_begin, beg);
}
229 
230 static std::pair<std::string, std::string> parse_attribute(std::string::const_iterator& beg, std::string::const_iterator end, bool allow_empty)
231 {
232  std::string attr = parse_name(beg, end), value;
233  if(attr.empty()) {
234  throw parse_error("missing attribute name");
235  }
236  while(isspace(*beg)) ++beg;
237  if(*beg != '=') {
238  if(allow_empty) {
239  // The caller expects beg to point to the last character of the attribute upon return.
240  // But in this path, we're now pointing to the character AFTER that.
241  --beg;
242  return {attr, value};
243  } else throw parse_error("attribute missing value in old-style tag");
244  }
245  ++beg;
246  while(isspace(*beg)) ++beg;
247  if(*beg == '\'' || *beg == '"') {
248  config res = parse_text_until(beg, end, *beg++);
249  if(res.has_child("character_entity")) {
250  throw parse_error("unsupported entity in attribute value");
251  } else if(res.all_children_count() > 1) {
252  throw parse_error("paragraph break in attribute value");
253  }
254  if(auto t = res.optional_child("text")) {
255  value = t["text"].str();
256  }
257  } else {
258  std::ostringstream s;
259  bool found_slash = false;
260  for(; beg != end && *beg != '>' && *beg != '<' && !isspace(*beg); ++beg) {
261  if(*beg == '&') {
262  auto entity = parse_entity(beg, end);
263  if(beg == end) {
264  throw parse_error("unexpected eos after entity");
265  }
266  if(entity.has_attribute("code_point")) {
267  s << unicode_cast<std::string>(entity["code_point"].to_int());
268  } else {
269  throw parse_error("unsupported entity in attribute value");
270  }
271  } else if(*beg == '\\') {
272  s << parse_escape(beg, end);
273  } else if(*beg == '/') {
274  found_slash = true;
275  } else {
276  if(found_slash) {
277  s << '/';
278  found_slash = false;
279  }
280  s << *beg;
281  }
282  }
283  value = s.str();
284  // The caller expects beg to point to the last character of the attribute upon return.
285  // But in this path, we're now pointing to the character AFTER that.
286  --beg;
287  if(found_slash) --beg;
288  }
289  return {attr, value};
290 }
291 
292 static void check_closing_tag(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view match)
293 {
294  size_t remaining = end - beg;
295  assert(remaining >= 2 && *beg == '<' && *(beg + 1) == '/');
296  if(remaining < match.size() + 3) {
297  throw parse_error("Unexpected eos in closing tag");
298  }
299  beg += 2;
300  if(!std::equal(match.begin(), match.end(), beg)) {
301  throw parse_error("Mismatched closing tag");
302  }
303  beg += match.size();
304  if(*beg != '>') {
305  throw parse_error("Unterminated closing tag");
306  }
307  ++beg;
308 }
309 
310 static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end);
311 static config parse_tag_contents(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view match, bool check_for_attributes)
312 {
313  assert(*beg == '>');
314  ++beg;
315  // This also parses the matching closing tag!
316  config res;
317  for(; check_for_attributes && beg != end && *beg != '<'; ++beg) {
318  if(isspace(*beg)) continue;
319  auto save_beg = beg;
320  try {
321  auto [key, val] = parse_attribute(beg, end, false);
322  res[key] = val;
323  } catch(parse_error&) {
324  beg = save_beg;
325  while(beg != end && isspace(*beg)) ++beg;
326  break;
327  }
328  }
329  if(res.has_attribute("text")) {
330  if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
331  throw parse_error("Extra text at the end of old-style tag with explicit 'text' attribute");
332  }
333  check_closing_tag(beg, end, match);
334  return res;
335  } else if(res.attribute_count() > 0) {
336  config text = parse_text_until(beg, end, '<');
337  if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
338  throw parse_error("Extra text at the end of old-style tag with explicit 'text' attribute");
339  }
340  if(text.all_children_count() == 1 && text.has_child("text")) {
341  res["text"] = text.mandatory_child("text")["text"];
342  } else {
343  res.append_children(text);
344  }
345  check_closing_tag(beg, end, match);
346  return res;
347  }
348  while(true) {
349  config text = parse_text_until(beg, end, '<');
350  if(beg == end || beg + 1 == end) {
351  throw parse_error("Missing closing tag");
352  }
353  res.append_children(text);
354  if(*(beg + 1) == '/') {
355  check_closing_tag(beg, end, match);
356  break;
357  }
358  auto [tag, contents] = parse_tag(beg, end);
359  res.add_child(tag, contents);
360  }
361  if(res.all_children_count() == 1 && res.has_child("text")) {
362  return res.mandatory_child("text");
363  }
364  return res;
365 }
366 
367 static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end)
368 {
369  assert(*beg == '<');
370  ++beg;
371  std::string tag_name = parse_name(beg, end);
372  if(tag_name.empty()) {
373  throw parse_error("missing tag name");
374  }
375  bool auto_closed = false;
376  config elem;
377  for(; beg != end && *beg != '>'; ++beg) {
378  if(isspace(*beg)) continue;
379  if(*beg == '/' && (beg + 1) != end && *(beg + 1) == '>') {
380  auto_closed = true;
381  } else if(isalnum(*beg) || *beg == '_') {
382  const auto& [key, value] = parse_attribute(beg, end, true);
383  if(beg == end) {
384  throw parse_error("unexpected eos following attribute");
385  }
386  elem[key] = value;
387  }
388  }
389  if(auto_closed) {
390  assert(*beg == '>');
391  ++beg;
392  } else {
393  config contents = parse_tag_contents(beg, end, tag_name, elem.attribute_count() == 0);
394  if(contents.all_children_count() == 0 && contents.attribute_count() == 1 && contents.has_attribute("text")) {
395  elem["text"] = contents["text"];
396  } else {
397  elem.append(contents);
398  }
399  }
400  return {tag_name, elem};
401 }
402 
403 config parse_text(const std::string &text)
404 {
405  config res;
406  auto beg = text.begin(), end = text.end();
407  while(beg != end) {
408  if(*beg == '<') {
409  auto [tag, contents] = parse_tag(beg, end);
410  res.add_child(tag, contents);
411  } else {
412  config text = parse_text_until(beg, end, '<');
413  res.append_children(text);
414  }
415  }
416  return res;
417 }
418 
419 }
double t
Definition: astarsearch.cpp:63
A config object defines a single node in a WML file, with access to child nodes.
Definition: config.hpp:158
void append(const config &cfg)
Append data from another config object to this one.
Definition: config.cpp:188
std::size_t attribute_count() const
Count the number of non-blank attributes.
Definition: config.cpp:307
config & mandatory_child(config_key_type key, int n=0)
Returns the nth child with the given key, or throws an error if there is none.
Definition: config.cpp:362
bool has_child(config_key_type key) const
Determine whether a config has a child or not.
Definition: config.cpp:312
bool has_attribute(config_key_type key) const
Definition: config.cpp:157
std::size_t all_children_count() const
Definition: config.cpp:302
void append_children(const config &cfg)
Adds children from cfg.
Definition: config.cpp:167
optional_config_impl< config > optional_child(config_key_type key, int n=0)
Equivalent to mandatory_child, but returns an empty optional if the nth child was not found.
Definition: config.cpp:380
config & add_child(config_key_type key)
Definition: config.cpp:436
std::ostringstream wrapper.
Definition: formatter.hpp:40
Definitions for the interface to Wesnoth Markup Language (WML).
static std::string parse_name(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:221
static config parse_tag_contents(std::string::const_iterator &beg, std::string::const_iterator end, std::string_view match, bool check_for_attributes)
Definition: markup.cpp:311
std::string img(const std::string &src, const std::string &align, bool floating)
Generates a Help markup tag corresponding to an image.
Definition: markup.cpp:31
std::string make_link(const std::string &text, const std::string &dst)
Generates a Help markup tag corresponding to a reference or link.
Definition: markup.cpp:25
static std::pair< std::string, std::string > parse_attribute(std::string::const_iterator &beg, std::string::const_iterator end, bool allow_empty)
Definition: markup.cpp:230
static std::pair< std::string, config > parse_tag(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:367
static char parse_escape(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:161
static config parse_text_until(std::string::const_iterator &beg, std::string::const_iterator end, char close)
Definition: markup.cpp:172
std::string tag(std::string_view tag, Args &&... data)
Wraps the given data in the specified formatting tag.
Definition: markup.hpp:50
static config parse_entity(std::string::const_iterator &beg, std::string::const_iterator end)
Definition: markup.cpp:92
static void check_closing_tag(std::string::const_iterator &beg, std::string::const_iterator end, std::string_view match)
Definition: markup.cpp:292
config parse_text(const std::string &text)
Parse a xml style marked up text string.
Definition: markup.cpp:403
rect dst
Location on the final composed sheet.
rect src
Non-transparent portion of the surface to compose.
Thrown when the help system fails to parse something.
Definition: markup.hpp:190
static map_location::direction n
static map_location::direction s