The Battle for Wesnoth  1.19.15+dev
unicode.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 - 2025
3  by Philippe Plantier <ayin@anathas.org>
4  Copyright (C) 2005 by Guillaume Melquiond <guillaume.melquiond@gmail.com>
5  Copyright (C) 2003 by David White <dave@whitevine.net>
6  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
7 
8  This program is free software; you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation; either version 2 of the License, or
11  (at your option) any later version.
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY.
14 
15  See the COPYING file for more details.
16 */
17 
18 /**
19  * @file
20  * Unicode support functions.
21  */
22 
25 
26 #include "log.hpp"
27 
28 #include <limits>
29 
// File-local log domain object constructed with the name "engine".
// NOTE(review): nothing in the visible part of this file references it — confirm before removing.
static lg::log_domain log_engine("engine");
// Shorthand that expands to LOG_STREAM(err, lg::general()).
// NOTE(review): no function visible in this file uses this macro.
#define ERR_GENERAL LOG_STREAM(err, lg::general())
32 
33 namespace utf8 {
34 
35 static int byte_size_from_utf8_first(const unsigned char ch)
36 {
37  if (!(ch & 0x80)) {
38  return 1; // US-ASCII character, 1 byte
39  }
40  /* first bit set: character not in US-ASCII, multiple bytes
41  * number of set bits at the beginning = bytes per character
42  * e.g. 11110xxx indicates a 4-byte character */
43  int count = count_leading_ones(ch);
44  if (count == 1 || count > 6) { // count > 4 after RFC 3629
45  throw invalid_utf8_exception(); // Stop on invalid characters
46  }
47  return count;
48 }
49 
50 std::string lowercase(std::string_view s)
51 {
52  if(!s.empty()) {
53  utf8::iterator itor(s);
54  std::string res;
55 
56  for(;itor != utf8::iterator::end(s); ++itor) {
57  char32_t uchar = *itor;
58  // If wchar_t is less than 32 bits wide, we cannot apply towlower() to all codepoints
59  if(uchar <= static_cast<char32_t>(std::numeric_limits<wchar_t>::max()))
60  uchar = towlower(static_cast<wchar_t>(uchar));
61  res += unicode_cast<std::string>(uchar);
62  }
63 
64  res.append(itor.substr().second, s.end());
65  return res;
66  }
67  return std::string();
68 }
69 
70 std::size_t index(std::string_view str, const std::size_t index)
71 {
72  // chr counts characters, i is the codepoint index
73  // remark: several functions rely on the fallback to str.length()
74  unsigned int i = 0, len = str.size();
75  for(unsigned int chr = 0; chr < index && i < len; ++chr) {
76  i += byte_size_from_utf8_first(str[i]);
77  }
78  return i;
79 }
80 
81 std::size_t size(std::string_view str)
82 {
83  std::size_t chr, i = 0, len = str.size();
84  for(chr = 0; i < len; ++chr) {
85  i += byte_size_from_utf8_first(str[i]);
86  }
87  return chr;
88 }
89 
90 std::size_t size(const std::string::const_iterator& start, const std::string::const_iterator& end)
91 {
92  std::size_t count;
93  std::string::const_iterator pos = start;
94  for(count = 0; pos < end; ++count) {
95  pos += byte_size_from_utf8_first(*pos);
96  }
97  return count;
98 }
99 
100 std::string& insert(std::string& str, const std::size_t pos, const std::string& insert)
101 {
102  return str.insert(index(str, pos), insert);
103 }
104 
105 std::string& erase(std::string& str, const std::size_t start, const std::size_t len)
106 {
107  if (start > size(str)) return str;
108  unsigned pos = index(str, start);
109 
110  if (len == std::string::npos) {
111  // without second argument, std::string::erase truncates
112  return str.erase(pos);
113  } else {
114  return str.erase(pos, index(str,start+len) - pos);
115  }
116 }
117 
118 std::string& truncate(std::string& str, const std::size_t size)
119 {
120  return erase(str, size);
121 }
122 
123 void truncate_as_ucs4(std::string &str, const std::size_t size)
124 {
125  std::u32string u4_str = unicode_cast<std::u32string>(str);
126  if(u4_str.size() > size) {
127  u4_str.resize(size);
128  str = unicode_cast<std::string>(u4_str);
129  }
130 }
131 
132 } // end namespace utf8
static iterator_base end(const string_type &str)
const std::pair< typename string_type::const_iterator, typename string_type::const_iterator > & substr() const
Thrown by operations encountering invalid UTF-8 data.
std::size_t i
Definition: function.cpp:1032
Standard logging facilities (interface).
constexpr unsigned int count_leading_ones(N n)
Returns the quantity of leading 1 bits in n — i.e., the quantity of bits in n, minus the 1-based bit ...
Definition: math.hpp:179
EXIT_STATUS start(bool clear_id, const std::string &filename, bool take_screenshot, const std::string &screenshot_filename)
Main interface for launching the editor from the title screen.
Functions for converting Unicode wide-char strings to UTF-8 encoded strings, back and forth.
Definition: unicode.cpp:33
std::string lowercase(std::string_view s)
Returns a lowercased version of the string.
Definition: unicode.cpp:50
std::string & insert(std::string &str, const std::size_t pos, const std::string &insert)
Insert a UTF-8 string at the specified position.
Definition: unicode.cpp:100
std::string & erase(std::string &str, const std::size_t start, const std::size_t len)
Erases a portion of a UTF-8 string.
Definition: unicode.cpp:105
void truncate_as_ucs4(std::string &str, const std::size_t size)
Truncates a UTF-8 string to the specified number of characters.
Definition: unicode.cpp:123
std::size_t size(std::string_view str)
Length in characters of a UTF-8 string.
Definition: unicode.cpp:81
static int byte_size_from_utf8_first(const unsigned char ch)
Definition: unicode.cpp:35
std::string & truncate(std::string &str, const std::size_t size)
Truncates a UTF-8 string to the specified number of characters.
Definition: unicode.cpp:118
std::size_t index(std::string_view str, const std::size_t index)
Codepoint index corresponding to the nth character in a UTF-8 string.
Definition: unicode.cpp:70
static map_location::direction s
static lg::log_domain log_engine("engine")