The Battle for Wesnoth  1.19.5+dev
unicode.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 - 2024
3  by Philippe Plantier <ayin@anathas.org>
4  Copyright (C) 2005 by Guillaume Melquiond <guillaume.melquiond@gmail.com>
5  Copyright (C) 2003 by David White <dave@whitevine.net>
6  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
7 
8  This program is free software; you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation; either version 2 of the License, or
11  (at your option) any later version.
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY.
14 
15  See the COPYING file for more details.
16 */
17 
18 /**
19  * @file
20  * Unicode support functions.
21  */
22 
25 
26 #include "log.hpp"
27 
28 #include <limits>
29 
// Log domain object constructed with the name "engine". It is not referenced
// anywhere else in the visible part of this file.
30 static lg::log_domain log_engine("engine");
// Shorthand for the log stream that index() and size() write their
// "Invalid UTF-8 string." message to.
// NOTE(review): this expands to lg::general(), not log_engine — confirm that is intended.
31 #define ERR_GENERAL LOG_STREAM(err, lg::general())
32 
33 namespace utf8 {
34 
35 static int byte_size_from_utf8_first(const unsigned char ch)
36 {
37  if (!(ch & 0x80)) {
38  return 1; // US-ASCII character, 1 byte
39  }
40  /* first bit set: character not in US-ASCII, multiple bytes
41  * number of set bits at the beginning = bytes per character
42  * e.g. 11110xxx indicates a 4-byte character */
43  int count = count_leading_ones(ch);
44  if (count == 1 || count > 6) { // count > 4 after RFC 3629
45  throw invalid_utf8_exception(); // Stop on invalid characters
46  }
47  return count;
48 }
49 
50 std::string lowercase(const std::string& s)
51 {
52  if(!s.empty()) {
53  utf8::iterator itor(s);
54  std::string res;
55 
56  for(;itor != utf8::iterator::end(s); ++itor) {
57  char32_t uchar = *itor;
58  // If wchar_t is less than 32 bits wide, we cannot apply towlower() to all codepoints
59  if(uchar <= static_cast<char32_t>(std::numeric_limits<wchar_t>::max()))
60  uchar = towlower(static_cast<wchar_t>(uchar));
61  res += unicode_cast<std::string>(uchar);
62  }
63 
64  res.append(itor.substr().second, s.end());
65  return res;
66  }
67  return s;
68 }
69 
70 std::size_t index(const std::string& str, const std::size_t index)
71 {
72  // chr counts characters, i is the codepoint index
73  // remark: several functions rely on the fallback to str.length()
74  unsigned int i = 0, len = str.size();
75  try {
76  for (unsigned int chr=0; chr<index && i<len; ++chr) {
77  i += byte_size_from_utf8_first(str[i]);
78  }
79  } catch(const invalid_utf8_exception&) {
80  ERR_GENERAL << "Invalid UTF-8 string.";
81  }
82  return i;
83 }
84 
85 std::size_t size(const std::string& str)
86 {
87  unsigned int chr, i = 0, len = str.size();
88  try {
89  for (chr=0; i<len; ++chr) {
90  i += byte_size_from_utf8_first(str[i]);
91  }
92  } catch(const invalid_utf8_exception&) {
93  ERR_GENERAL << "Invalid UTF-8 string.";
94  }
95  return chr;
96 }
97 
98 std::string& insert(std::string& str, const std::size_t pos, const std::string& insert)
99 {
100  return str.insert(index(str, pos), insert);
101 }
102 
103 std::string& erase(std::string& str, const std::size_t start, const std::size_t len)
104 {
105  if (start > size(str)) return str;
106  unsigned pos = index(str, start);
107 
108  if (len == std::string::npos) {
109  // without second argument, std::string::erase truncates
110  return str.erase(pos);
111  } else {
112  return str.erase(pos, index(str,start+len) - pos);
113  }
114 }
115 
116 std::string& truncate(std::string& str, const std::size_t size)
117 {
118  return erase(str, size);
119 }
120 
121 void truncate_as_ucs4(std::string &str, const std::size_t size)
122 {
123  std::u32string u4_str = unicode_cast<std::u32string>(str);
124  if(u4_str.size() > size) {
125  u4_str.resize(size);
126  str = unicode_cast<std::string>(u4_str);
127  }
128 }
129 
130 } // end namespace utf8
static iterator_base end(const string_type &str)
const std::pair< typename string_type::const_iterator, typename string_type::const_iterator > & substr() const
Thrown by operations encountering invalid UTF-8 data.
std::size_t i
Definition: function.cpp:1028
Standard logging facilities (interface).
constexpr unsigned int count_leading_ones(N n)
Returns the quantity of leading 1 bits in n — i.e., the quantity of bits in n, minus the 1-based bit ...
Definition: math.hpp:179
EXIT_STATUS start(bool clear_id, const std::string &filename, bool take_screenshot, const std::string &screenshot_filename)
Main interface for launching the editor from the title screen.
Functions for converting Unicode wide-char strings to UTF-8 encoded strings, back and forth.
Definition: unicode.cpp:33
std::size_t index(const std::string &str, const std::size_t index)
Codepoint index corresponding to the nth character in a UTF-8 string.
Definition: unicode.cpp:70
std::string & insert(std::string &str, const std::size_t pos, const std::string &insert)
Insert a UTF-8 string at the specified position.
Definition: unicode.cpp:98
std::string lowercase(const std::string &s)
Returns a lowercased version of the string.
Definition: unicode.cpp:50
std::string & erase(std::string &str, const std::size_t start, const std::size_t len)
Erases a portion of a UTF-8 string.
Definition: unicode.cpp:103
void truncate_as_ucs4(std::string &str, const std::size_t size)
Truncates a UTF-8 string to the specified number of characters.
Definition: unicode.cpp:121
std::size_t size(const std::string &str)
Length in characters of a UTF-8 string.
Definition: unicode.cpp:85
static int byte_size_from_utf8_first(const unsigned char ch)
Definition: unicode.cpp:35
std::string & truncate(std::string &str, const std::size_t size)
Truncates a UTF-8 string to the specified number of characters.
Definition: unicode.cpp:116
static map_location::direction s
#define ERR_GENERAL
Definition: unicode.cpp:31
static lg::log_domain log_engine("engine")