The Battle for Wesnoth  1.17.0-dev
unicode.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 - 2021
3  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 2 of the License, or
8  (at your option) any later version.
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY.
11 
12  See the COPYING file for more details.
13 */
14 
15 /**
16  * @file
17  * Unicode support functions.
18  */
19 
23 
24 #include "log.hpp"
25 
26 #include <cassert>
27 #include <limits>
28 
29 static lg::log_domain log_engine("engine");
30 #define ERR_GENERAL LOG_STREAM(err, lg::general())
31 
32 namespace utf8 {
33 
34 static int byte_size_from_utf8_first(const unsigned char ch)
35 {
36  if (!(ch & 0x80)) {
37  return 1; // US-ASCII character, 1 byte
38  }
39  /* first bit set: character not in US-ASCII, multiple bytes
40  * number of set bits at the beginning = bytes per character
41  * e.g. 11110xxx indicates a 4-byte character */
42  int count = count_leading_ones(ch);
43  if (count == 1 || count > 6) { // count > 4 after RFC 3629
44  throw invalid_utf8_exception(); // Stop on invalid characters
45  }
46  return count;
47 }
48 
49 std::string lowercase(const std::string& s)
50 {
51  if(!s.empty()) {
52  utf8::iterator itor(s);
53  std::string res;
54 
55  for(;itor != utf8::iterator::end(s); ++itor) {
56  char32_t uchar = *itor;
57  // If wchar_t is less than 32 bits wide, we cannot apply towlower() to all codepoints
58  if(uchar <= static_cast<char32_t>(std::numeric_limits<wchar_t>::max()))
59  uchar = towlower(static_cast<wchar_t>(uchar));
60  res += unicode_cast<std::string>(uchar);
61  }
62 
63  res.append(itor.substr().second, s.end());
64  return res;
65  }
66  return s;
67 }
68 
69 std::size_t index(const std::string& str, const std::size_t index)
70 {
71  // chr counts characters, i is the codepoint index
72  // remark: several functions rely on the fallback to str.length()
73  unsigned int i = 0, len = str.size();
74  try {
75  for (unsigned int chr=0; chr<index && i<len; ++chr) {
76  i += byte_size_from_utf8_first(str[i]);
77  }
78  } catch(const invalid_utf8_exception&) {
79  ERR_GENERAL << "Invalid UTF-8 string." << std::endl;
80  }
81  return i;
82 }
83 
84 std::size_t size(const std::string& str)
85 {
86  unsigned int chr, i = 0, len = str.size();
87  try {
88  for (chr=0; i<len; ++chr) {
89  i += byte_size_from_utf8_first(str[i]);
90  }
91  } catch(const invalid_utf8_exception&) {
92  ERR_GENERAL << "Invalid UTF-8 string." << std::endl;
93  }
94  return chr;
95 }
96 
97 std::string& insert(std::string& str, const std::size_t pos, const std::string& insert)
98 {
99  return str.insert(index(str, pos), insert);
100 }
101 
102 std::string& erase(std::string& str, const std::size_t start, const std::size_t len)
103 {
104  if (start > size(str)) return str;
105  unsigned pos = index(str, start);
106 
107  if (len == std::string::npos) {
108  // without second argument, std::string::erase truncates
109  return str.erase(pos);
110  } else {
111  return str.erase(pos, index(str,start+len) - pos);
112  }
113 }
114 
115 std::string& truncate(std::string& str, const std::size_t size)
116 {
117  return erase(str, size);
118 }
119 
120 void truncate_as_ucs4(std::string &str, const std::size_t size)
121 {
122  std::u32string u4_str = unicode_cast<std::u32string>(str);
123  if(u4_str.size() > size) {
124  u4_str.resize(size);
125  str = unicode_cast<std::string>(u4_str);
126  }
127 }
128 
129 } // end namespace utf8
#define uchar(c)
Definition: lstrlib.cpp:40
ucs4_convert_impl::enableif< TD, typename TS::value_type >::type unicode_cast(const TS &source)
void truncate_as_ucs4(std::string &str, const std::size_t size)
Truncates a UTF-8 string to the specified number of characters.
Definition: unicode.cpp:120
unsigned int count_leading_ones(N n)
Returns the quantity of leading 1 bits in n — i.e., the quantity of bits in n, minus the 1-based bit...
Definition: math.hpp:297
std::size_t size(const std::string &str)
Length in characters of a UTF-8 string.
Definition: unicode.cpp:84
#define ERR_GENERAL
Definition: unicode.cpp:30
std::string & truncate(std::string &str, const std::size_t size)
Truncates a UTF-8 string to the specified number of characters.
Definition: unicode.cpp:115
const std::pair< typename string_type::const_iterator, typename string_type::const_iterator > & substr() const
static int byte_size_from_utf8_first(const unsigned char ch)
Definition: unicode.cpp:34
std::size_t i
Definition: function.cpp:940
Functions for converting Unicode wide-char strings to UTF-8 encoded strings, back and forth...
Definition: unicode.cpp:32
Thrown by operations encountering invalid UTF-8 data.
static map_location::DIRECTION s
std::string & insert(std::string &str, const std::size_t pos, const std::string &insert)
Insert a UTF-8 string at the specified position.
Definition: unicode.cpp:97
std::size_t index(const std::string &str, const std::size_t index)
Codepoint index corresponding to the nth character in a UTF-8 string.
Definition: unicode.cpp:69
std::string lowercase(const std::string &s)
Returns a lowercased version of the string.
Definition: unicode.cpp:49
static iterator_base end(const string_type &str)
std::string & erase(std::string &str, const std::size_t start, const std::size_t len)
Erases a portion of a UTF-8 string.
Definition: unicode.cpp:102
static lg::log_domain log_engine("engine")
Standard logging facilities (interface).
EXIT_STATUS start(const std::string &filename, bool take_screenshot, const std::string &screenshot_filename)
Main interface for launching the editor from the title screen.
Definition: editor_main.cpp:29