The Battle for Wesnoth  1.15.0-dev
unicode.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 by David White <dave@whitevine.net>
3  Copyright (C) 2005 by Guillaume Melquiond <guillaume.melquiond@gmail.com>
4  Copyright (C) 2005 - 2018 by Philippe Plantier <ayin@anathas.org>
5  Part of the Battle for Wesnoth Project http://www.wesnoth.org/
6 
7  This program is free software; you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation; either version 2 of the License, or
10  (at your option) any later version.
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY.
13 
14  See the COPYING file for more details.
15 */
16 
17 /**
18  * @file
19  * Unicode support functions.
20  */
21 
25 
26 #include "log.hpp"
27 
28 #include <cassert>
29 #include <limits>
30 
// Log domain handle registered under the name "engine".
// NOTE(review): nothing in the visible part of this file refers to log_engine — confirm it is still needed.
31 static lg::log_domain log_engine("engine");
// Error-level log stream used by the UTF-8 helpers below. It writes to lg::general(), not to log_engine.
32 #define ERR_GENERAL LOG_STREAM(err, lg::general())
33 
34 namespace utf8 {
35 
36 static int byte_size_from_utf8_first(const unsigned char ch)
37 {
38  if (!(ch & 0x80)) {
39  return 1; // US-ASCII character, 1 byte
40  }
41  /* first bit set: character not in US-ASCII, multiple bytes
42  * number of set bits at the beginning = bytes per character
43  * e.g. 11110xxx indicates a 4-byte character */
44  int count = count_leading_ones(ch);
45  if (count == 1 || count > 6) { // count > 4 after RFC 3629
46  throw invalid_utf8_exception(); // Stop on invalid characters
47  }
48  return count;
49 }
50 
51 std::string lowercase(const std::string& s)
52 {
53  if(!s.empty()) {
54  utf8::iterator itor(s);
55  std::string res;
56 
57  for(;itor != utf8::iterator::end(s); ++itor) {
58  char32_t uchar = *itor;
59  // If wchar_t is less than 32 bits wide, we cannot apply towlower() to all codepoints
60  if(uchar <= static_cast<char32_t>(std::numeric_limits<wchar_t>::max()))
61  uchar = towlower(static_cast<wchar_t>(uchar));
62  res += unicode_cast<std::string>(uchar);
63  }
64 
65  res.append(itor.substr().second, s.end());
66  return res;
67  }
68  return s;
69 }
70 
71 std::size_t index(const std::string& str, const std::size_t index)
72 {
73  // chr counts characters, i is the codepoint index
74  // remark: several functions rely on the fallback to str.length()
75  unsigned int i = 0, len = str.size();
76  try {
77  for (unsigned int chr=0; chr<index && i<len; ++chr) {
78  i += byte_size_from_utf8_first(str[i]);
79  }
80  } catch(const invalid_utf8_exception&) {
81  ERR_GENERAL << "Invalid UTF-8 string." << std::endl;
82  }
83  return i;
84 }
85 
86 std::size_t size(const std::string& str)
87 {
88  unsigned int chr, i = 0, len = str.size();
89  try {
90  for (chr=0; i<len; ++chr) {
91  i += byte_size_from_utf8_first(str[i]);
92  }
93  } catch(const invalid_utf8_exception&) {
94  ERR_GENERAL << "Invalid UTF-8 string." << std::endl;
95  }
96  return chr;
97 }
98 
99 std::string& insert(std::string& str, const std::size_t pos, const std::string& insert)
100 {
101  return str.insert(index(str, pos), insert);
102 }
103 
104 std::string& erase(std::string& str, const std::size_t start, const std::size_t len)
105 {
106  if (start > size(str)) return str;
107  unsigned pos = index(str, start);
108 
109  if (len == std::string::npos) {
110  // without second argument, std::string::erase truncates
111  return str.erase(pos);
112  } else {
113  return str.erase(pos, index(str,start+len) - pos);
114  }
115 }
116 
117 std::string& truncate(std::string& str, const std::size_t size)
118 {
119  return erase(str, size);
120 }
121 
122 void truncate_as_ucs4(std::string &str, const std::size_t size)
123 {
124  std::u32string u4_str = unicode_cast<std::u32string>(str);
125  if(u4_str.size() > size) {
126  u4_str.resize(size);
127  str = unicode_cast<std::string>(u4_str);
128  }
129 }
130 
131 } // end namespace utf8
#define uchar(c)
Definition: lstrlib.cpp:39
ucs4_convert_impl::enableif< TD, typename TS::value_type >::type unicode_cast(const TS &source)
void truncate_as_ucs4(std::string &str, const std::size_t size)
Truncates a UTF-8 string to the specified number of characters.
Definition: unicode.cpp:122
unsigned int count_leading_ones(N n)
Returns the quantity of leading 1 bits in n — i.e., the quantity of bits in n, minus the 1-based bit...
Definition: math.hpp:277
std::size_t size(const std::string &str)
Length in characters of a UTF-8 string.
Definition: unicode.cpp:86
#define ERR_GENERAL
Definition: unicode.cpp:32
std::string & truncate(std::string &str, const std::size_t size)
Truncates a UTF-8 string to the specified number of characters.
Definition: unicode.cpp:117
const std::pair< typename string_type::const_iterator, typename string_type::const_iterator > & substr() const
static int byte_size_from_utf8_first(const unsigned char ch)
Definition: unicode.cpp:36
std::size_t i
Definition: function.cpp:933
Functions for converting Unicode wide-char strings to UTF-8 encoded strings, back and forth...
Definition: unicode.cpp:34
Thrown by operations encountering invalid UTF-8 data.
EXIT_STATUS start(const config &game_conf, const std::string &filename, bool take_screenshot, const std::string &screenshot_filename)
Main interface for launching the editor from the title screen.
Definition: editor_main.cpp:28
static map_location::DIRECTION s
std::string & insert(std::string &str, const std::size_t pos, const std::string &insert)
Insert a UTF-8 string at the specified position.
Definition: unicode.cpp:99
std::size_t index(const std::string &str, const std::size_t index)
Byte index (offset into the std::string) at which the nth character of a UTF-8 string starts.
Definition: unicode.cpp:71
std::string lowercase(const std::string &s)
Returns a lowercased version of the string.
Definition: unicode.cpp:51
static iterator_base end(const string_type &str)
std::string & erase(std::string &str, const std::size_t start, const std::size_t len)
Erases a portion of a UTF-8 string.
Definition: unicode.cpp:104
static lg::log_domain log_engine("engine")
Standard logging facilities (interface).