The Battle for Wesnoth  1.15.1+dev
ucs4_convert_impl.hpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 - 2018 by David White <dave@whitevine.net>
3  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 2 of the License, or
8  (at your option) any later version.
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY.
11 
12  See the COPYING file for more details.
13 */
14 
15 #pragma once
16 
17 #include "utf8_exception.hpp"
18 #include "utils/math.hpp"
19 #include <cassert>
20 
22 {
23  struct utf8_impl
24  {
25  static const char* get_name() { return "utf8"; }
26  static std::size_t byte_size_from_ucs4_codepoint(char32_t ch)
27  {
28  if(ch < (1u << 7))
29  return 1;
30  else if(ch < (1u << 11))
31  return 2;
32  else if(ch < (1u << 16))
33  return 3;
34  else if(ch < (1u << 21))
35  return 4;
36  else if(ch < (1u << 26))
37  return 5;
38  else if(ch < (1u << 31))
39  return 6;
40  else
41  throw utf8::invalid_utf8_exception(); // Invalid UCS-4
42  }
43 
44  static int byte_size_from_utf8_first(char ch)
45  {
46  if (!(ch & 0x80)) {
47  return 1; // US-ASCII character, 1 byte
48  }
49  /* first bit set: character not in US-ASCII, multiple bytes
50  * number of set bits at the beginning = bytes per character
51  * e.g. 11110xxx indicates a 4-byte character */
52  int count = count_leading_ones(ch);
53  if (count == 1 || count > 6) { // count > 4 after RFC 3629
54  throw utf8::invalid_utf8_exception(); // Stop on invalid characters
55  }
56  return count;
57  }
58 
59  /**
60  * Writes a UCS-4 character to a UTF-8 stream.
61  *
62  * @param out An object to write char. Required operations:
63  * 1) push(char) to write a single character
64  * 2) can_push(std::size_t n) to check whether there is still
65  * enough space for n characters.
66  * @param ch The UCS-4 character to write to the stream.
67  */
68  template<typename writer>
69  static inline void write(writer out, char32_t ch)
70  {
71  std::size_t count = byte_size_from_ucs4_codepoint(ch);
72  assert(out.can_push(count));
73  if(count == 1) {
74  out.push(static_cast<char>(ch));
75  } else {
76  for(int j = static_cast<int>(count) - 1; j >= 0; --j) {
77  unsigned char c = (ch >> (6 * j)) & 0x3f;
78  c |= 0x80;
79  if(j == static_cast<int>(count) - 1) {
80  c |= 0xff << (8 - count);
81  }
82  out.push(c);
83  }
84  }
85  }
86  /**
87  * Reads a UCS-4 character from a UTF-8 stream
88  *
89  * @param input An iterator pointing to the first character of a UTF-8
90  * sequence to read.
91  * @param end An iterator pointing to the end of the UTF-8 sequence
92  * to read.
93  */
94  template<typename iitor_t>
95  static inline char32_t read(iitor_t& input, const iitor_t& end)
96  {
97  assert(input != end);
98  std::size_t size = byte_size_from_utf8_first(*input);
99 
100  char32_t current_char = static_cast<unsigned char>(*input);
101 
102  // Convert the first character
103  if(size != 1) {
104  current_char &= 0xFF >> (size + 1);
105  }
106 
107  // Convert the continuation bytes
108  // i == number of '++input'
109  ++input;
110  for(std::size_t i = 1; i < size; ++i, ++input) {
111  // If the string ends occurs within an UTF8-sequence, this is bad.
112  if (input == end)
114 
115  if ((*input & 0xC0) != 0x80)
117 
118  current_char = (current_char << 6) | (static_cast<unsigned char>(*input) & 0x3F);
119  }
120  //i == size => input was increased size times.
121 
122  // Check for non-shortest-form encoding
123  // This has been forbidden in Unicode 3.1 for security reasons
124  if (size > byte_size_from_ucs4_codepoint(current_char))
126  return current_char;
127  }
128  };
129 
130  struct utf16_impl
131  {
132  static const char* get_name() { return "utf16"; }
133  template<typename writer>
134  static inline void write(writer out, char32_t ch)
135  {
136  const char32_t bit17 = 0x10000;
137 
138  if(ch < bit17)
139  {
140  assert(out.can_push(1));
141  out.push(static_cast<char16_t>(ch));
142  }
143  else
144  {
145  assert(out.can_push(2));
146  const char32_t char20 = ch - bit17;
147  assert(char20 < (1 << 20));
148  const char32_t lead = 0xD800 + (char20 >> 10);
149  const char32_t trail = 0xDC00 + (char20 & 0x3FF);
150  assert(lead < bit17);
151  assert(trail < bit17);
152  out.push(static_cast<char16_t>(lead));
153  out.push(static_cast<char16_t>(trail));
154  }
155  }
156 
157  template<typename iitor_t>
158  static inline char32_t read(iitor_t& input, const iitor_t& end)
159  {
160  const char32_t last10 = 0x3FF;
161  const char32_t type_filter = 0xFC00;
162  const char32_t type_lead = 0xD800;
163  const char32_t type_trail = 0xDC00;
164 
165  assert(input != end);
166  char32_t current_char = static_cast<char16_t>(*input);
167  ++input;
168  char32_t type = current_char & type_filter;
169  if(type == type_trail)
170  {
171  //found trail without head
173  }
174  else if(type == type_lead)
175  {
176  if(input == end)
177  {
178  //If the string ends occurs within an UTF16-sequence, this is bad.
180  }
181  if((*input & type_filter) != type_trail)
182  {
184  }
185  current_char &= last10;
186  current_char <<= 10;
187  current_char += (*input & last10);
188  current_char += 0x10000;
189  ++input;
190  }
191  return current_char;
192  }
193  };
194 
/**
 * Pass-through "codec" for UCS-4: each code point is exactly one unit.
 */
struct utf32_impl
{
    /** Returns the name of this encoding. */
    static const char* get_name() { return "UCS4"; }

    /**
     * Writes a UCS-4 character unchanged.
     *
     * @param out An object to write to. Required operations:
     *            1) push(char32_t) to write a single unit
     *            2) can_push(std::size_t n) to check whether there is still
     *               enough space for n units.
     * @param ch  The UCS-4 character to write.
     */
    template<typename writer>
    static inline void write(writer out, char32_t ch)
    {
        assert(out.can_push(1));
        out.push(ch);
    }

    /**
     * Reads one UCS-4 character and advances @a input by one position.
     *
     * @param input An iterator pointing to the unit to read. Must not
     *              equal @a end.
     * @param end   An iterator pointing to the end of the input; only
     *              consulted by the assertion.
     * @return The value @a input pointed to.
     */
    template<typename iitor_t>
    static inline char32_t read(iitor_t& input, const iitor_t& end)
    {
        assert(input != end);
        // Keeps `end` referenced when assert() is compiled out (NDEBUG).
        static_cast<void>(end);
        char32_t current_char = *input;
        ++input;
        return current_char;
    }
};
214 
215  template<typename T_CHAR>
216  struct convert_impl {};
217 
218  template<>
219  struct convert_impl<char>
220  {
221  typedef utf8_impl type;
222  };
223 
224  template<>
225  struct convert_impl<char16_t>
226  {
227  typedef utf16_impl type;
228  };
229 
230  template<>
231  struct convert_impl<wchar_t>
232  {
233  typedef utf16_impl type;
234  };
235 
236  template<>
237  struct convert_impl<char32_t>
238  {
239  typedef utf32_impl type;
240  };
241 }
static char32_t read(iitor_t &input, const iitor_t &end)
unsigned int count_leading_ones(N n)
Returns the quantity of leading 1 bits in n — i.e., the quantity of bits in n, minus the 1-based bit...
Definition: math.hpp:277
std::size_t size(const std::string &str)
Length in characters of a UTF-8 string.
Definition: unicode.cpp:86
General math utility functions.
static void write(writer out, char32_t ch)
Writes a UCS-4 character to a UTF-8 stream.
static char32_t read(iitor_t &input, const iitor_t &end)
Reads a UCS-4 character from a UTF-8 stream.
static const char * get_name()
std::size_t i
Definition: function.cpp:933
Thrown by operations encountering invalid UTF-8 data.
static void write(writer out, char32_t ch)
static int byte_size_from_utf8_first(char ch)
static int writer(lua_State *L, const void *b, size_t size, void *B)
Definition: lstrlib.cpp:182
static char32_t read(iitor_t &input, const iitor_t &end)
mock_char c
static void write(writer out, char32_t ch)
static std::size_t byte_size_from_ucs4_codepoint(char32_t ch)