The Battle for Wesnoth  1.19.0-dev
ucs4_convert_impl.hpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 - 2024
3  by David White <dave@whitevine.net>
4  Part of the Battle for Wesnoth Project https://www.wesnoth.org/
5 
6  This program is free software; you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation; either version 2 of the License, or
9  (at your option) any later version.
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY.
12 
13  See the COPYING file for more details.
14 */
15 
16 #pragma once
17 
18 #include "utf8_exception.hpp"
19 #include "utils/math.hpp"
20 #include <cassert>
21 
23 {
24  struct utf8_impl
25  {
26  static const char* get_name() { return "utf8"; }
27  static std::size_t byte_size_from_ucs4_codepoint(char32_t ch)
28  {
29  if(ch < (1u << 7))
30  return 1;
31  else if(ch < (1u << 11))
32  return 2;
33  else if(ch < (1u << 16))
34  return 3;
35  else if(ch < (1u << 21))
36  return 4;
37  else if(ch < (1u << 26))
38  return 5;
39  else if(ch < (1u << 31))
40  return 6;
41  else
42  throw utf8::invalid_utf8_exception(); // Invalid UCS-4
43  }
44 
45  static int byte_size_from_utf8_first(char ch)
46  {
47  if (!(ch & 0x80)) {
48  return 1; // US-ASCII character, 1 byte
49  }
50  /* first bit set: character not in US-ASCII, multiple bytes
51  * number of set bits at the beginning = bytes per character
52  * e.g. 11110xxx indicates a 4-byte character */
53  int count = count_leading_ones(ch);
54  if (count == 1 || count > 6) { // count > 4 after RFC 3629
55  throw utf8::invalid_utf8_exception(); // Stop on invalid characters
56  }
57  return count;
58  }
59 
60  /**
61  * Writes a UCS-4 character to a UTF-8 stream.
62  *
63  * @param out An object to write char. Required operations:
64  * 1) push(char) to write a single character
65  * 2) can_push(std::size_t n) to check whether there is still
66  * enough space for n characters.
67  * @param ch The UCS-4 character to write to the stream.
68  */
69  template<typename writer>
70  static inline void write(writer out, char32_t ch)
71  {
72  std::size_t count = byte_size_from_ucs4_codepoint(ch);
73  assert(out.can_push(count));
74  if(count == 1) {
75  out.push(static_cast<char>(ch));
76  } else {
77  for(int j = static_cast<int>(count) - 1; j >= 0; --j) {
78  unsigned char c = (ch >> (6 * j)) & 0x3f;
79  c |= 0x80;
80  if(j == static_cast<int>(count) - 1) {
81  c |= 0xff << (8 - count);
82  }
83  out.push(c);
84  }
85  }
86  }
87  /**
88  * Reads a UCS-4 character from a UTF-8 stream
89  *
90  * @param input An iterator pointing to the first character of a UTF-8
91  * sequence to read.
92  * @param end An iterator pointing to the end of the UTF-8 sequence
93  * to read.
94  */
95  template<typename iitor_t>
96  static inline char32_t read(iitor_t& input, const iitor_t& end)
97  {
98  assert(input != end);
99  std::size_t size = byte_size_from_utf8_first(*input);
100 
101  char32_t current_char = static_cast<unsigned char>(*input);
102 
103  // Convert the first character
104  if(size != 1) {
105  current_char &= 0xFF >> (size + 1);
106  }
107 
108  // Convert the continuation bytes
109  // i == number of '++input'
110  ++input;
111  for(std::size_t i = 1; i < size; ++i, ++input) {
112  // If the string ends occurs within an UTF8-sequence, this is bad.
113  if (input == end)
115 
116  if ((*input & 0xC0) != 0x80)
118 
119  current_char = (current_char << 6) | (static_cast<unsigned char>(*input) & 0x3F);
120  }
121  //i == size => input was increased size times.
122 
123  // Check for non-shortest-form encoding
124  // This has been forbidden in Unicode 3.1 for security reasons
125  if (size > byte_size_from_ucs4_codepoint(current_char))
127  return current_char;
128  }
129  };
130 
131  struct utf16_impl
132  {
133  static const char* get_name() { return "utf16"; }
134  template<typename writer>
135  static inline void write(writer out, char32_t ch)
136  {
137  const char32_t bit17 = 0x10000;
138 
139  if(ch < bit17)
140  {
141  assert(out.can_push(1));
142  out.push(static_cast<char16_t>(ch));
143  }
144  else
145  {
146  assert(out.can_push(2));
147  const char32_t char20 = ch - bit17;
148  assert(char20 < (1 << 20));
149  const char32_t lead = 0xD800 + (char20 >> 10);
150  const char32_t trail = 0xDC00 + (char20 & 0x3FF);
151  assert(lead < bit17);
152  assert(trail < bit17);
153  out.push(static_cast<char16_t>(lead));
154  out.push(static_cast<char16_t>(trail));
155  }
156  }
157 
158  template<typename iitor_t>
159  static inline char32_t read(iitor_t& input, const iitor_t& end)
160  {
161  const char32_t last10 = 0x3FF;
162  const char32_t type_filter = 0xFC00;
163  const char32_t type_lead = 0xD800;
164  const char32_t type_trail = 0xDC00;
165 
166  assert(input != end);
167  char32_t current_char = static_cast<char16_t>(*input);
168  ++input;
169  char32_t type = current_char & type_filter;
170  if(type == type_trail)
171  {
172  //found trail without head
174  }
175  else if(type == type_lead)
176  {
177  if(input == end)
178  {
179  //If the string ends occurs within an UTF16-sequence, this is bad.
181  }
182  if((*input & type_filter) != type_trail)
183  {
185  }
186  current_char &= last10;
187  current_char <<= 10;
188  current_char += (*input & last10);
189  current_char += 0x10000;
190  ++input;
191  }
192  return current_char;
193  }
194  };
195 
/**
 * UCS-4 pass-through codec: every code point is exactly one element, so
 * reading and writing copy the value unchanged.
 */
struct utf32_impl
{
    /** Returns the name of this encoding. */
    static const char* get_name() { return "UCS4"; }

    /**
     * Writes a UCS-4 character to a UCS-4 stream.
     *
     * @param out An object to write to. Required operations:
     *            1) push(char32_t) to write a single character
     *            2) can_push(std::size_t n) to check whether there is still
     *               enough space for n characters.
     * @param ch  The UCS-4 character to write to the stream.
     */
    template<typename writer>
    static inline void write(writer out, char32_t ch)
    {
        assert(out.can_push(1));
        out.push(ch);
    }

    /**
     * Reads a UCS-4 character from a UCS-4 stream and advances @a input by
     * one element.
     *
     * @param input An iterator pointing to the character to read.
     * @param end   An iterator pointing to the end of the sequence; only
     *              used to assert that @a input is not already at the end.
     * @returns The character @a input pointed to.
     */
    template<typename iitor_t>
    static inline char32_t read(iitor_t& input, const iitor_t& end)
    {
        assert(input != end);
        char32_t current_char = *input;
        ++input;
        return current_char;
    }
};
215 
216  template<typename T_CHAR>
217  struct convert_impl {};
218 
219  template<>
220  struct convert_impl<char>
221  {
222  typedef utf8_impl type;
223  };
224 
225  template<>
226  struct convert_impl<char16_t>
227  {
228  typedef utf16_impl type;
229  };
230 
231  template<>
232  struct convert_impl<wchar_t>
233  {
234  typedef utf16_impl type;
235  };
236 
237  template<>
238  struct convert_impl<char32_t>
239  {
240  typedef utf32_impl type;
241  };
242 }
Thrown by operations encountering invalid UTF-8 data.
std::size_t i
Definition: function.cpp:968
General math utility functions.
constexpr unsigned int count_leading_ones(N n)
Returns the quantity of leading 1 bits in n — i.e., the quantity of bits in n, minus the 1-based bit ...
Definition: math.hpp:179
std::size_t size(const std::string &str)
Length in characters of a UTF-8 string.
Definition: unicode.cpp:85
static char32_t read(iitor_t &input, const iitor_t &end)
static void write(writer out, char32_t ch)
static char32_t read(iitor_t &input, const iitor_t &end)
static void write(writer out, char32_t ch)
static void write(writer out, char32_t ch)
Writes a UCS-4 character to a UTF-8 stream.
static const char * get_name()
static char32_t read(iitor_t &input, const iitor_t &end)
Reads a UCS-4 character from a UTF-8 stream.
static int byte_size_from_utf8_first(char ch)
static std::size_t byte_size_from_ucs4_codepoint(char32_t ch)
mock_char c