The Battle for Wesnoth  1.15.0-dev
lutf8lib.cpp
Go to the documentation of this file.
1 /*
2 ** $Id: lutf8lib.c,v 1.16 2016/12/22 13:08:50 roberto Exp $
3 ** Standard library for UTF-8 manipulation
4 ** See Copyright Notice in lua.h
5 */
6 
7 #define lutf8lib_c
8 #define LUA_LIB
9 
10 #include "lprefix.h"
11 
12 
13 #include <assert.h>
14 #include <limits.h>
15 #include <stdlib.h>
16 #include <string.h>
17 
18 #include "lua.h"
19 
20 #include "lauxlib.h"
21 #include "lualib.h"
22 
23 #define MAXUNICODE 0x10FFFF
24 
25 #define iscont(p) ((*(p) & 0xC0) == 0x80)
26 
27 
28 /* from strlib */
29 /* translate a relative string position: negative means back from end */
30 static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
31  if (pos >= 0) return pos;
32  else if (0u - (size_t)pos > len) return 0;
33  else return (lua_Integer)len + pos + 1;
34 }
35 
36 
37 /*
38 ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
39 */
40 static const char *utf8_decode (const char *o, int *val) {
41  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
42  const unsigned char *s = (const unsigned char *)o;
43  unsigned int c = s[0];
44  unsigned int res = 0; /* final result */
45  if (c < 0x80) /* ascii? */
46  res = c;
47  else {
48  int count = 0; /* to count number of continuation bytes */
49  while (c & 0x40) { /* still have continuation bytes? */
50  int cc = s[++count]; /* read next byte */
51  if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
52  return NULL; /* invalid byte sequence */
53  res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
54  c <<= 1; /* to test next bit */
55  }
56  res |= ((c & 0x7F) << (count * 5)); /* add first byte */
57  if (count > 3 || res > MAXUNICODE || res <= limits[count])
58  return NULL; /* invalid byte sequence */
59  s += count; /* skip continuation bytes read */
60  }
61  if (val) *val = res;
62  return (const char *)s + 1; /* +1 to include first byte */
63 }
64 
65 
66 /*
67 ** utf8len(s [, i [, j]]) --> number of characters that start in the
68 ** range [i,j], or nil + current position if 's' is not well formed in
69 ** that interval
70 */
71 static int utflen (lua_State *L) {
72  int n = 0;
73  size_t len;
74  const char *s = luaL_checklstring(L, 1, &len);
75  lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
76  lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
77  luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
78  "initial position out of string");
79  luaL_argcheck(L, --posj < (lua_Integer)len, 3,
80  "final position out of string");
81  while (posi <= posj) {
82  const char *s1 = utf8_decode(s + posi, NULL);
83  if (s1 == NULL) { /* conversion error? */
84  lua_pushnil(L); /* return nil ... */
85  lua_pushinteger(L, posi + 1); /* ... and current position */
86  return 2;
87  }
88  posi = s1 - s;
89  n++;
90  }
91  lua_pushinteger(L, n);
92  return 1;
93 }
94 
95 
96 /*
97 ** codepoint(s, [i, [j]]) -> returns codepoints for all characters
98 ** that start in the range [i,j]
99 */
100 static int codepoint (lua_State *L) {
101  size_t len;
102  const char *s = luaL_checklstring(L, 1, &len);
103  lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
104  lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
105  int n;
106  const char *se;
107  luaL_argcheck(L, posi >= 1, 2, "out of range");
108  luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
109  if (posi > pose) return 0; /* empty interval; return no values */
110  if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */
111  return luaL_error(L, "string slice too long");
112  n = (int)(pose - posi) + 1;
113  luaL_checkstack(L, n, "string slice too long");
114  n = 0;
115  se = s + pose;
116  for (s += posi - 1; s < se;) {
117  int code;
118  s = utf8_decode(s, &code);
119  if (s == NULL)
120  return luaL_error(L, "invalid UTF-8 code");
121  lua_pushinteger(L, code);
122  n++;
123  }
124  return n;
125 }
126 
127 
128 static void pushutfchar (lua_State *L, int arg) {
129  lua_Integer code = luaL_checkinteger(L, arg);
130  luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
131  lua_pushfstring(L, "%U", (long)code);
132 }
133 
134 
135 /*
136 ** utfchar(n1, n2, ...) -> char(n1)..char(n2)...
137 */
138 static int utfchar (lua_State *L) {
139  int n = lua_gettop(L); /* number of arguments */
140  if (n == 1) /* optimize common case of single char */
141  pushutfchar(L, 1);
142  else {
143  int i;
144  luaL_Buffer b;
145  luaL_buffinit(L, &b);
146  for (i = 1; i <= n; i++) {
147  pushutfchar(L, i);
148  luaL_addvalue(&b);
149  }
150  luaL_pushresult(&b);
151  }
152  return 1;
153 }
154 
155 
156 /*
157 ** offset(s, n, [i]) -> index where n-th character counting from
158 ** position 'i' starts; 0 means character at 'i'.
159 */
160 static int byteoffset (lua_State *L) {
161  size_t len;
162  const char *s = luaL_checklstring(L, 1, &len);
164  lua_Integer posi = (n >= 0) ? 1 : len + 1;
165  posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
166  luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
167  "position out of range");
168  if (n == 0) {
169  /* find beginning of current byte sequence */
170  while (posi > 0 && iscont(s + posi)) posi--;
171  }
172  else {
173  if (iscont(s + posi))
174  luaL_error(L, "initial position is a continuation byte");
175  if (n < 0) {
176  while (n < 0 && posi > 0) { /* move back */
177  do { /* find beginning of previous character */
178  posi--;
179  } while (posi > 0 && iscont(s + posi));
180  n++;
181  }
182  }
183  else {
184  n--; /* do not move for 1st character */
185  while (n > 0 && posi < (lua_Integer)len) {
186  do { /* find beginning of next character */
187  posi++;
188  } while (iscont(s + posi)); /* (cannot pass final '\0') */
189  n--;
190  }
191  }
192  }
193  if (n == 0) /* did it find given character? */
194  lua_pushinteger(L, posi + 1);
195  else /* no such character */
196  lua_pushnil(L);
197  return 1;
198 }
199 
200 
201 static int iter_aux (lua_State *L) {
202  size_t len;
203  const char *s = luaL_checklstring(L, 1, &len);
204  lua_Integer n = lua_tointeger(L, 2) - 1;
205  if (n < 0) /* first iteration? */
206  n = 0; /* start from here */
207  else if (n < (lua_Integer)len) {
208  n++; /* skip current byte */
209  while (iscont(s + n)) n++; /* and its continuations */
210  }
211  if (n >= (lua_Integer)len)
212  return 0; /* no more codepoints */
213  else {
214  int code;
215  const char *next = utf8_decode(s + n, &code);
216  if (next == NULL || iscont(next))
217  return luaL_error(L, "invalid UTF-8 code");
218  lua_pushinteger(L, n + 1);
219  lua_pushinteger(L, code);
220  return 2;
221  }
222 }
223 
224 
225 static int iter_codes (lua_State *L) {
226  luaL_checkstring(L, 1);
228  lua_pushvalue(L, 1);
229  lua_pushinteger(L, 0);
230  return 3;
231 }
232 
233 
234 /* pattern to match a single UTF-8 character */
235 #define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
236 
237 
238 static const luaL_Reg funcs[] = {
239  {"offset", byteoffset},
240  {"codepoint", codepoint},
241  {"char", utfchar},
242  {"len", utflen},
243  {"codes", iter_codes},
244  /* placeholders */
245  {"charpattern", NULL},
246  {NULL, NULL}
247 };
248 
249 
251  luaL_newlib(L, funcs);
252  lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1);
253  lua_setfield(L, -2, "charpattern");
254  return 1;
255 }
256 
LUALIB_API void luaL_addvalue(luaL_Buffer *B)
Definition: lauxlib.cpp:558
#define lua_pushcfunction(L, f)
Definition: lua.h:350
static lua_Integer u_posrelat(lua_Integer pos, size_t len)
Definition: lutf8lib.cpp:30
#define iscont(p)
Definition: lutf8lib.cpp:25
LUALIB_API void luaL_pushresult(luaL_Buffer *B)
Definition: lauxlib.cpp:542
LUA_API int lua_gettop(lua_State *L)
Definition: lapi.cpp:167
#define lua_tointeger(L, i)
Definition: lua.h:342
LUALIB_API lua_Integer luaL_optinteger(lua_State *L, int arg, lua_Integer def)
Definition: lauxlib.cpp:440
LUALIB_API void luaL_checkstack(lua_State *L, int space, const char *msg)
Definition: lauxlib.cpp:368
static int byteoffset(lua_State *L)
Definition: lutf8lib.cpp:160
#define b
static int utfchar(lua_State *L)
Definition: lutf8lib.cpp:138
#define MAXUNICODE
Definition: lutf8lib.cpp:23
LUA_INTEGER lua_Integer
Definition: lua.h:93
LUALIB_API void luaL_buffinit(lua_State *L, luaL_Buffer *B)
Definition: lauxlib.cpp:569
LUA_API const char * lua_pushlstring(lua_State *L, const char *s, size_t len)
Definition: lapi.cpp:479
static int utflen(lua_State *L)
Definition: lutf8lib.cpp:71
static int codepoint(lua_State *L)
Definition: lutf8lib.cpp:100
static int iter_aux(lua_State *L)
Definition: lutf8lib.cpp:201
LUA_API void lua_pushnil(lua_State *L)
Definition: lapi.cpp:450
static map_location::DIRECTION se
#define UTF8PATT
Definition: lutf8lib.cpp:235
static const luaL_Reg funcs[]
Definition: lutf8lib.cpp:238
static void pushutfchar(lua_State *L, int arg)
Definition: lutf8lib.cpp:128
std::size_t i
Definition: function.cpp:933
LUALIB_API lua_Integer luaL_checkinteger(lua_State *L, int arg)
Definition: lauxlib.cpp:430
static const char * utf8_decode(const char *o, int *val)
Definition: lutf8lib.cpp:40
static map_location::DIRECTION s
#define LUAMOD_API
Definition: luaconf.h:261
LUAMOD_API int luaopen_utf8(lua_State *L)
Definition: lutf8lib.cpp:250
LUA_API void lua_pushvalue(lua_State *L, int idx)
Definition: lapi.cpp:237
#define luaL_newlib(L, l)
Definition: lauxlib.h:119
LUALIB_API int luaL_error(lua_State *L, const char *fmt,...)
Definition: lauxlib.cpp:223
#define luaL_argcheck(L, cond, arg, extramsg)
Definition: lauxlib.h:122
#define next(ls)
Definition: llex.cpp:32
LUA_API const char * lua_pushfstring(lua_State *L, const char *fmt,...)
Definition: lapi.cpp:519
static int iter_codes(lua_State *L)
Definition: lutf8lib.cpp:225
mock_char c
static map_location::DIRECTION n
LUA_API void lua_pushinteger(lua_State *L, lua_Integer n)
Definition: lapi.cpp:466
LUALIB_API const char * luaL_checklstring(lua_State *L, int arg, size_t *len)
Definition: lauxlib.cpp:390
LUA_API void lua_setfield(lua_State *L, int idx, const char *k)
Definition: lapi.cpp:776
#define luaL_checkstring(L, n)
Definition: lauxlib.h:124