Mercurial > malikania
comparison extern/jansson/src/utf.c @ 0:8991989c4708
Initial import
author | David Demelier <markand@malikania.fr> |
---|---|
date | Tue, 22 Mar 2016 18:26:05 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8991989c4708 |
---|---|
1 /* | |
2 * Copyright (c) 2009-2014 Petri Lehtinen <petri@digip.org> | |
3 * | |
4 * Jansson is free software; you can redistribute it and/or modify | |
5 * it under the terms of the MIT license. See LICENSE for details. | |
6 */ | |
7 | |
8 #include <string.h> | |
9 #include "utf.h" | |
10 | |
/*
 * Encode a single Unicode code point as UTF-8.
 *
 * codepoint: the value to encode.
 * buffer:    destination for the encoded bytes; the caller must provide
 *            room for up to 4 bytes. No NUL terminator is written.
 * size:      receives the number of bytes written (1 to 4).
 *
 * Returns 0 on success. Returns -1, leaving buffer and *size untouched,
 * when codepoint is negative, greater than 0x10FFFF, or a UTF-16
 * surrogate half (0xD800..0xDFFF).
 */
int utf8_encode(int32_t codepoint, char *buffer, size_t *size)
{
    if(codepoint < 0 || codepoint > 0x10FFFF)
        return -1;

    /* Surrogate halves are not Unicode scalar values. Encoding them
       would yield a 3-byte sequence that is not well-formed UTF-8 and
       that utf8_check_full() rejects. */
    if(0xD800 <= codepoint && codepoint <= 0xDFFF)
        return -1;

    if(codepoint < 0x80)
    {
        buffer[0] = (char)codepoint;
        *size = 1;
    }
    else if(codepoint < 0x800)
    {
        /* 110xxxxx 10xxxxxx */
        buffer[0] = (char)(0xC0 + ((codepoint & 0x7C0) >> 6));
        buffer[1] = (char)(0x80 + ((codepoint & 0x03F)));
        *size = 2;
    }
    else if(codepoint < 0x10000)
    {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        buffer[0] = (char)(0xE0 + ((codepoint & 0xF000) >> 12));
        buffer[1] = (char)(0x80 + ((codepoint & 0x0FC0) >> 6));
        buffer[2] = (char)(0x80 + ((codepoint & 0x003F)));
        *size = 3;
    }
    else
    {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        buffer[0] = (char)(0xF0 + ((codepoint & 0x1C0000) >> 18));
        buffer[1] = (char)(0x80 + ((codepoint & 0x03F000) >> 12));
        buffer[2] = (char)(0x80 + ((codepoint & 0x000FC0) >> 6));
        buffer[3] = (char)(0x80 + ((codepoint & 0x00003F)));
        *size = 4;
    }

    return 0;
}
46 | |
/*
 * Classify the first byte of a UTF-8 sequence.
 *
 * Returns the total length in bytes (1 to 4) of the sequence that the
 * given byte introduces, or 0 if the byte cannot start a sequence.
 */
size_t utf8_check_first(char byte)
{
    const unsigned char lead = (unsigned char)byte;

    /* The thresholds are tested in ascending order, so each test only
       has to state the upper bound of its range. */

    if(lead <= 0x7F)
        return 1; /* plain ASCII */

    if(lead <= 0xC1) {
        /* 0x80..0xBF: continuation byte, never a first byte.
           0xC0, 0xC1: would start an overlong encoding of ASCII. */
        return 0;
    }

    if(lead <= 0xDF)
        return 2; /* 0xC2..0xDF */

    if(lead <= 0xEF)
        return 3; /* 0xE0..0xEF */

    if(lead <= 0xF4)
        return 4; /* 0xF0..0xF4 */

    /* 0xF5..0xFF: restricted (start of 4-, 5- or 6-byte sequence) or
       invalid UTF-8 */
    return 0;
}
82 | |
/*
 * Validate one complete multi-byte UTF-8 sequence and decode it.
 *
 * buffer:    points at the first (lead) byte of the sequence; at least
 *            `size` bytes must be readable.
 * size:      claimed length of the sequence: 2, 3 or 4.
 * codepoint: if non-NULL, receives the decoded code point on success.
 *            It is left untouched on failure.
 *
 * Returns 1 if the sequence is valid, 0 otherwise. A sequence is
 * rejected when size is not 2, 3 or 4, when the lead byte does not
 * have the bit pattern for a sequence of that size, when a trailing
 * byte is not a continuation byte (0x80..0xBF), or when the decoded
 * value is an overlong encoding, a UTF-16 surrogate half, or greater
 * than 0x10FFFF.
 */
size_t utf8_check_full(const char *buffer, size_t size, int32_t *codepoint)
{
    size_t i;
    int32_t value;
    int32_t min_value; /* smallest code point that needs `size` bytes */
    unsigned char u = (unsigned char)buffer[0];

    /* Check the lead byte's marker bits as well as extracting its
       payload. Masking alone would accept any byte as the start of a
       sequence of the claimed size, e.g. 0xFF 0xBF as a 2-byte
       sequence. */
    if(size == 2 && (u & 0xE0) == 0xC0)
    {
        value = u & 0x1F;
        min_value = 0x80;
    }
    else if(size == 3 && (u & 0xF0) == 0xE0)
    {
        value = u & 0xF;
        min_value = 0x800;
    }
    else if(size == 4 && (u & 0xF8) == 0xF0)
    {
        value = u & 0x7;
        min_value = 0x10000;
    }
    else
        return 0;

    for(i = 1; i < size; i++)
    {
        u = (unsigned char)buffer[i];

        if(u < 0x80 || u > 0xBF) {
            /* not a continuation byte */
            return 0;
        }

        value = (value << 6) + (u & 0x3F);
    }

    if(value > 0x10FFFF) {
        /* not in Unicode range */
        return 0;
    }

    if(0xD800 <= value && value <= 0xDFFF) {
        /* invalid code point (UTF-16 surrogate halves) */
        return 0;
    }

    if(value < min_value) {
        /* overlong encoding */
        return 0;
    }

    if(codepoint)
        *codepoint = value;

    return 1;
}
138 | |
/*
 * Decode the UTF-8 sequence at the start of buffer and step past it.
 *
 * buffer:    start of the data to read.
 * bufsize:   number of bytes available at buffer.
 * codepoint: if non-NULL, receives the decoded code point when a
 *            sequence was consumed.
 *
 * Returns a pointer to the byte following the decoded sequence. When
 * bufsize is 0, buffer is returned unchanged and *codepoint is not
 * written. Returns NULL if the first byte cannot start a sequence, if
 * the sequence would extend past bufsize, or if utf8_check_full()
 * rejects it.
 */
const char *utf8_iterate(const char *buffer, size_t bufsize, int32_t *codepoint)
{
    size_t seq_len;
    int32_t decoded;

    if(bufsize == 0)
        return buffer;

    seq_len = utf8_check_first(buffer[0]);
    if(seq_len == 0)
        return NULL;

    if(seq_len > 1)
    {
        /* multi-byte: must fit in the buffer before it is inspected */
        if(seq_len > bufsize)
            return NULL;

        if(!utf8_check_full(buffer, seq_len, &decoded))
            return NULL;
    }
    else
    {
        /* single byte: the code point is the byte itself */
        decoded = (unsigned char)buffer[0];
    }

    if(codepoint)
        *codepoint = decoded;

    return buffer + seq_len;
}
164 | |
/*
 * Check that a byte string consists solely of valid UTF-8 sequences.
 *
 * string: the bytes to validate; not required to be NUL-terminated.
 * length: number of bytes to examine.
 *
 * Returns 1 if every sequence in the first `length` bytes is valid
 * (an empty range is valid), 0 otherwise.
 */
int utf8_check_string(const char *string, size_t length)
{
    size_t pos = 0;

    while(pos < length)
    {
        const size_t seq_len = utf8_check_first(string[pos]);

        if(seq_len == 0)
            return 0;

        if(seq_len > 1)
        {
            /* sequence is cut off by the end of the string */
            if(seq_len > length - pos)
                return 0;

            if(!utf8_check_full(string + pos, seq_len, NULL))
                return 0;
        }

        /* step over the whole sequence (one byte for ASCII) */
        pos += seq_len;
    }

    return 1;
}