Mercurial > embed
comparison yaml/src/reader.c @ 1:4d89bd8a3f7f
yaml: import 0.1.5
author | David Demelier <markand@malikania.fr> |
---|---|
date | Wed, 24 Feb 2016 20:50:29 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:0047655db1aa | 1:4d89bd8a3f7f |
---|---|
1 | |
2 #include "yaml_private.h" | |
3 | |
4 /* | |
5 * Declarations. | |
6 */ | |
7 | |
8 static int | |
9 yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, | |
10 size_t offset, int value); | |
11 | |
12 static int | |
13 yaml_parser_update_raw_buffer(yaml_parser_t *parser); | |
14 | |
15 static int | |
16 yaml_parser_determine_encoding(yaml_parser_t *parser); | |
17 | |
18 YAML_DECLARE(int) | |
19 yaml_parser_update_buffer(yaml_parser_t *parser, size_t length); | |
20 | |
21 /* | |
22 * Set the reader error and return 0. | |
23 */ | |
24 | |
25 static int | |
26 yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, | |
27 size_t offset, int value) | |
28 { | |
29 parser->error = YAML_READER_ERROR; | |
30 parser->problem = problem; | |
31 parser->problem_offset = offset; | |
32 parser->problem_value = value; | |
33 | |
34 return 0; | |
35 } | |
36 | |
37 /* | |
38 * Byte order marks. | |
39 */ | |
40 | |
41 #define BOM_UTF8 "\xef\xbb\xbf" | |
42 #define BOM_UTF16LE "\xff\xfe" | |
43 #define BOM_UTF16BE "\xfe\xff" | |
44 | |
45 /* | |
46 * Determine the input stream encoding by checking the BOM symbol. If no BOM is | |
47 * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure. | |
48 */ | |
49 | |
50 static int | |
51 yaml_parser_determine_encoding(yaml_parser_t *parser) | |
52 { | |
53 /* Ensure that we had enough bytes in the raw buffer. */ | |
54 | |
55 while (!parser->eof | |
56 && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) { | |
57 if (!yaml_parser_update_raw_buffer(parser)) { | |
58 return 0; | |
59 } | |
60 } | |
61 | |
62 /* Determine the encoding. */ | |
63 | |
64 if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 | |
65 && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) { | |
66 parser->encoding = YAML_UTF16LE_ENCODING; | |
67 parser->raw_buffer.pointer += 2; | |
68 parser->offset += 2; | |
69 } | |
70 else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 | |
71 && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) { | |
72 parser->encoding = YAML_UTF16BE_ENCODING; | |
73 parser->raw_buffer.pointer += 2; | |
74 parser->offset += 2; | |
75 } | |
76 else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3 | |
77 && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) { | |
78 parser->encoding = YAML_UTF8_ENCODING; | |
79 parser->raw_buffer.pointer += 3; | |
80 parser->offset += 3; | |
81 } | |
82 else { | |
83 parser->encoding = YAML_UTF8_ENCODING; | |
84 } | |
85 | |
86 return 1; | |
87 } | |
88 | |
89 /* | |
90 * Update the raw buffer. | |
91 */ | |
92 | |
93 static int | |
94 yaml_parser_update_raw_buffer(yaml_parser_t *parser) | |
95 { | |
96 size_t size_read = 0; | |
97 | |
98 /* Return if the raw buffer is full. */ | |
99 | |
100 if (parser->raw_buffer.start == parser->raw_buffer.pointer | |
101 && parser->raw_buffer.last == parser->raw_buffer.end) | |
102 return 1; | |
103 | |
104 /* Return on EOF. */ | |
105 | |
106 if (parser->eof) return 1; | |
107 | |
108 /* Move the remaining bytes in the raw buffer to the beginning. */ | |
109 | |
110 if (parser->raw_buffer.start < parser->raw_buffer.pointer | |
111 && parser->raw_buffer.pointer < parser->raw_buffer.last) { | |
112 memmove(parser->raw_buffer.start, parser->raw_buffer.pointer, | |
113 parser->raw_buffer.last - parser->raw_buffer.pointer); | |
114 } | |
115 parser->raw_buffer.last -= | |
116 parser->raw_buffer.pointer - parser->raw_buffer.start; | |
117 parser->raw_buffer.pointer = parser->raw_buffer.start; | |
118 | |
119 /* Call the read handler to fill the buffer. */ | |
120 | |
121 if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last, | |
122 parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) { | |
123 return yaml_parser_set_reader_error(parser, "input error", | |
124 parser->offset, -1); | |
125 } | |
126 parser->raw_buffer.last += size_read; | |
127 if (!size_read) { | |
128 parser->eof = 1; | |
129 } | |
130 | |
131 return 1; | |
132 } | |
133 | |
134 /* | |
135 * Ensure that the buffer contains at least `length` characters. | |
136 * Return 1 on success, 0 on failure. | |
137 * | |
138 * The length is supposed to be significantly less that the buffer size. | |
139 */ | |
140 | |
141 YAML_DECLARE(int) | |
142 yaml_parser_update_buffer(yaml_parser_t *parser, size_t length) | |
143 { | |
144 int first = 1; | |
145 | |
146 assert(parser->read_handler); /* Read handler must be set. */ | |
147 | |
148 /* If the EOF flag is set and the raw buffer is empty, do nothing. */ | |
149 | |
150 if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last) | |
151 return 1; | |
152 | |
153 /* Return if the buffer contains enough characters. */ | |
154 | |
155 if (parser->unread >= length) | |
156 return 1; | |
157 | |
158 /* Determine the input encoding if it is not known yet. */ | |
159 | |
160 if (!parser->encoding) { | |
161 if (!yaml_parser_determine_encoding(parser)) | |
162 return 0; | |
163 } | |
164 | |
165 /* Move the unread characters to the beginning of the buffer. */ | |
166 | |
167 if (parser->buffer.start < parser->buffer.pointer | |
168 && parser->buffer.pointer < parser->buffer.last) { | |
169 size_t size = parser->buffer.last - parser->buffer.pointer; | |
170 memmove(parser->buffer.start, parser->buffer.pointer, size); | |
171 parser->buffer.pointer = parser->buffer.start; | |
172 parser->buffer.last = parser->buffer.start + size; | |
173 } | |
174 else if (parser->buffer.pointer == parser->buffer.last) { | |
175 parser->buffer.pointer = parser->buffer.start; | |
176 parser->buffer.last = parser->buffer.start; | |
177 } | |
178 | |
179 /* Fill the buffer until it has enough characters. */ | |
180 | |
181 while (parser->unread < length) | |
182 { | |
183 /* Fill the raw buffer if necessary. */ | |
184 | |
185 if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) { | |
186 if (!yaml_parser_update_raw_buffer(parser)) return 0; | |
187 } | |
188 first = 0; | |
189 | |
190 /* Decode the raw buffer. */ | |
191 | |
192 while (parser->raw_buffer.pointer != parser->raw_buffer.last) | |
193 { | |
194 unsigned int value = 0, value2 = 0; | |
195 int incomplete = 0; | |
196 unsigned char octet; | |
197 unsigned int width = 0; | |
198 int low, high; | |
199 size_t k; | |
200 size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer; | |
201 | |
202 /* Decode the next character. */ | |
203 | |
204 switch (parser->encoding) | |
205 { | |
206 case YAML_UTF8_ENCODING: | |
207 | |
208 /* | |
209 * Decode a UTF-8 character. Check RFC 3629 | |
210 * (http://www.ietf.org/rfc/rfc3629.txt) for more details. | |
211 * | |
212 * The following table (taken from the RFC) is used for | |
213 * decoding. | |
214 * | |
215 * Char. number range | UTF-8 octet sequence | |
216 * (hexadecimal) | (binary) | |
217 * --------------------+------------------------------------ | |
218 * 0000 0000-0000 007F | 0xxxxxxx | |
219 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx | |
220 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx | |
221 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | |
222 * | |
223 * Additionally, the characters in the range 0xD800-0xDFFF | |
224 * are prohibited as they are reserved for use with UTF-16 | |
225 * surrogate pairs. | |
226 */ | |
227 | |
228 /* Determine the length of the UTF-8 sequence. */ | |
229 | |
230 octet = parser->raw_buffer.pointer[0]; | |
231 width = (octet & 0x80) == 0x00 ? 1 : | |
232 (octet & 0xE0) == 0xC0 ? 2 : | |
233 (octet & 0xF0) == 0xE0 ? 3 : | |
234 (octet & 0xF8) == 0xF0 ? 4 : 0; | |
235 | |
236 /* Check if the leading octet is valid. */ | |
237 | |
238 if (!width) | |
239 return yaml_parser_set_reader_error(parser, | |
240 "invalid leading UTF-8 octet", | |
241 parser->offset, octet); | |
242 | |
243 /* Check if the raw buffer contains an incomplete character. */ | |
244 | |
245 if (width > raw_unread) { | |
246 if (parser->eof) { | |
247 return yaml_parser_set_reader_error(parser, | |
248 "incomplete UTF-8 octet sequence", | |
249 parser->offset, -1); | |
250 } | |
251 incomplete = 1; | |
252 break; | |
253 } | |
254 | |
255 /* Decode the leading octet. */ | |
256 | |
257 value = (octet & 0x80) == 0x00 ? octet & 0x7F : | |
258 (octet & 0xE0) == 0xC0 ? octet & 0x1F : | |
259 (octet & 0xF0) == 0xE0 ? octet & 0x0F : | |
260 (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0; | |
261 | |
262 /* Check and decode the trailing octets. */ | |
263 | |
264 for (k = 1; k < width; k ++) | |
265 { | |
266 octet = parser->raw_buffer.pointer[k]; | |
267 | |
268 /* Check if the octet is valid. */ | |
269 | |
270 if ((octet & 0xC0) != 0x80) | |
271 return yaml_parser_set_reader_error(parser, | |
272 "invalid trailing UTF-8 octet", | |
273 parser->offset+k, octet); | |
274 | |
275 /* Decode the octet. */ | |
276 | |
277 value = (value << 6) + (octet & 0x3F); | |
278 } | |
279 | |
280 /* Check the length of the sequence against the value. */ | |
281 | |
282 if (!((width == 1) || | |
283 (width == 2 && value >= 0x80) || | |
284 (width == 3 && value >= 0x800) || | |
285 (width == 4 && value >= 0x10000))) | |
286 return yaml_parser_set_reader_error(parser, | |
287 "invalid length of a UTF-8 sequence", | |
288 parser->offset, -1); | |
289 | |
290 /* Check the range of the value. */ | |
291 | |
292 if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) | |
293 return yaml_parser_set_reader_error(parser, | |
294 "invalid Unicode character", | |
295 parser->offset, value); | |
296 | |
297 break; | |
298 | |
299 case YAML_UTF16LE_ENCODING: | |
300 case YAML_UTF16BE_ENCODING: | |
301 | |
302 low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1); | |
303 high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0); | |
304 | |
305 /* | |
306 * The UTF-16 encoding is not as simple as one might | |
307 * naively think. Check RFC 2781 | |
308 * (http://www.ietf.org/rfc/rfc2781.txt). | |
309 * | |
310 * Normally, two subsequent bytes describe a Unicode | |
311 * character. However a special technique (called a | |
312 * surrogate pair) is used for specifying character | |
313 * values larger than 0xFFFF. | |
314 * | |
315 * A surrogate pair consists of two pseudo-characters: | |
316 * high surrogate area (0xD800-0xDBFF) | |
317 * low surrogate area (0xDC00-0xDFFF) | |
318 * | |
319 * The following formulas are used for decoding | |
320 * and encoding characters using surrogate pairs: | |
321 * | |
322 * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF) | |
323 * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF) | |
324 * W1 = 110110yyyyyyyyyy | |
325 * W2 = 110111xxxxxxxxxx | |
326 * | |
327 * where U is the character value, W1 is the high surrogate | |
328 * area, W2 is the low surrogate area. | |
329 */ | |
330 | |
331 /* Check for incomplete UTF-16 character. */ | |
332 | |
333 if (raw_unread < 2) { | |
334 if (parser->eof) { | |
335 return yaml_parser_set_reader_error(parser, | |
336 "incomplete UTF-16 character", | |
337 parser->offset, -1); | |
338 } | |
339 incomplete = 1; | |
340 break; | |
341 } | |
342 | |
343 /* Get the character. */ | |
344 | |
345 value = parser->raw_buffer.pointer[low] | |
346 + (parser->raw_buffer.pointer[high] << 8); | |
347 | |
348 /* Check for unexpected low surrogate area. */ | |
349 | |
350 if ((value & 0xFC00) == 0xDC00) | |
351 return yaml_parser_set_reader_error(parser, | |
352 "unexpected low surrogate area", | |
353 parser->offset, value); | |
354 | |
355 /* Check for a high surrogate area. */ | |
356 | |
357 if ((value & 0xFC00) == 0xD800) { | |
358 | |
359 width = 4; | |
360 | |
361 /* Check for incomplete surrogate pair. */ | |
362 | |
363 if (raw_unread < 4) { | |
364 if (parser->eof) { | |
365 return yaml_parser_set_reader_error(parser, | |
366 "incomplete UTF-16 surrogate pair", | |
367 parser->offset, -1); | |
368 } | |
369 incomplete = 1; | |
370 break; | |
371 } | |
372 | |
373 /* Get the next character. */ | |
374 | |
375 value2 = parser->raw_buffer.pointer[low+2] | |
376 + (parser->raw_buffer.pointer[high+2] << 8); | |
377 | |
378 /* Check for a low surrogate area. */ | |
379 | |
380 if ((value2 & 0xFC00) != 0xDC00) | |
381 return yaml_parser_set_reader_error(parser, | |
382 "expected low surrogate area", | |
383 parser->offset+2, value2); | |
384 | |
385 /* Generate the value of the surrogate pair. */ | |
386 | |
387 value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF); | |
388 } | |
389 | |
390 else { | |
391 width = 2; | |
392 } | |
393 | |
394 break; | |
395 | |
396 default: | |
397 assert(1); /* Impossible. */ | |
398 } | |
399 | |
400 /* Check if the raw buffer contains enough bytes to form a character. */ | |
401 | |
402 if (incomplete) break; | |
403 | |
404 /* | |
405 * Check if the character is in the allowed range: | |
406 * #x9 | #xA | #xD | [#x20-#x7E] (8 bit) | |
407 * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit) | |
408 * | [#x10000-#x10FFFF] (32 bit) | |
409 */ | |
410 | |
411 if (! (value == 0x09 || value == 0x0A || value == 0x0D | |
412 || (value >= 0x20 && value <= 0x7E) | |
413 || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF) | |
414 || (value >= 0xE000 && value <= 0xFFFD) | |
415 || (value >= 0x10000 && value <= 0x10FFFF))) | |
416 return yaml_parser_set_reader_error(parser, | |
417 "control characters are not allowed", | |
418 parser->offset, value); | |
419 | |
420 /* Move the raw pointers. */ | |
421 | |
422 parser->raw_buffer.pointer += width; | |
423 parser->offset += width; | |
424 | |
425 /* Finally put the character into the buffer. */ | |
426 | |
427 /* 0000 0000-0000 007F -> 0xxxxxxx */ | |
428 if (value <= 0x7F) { | |
429 *(parser->buffer.last++) = value; | |
430 } | |
431 /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ | |
432 else if (value <= 0x7FF) { | |
433 *(parser->buffer.last++) = 0xC0 + (value >> 6); | |
434 *(parser->buffer.last++) = 0x80 + (value & 0x3F); | |
435 } | |
436 /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ | |
437 else if (value <= 0xFFFF) { | |
438 *(parser->buffer.last++) = 0xE0 + (value >> 12); | |
439 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); | |
440 *(parser->buffer.last++) = 0x80 + (value & 0x3F); | |
441 } | |
442 /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ | |
443 else { | |
444 *(parser->buffer.last++) = 0xF0 + (value >> 18); | |
445 *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F); | |
446 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); | |
447 *(parser->buffer.last++) = 0x80 + (value & 0x3F); | |
448 } | |
449 | |
450 parser->unread ++; | |
451 } | |
452 | |
453 /* On EOF, put NUL into the buffer and return. */ | |
454 | |
455 if (parser->eof) { | |
456 *(parser->buffer.last++) = '\0'; | |
457 parser->unread ++; | |
458 return 1; | |
459 } | |
460 | |
461 } | |
462 | |
463 if (parser->offset >= PTRDIFF_MAX) | |
464 return yaml_parser_set_reader_error(parser, "input is too long", | |
465 PTRDIFF_MAX, -1); | |
466 | |
467 return 1; | |
468 } | |
469 |