Mercurial > code
comparison tools/mkunicode/src/mkunicode.c @ 352:7fe8d4094983
Utf8:
- Fix invalid decoding from UTF-8 to UTF-32
- Add all files
author | David Demelier <markand@malikania.fr> |
---|---|
date | Wed, 08 Apr 2015 12:33:45 +0200 |
parents | |
children | b78d6d8f2872 |
comparison
equal
deleted
inserted
replaced
351:47a206e724f2 | 352:7fe8d4094983 |
---|---|
1 /* | |
2 * Tool to create our Unicode.cpp and Unicode.h file. | |
3 * | |
4 * Current version: 7.0.0 | |
5 * | |
6 * Based on mkrunetype from the Go language. | |
7 * | |
8 * Adapted to generate C++ code. | |
9 */ | |
10 | |
11 // Copyright 2009 The Go Authors. All rights reserved. | |
12 // Use of this source code is governed by a BSD-style | |
13 // license that can be found in the LICENSE file. | |
14 | |
15 /* | |
16 * make is(upper|lower|title|space|alpha)rune and | |
17 * to(upper|lower|title)rune from a UnicodeData.txt file. | |
18 * these can be found at unicode.org | |
19 * | |
20 * with -c, runs a check of the existing runetype functions vs. | |
21 * those extracted from UnicodeData. | |
22 * | |
23 * with -p, generates tables for pairs of chars, as well as for ranges | |
24 * and singletons. | |
25 * | |
26 * UnicodeData defines 4 fields of interest: | |
27 * 1) a category | |
28 * 2) an upper case mapping | |
29 * 3) a lower case mapping | |
30 * 4) a title case mapping | |
31 * | |
32 * toupper, tolower, and totitle are defined directly from the mapping. | |
33 * | |
34 * isalpharune(c) is true iff c is a "letter" category | |
35 * isupperrune(c) is true iff c is the target of toupperrune, | |
36 * or is in the uppercase letter category | |
37 * similarly for islowerrune and istitlerune. | |
38 * isspacerune is true for space category chars, "C" locale white space chars, | |
39 * and two additions: | |
40 * 0085 "next line" control char | |
41 * feff "zero-width non-break space" | |
42 * isdigitrune is true iff c is a numeric-digit category. | |
43 */ | |
44 | |
45 #include <stdarg.h> | |
46 #include <stdio.h> | |
47 #include <stdlib.h> | |
48 #include <string.h> | |
49 | |
50 #include "utf.h" | |
51 #include "utfdef.h" | |
52 | |
/* Element count of a true array (not of a pointer). */
#define nelem(x) (sizeof(x)/sizeof((x)[0]))

/*
 * Column indices of the semicolon-separated fields found on each line of
 * the Unicode data file.  NFIELDS is the number of columns.
 */
enum {
	FIELD_CODE,
	FIELD_NAME,
	FIELD_CATEGORY,
	FIELD_COMBINING,
	FIELD_BIDIR,
	FIELD_DECOMP,
	FIELD_DECIMAL_DIG,
	FIELD_DIG,
	FIELD_NUMERIC_VAL,
	FIELD_MIRRORED,
	FIELD_UNICODE_1_NAME,
	FIELD_COMMENT,
	FIELD_UPPER,
	FIELD_LOWER,
	FIELD_TITLE,
	NFIELDS
};

/*
 * Sizing constants.
 */
enum {
	/* size of the line buffers handed to fgets */
	MAX_LINE = 1024,

	/* bias added to every mapping delta; the generated code subtracts it */
	TO_OFFSET = 1 << 20,

	/* number of entries in each per-code-point table below */
	NRUNES = 1 << 21
};

/* Biased difference between a mapped code point and its source code point. */
#define TO_DELTA(xmapped,x) (TO_OFFSET + (xmapped) - (x))

/*
 * Property flags, indexed by code point (non-zero means "has the property").
 */
static char myisspace[NRUNES];
static char myisalpha[NRUNES];
static char myisdigit[NRUNES];
static char myisupper[NRUNES];
static char myislower[NRUNES];
static char myistitle[NRUNES];

/*
 * Case mappings, indexed by code point.  main() initialises every entry
 * to the identity mapping before reading the data file.
 */
static int mytoupper[NRUNES];
static int mytolower[NRUNES];
static int mytotitle[NRUNES];

/*
 * Forward declarations.
 * NOTE(review): no definition of check() is visible in this file - confirm
 * whether the prototype is still needed.
 */
static void check(void);
static void mktables(char *src, int usepairs);
static void fatal(const char *fmt, ...);
static int mygetfields(char **fields, int nfields, char *str, const char *delim);
static int getunicodeline(FILE *in, char **fields, char *buf);
static int getcode(char *s);
102 | |
/*
 * Print the command-line synopsis on stderr and exit with status 1.
 *
 * main() accepts exactly one argument (the data file) and parses no option
 * flags, so the synopsis lists only that argument, under the same program
 * name that fatal() uses in its messages.
 */
static void
usage(void)
{
	fprintf(stderr, "usage: mkunicode <UnicodeData.txt>\n");
	exit(1);
}
109 | |
110 int | |
111 main(int argc, char *argv[]) | |
112 { | |
113 FILE *in; | |
114 char buf[MAX_LINE], buf2[MAX_LINE]; | |
115 char *fields[NFIELDS + 1], *fields2[NFIELDS + 1]; | |
116 char *p; | |
117 int i, code, last, usepairs; | |
118 | |
119 usepairs = 0; | |
120 | |
121 --argc; | |
122 ++argv; | |
123 | |
124 if (argc != 1){ | |
125 usage(); | |
126 } | |
127 | |
128 in = fopen(argv[0], "r"); | |
129 if (in == NULL){ | |
130 fatal("can't open %s", argv[0]); | |
131 } | |
132 | |
133 for(i = 0; i < NRUNES; i++){ | |
134 mytoupper[i] = i; | |
135 mytolower[i] = i; | |
136 mytotitle[i] = i; | |
137 } | |
138 | |
139 /* | |
140 * make sure isspace has all of the "C" locale whitespace chars | |
141 */ | |
142 myisspace['\t'] = 1; | |
143 myisspace['\n'] = 1; | |
144 myisspace['\r'] = 1; | |
145 myisspace['\f'] = 1; | |
146 myisspace['\v'] = 1; | |
147 | |
148 /* | |
149 * a couple of other exceptions | |
150 */ | |
151 myisspace[0x85] = 1; /* control char, "next line" */ | |
152 myisspace[0xfeff] = 1; /* zero-width non-break space */ | |
153 | |
154 last = -1; | |
155 while(getunicodeline(in, fields, buf)){ | |
156 code = getcode(fields[FIELD_CODE]); | |
157 if (code >= NRUNES) | |
158 fatal("code-point value too big: %x", code); | |
159 if(code <= last) | |
160 fatal("bad code sequence: %x then %x", last, code); | |
161 last = code; | |
162 | |
163 /* | |
164 * check for ranges | |
165 */ | |
166 p = fields[FIELD_CATEGORY]; | |
167 if(strstr(fields[FIELD_NAME], ", First>") != NULL){ | |
168 if(!getunicodeline(in, fields2, buf2)) | |
169 fatal("range start at eof"); | |
170 if (strstr(fields2[FIELD_NAME], ", Last>") == NULL) | |
171 fatal("range start not followed by range end"); | |
172 last = getcode(fields2[FIELD_CODE]); | |
173 if(last <= code) | |
174 fatal("range out of sequence: %x then %x", code, last); | |
175 if(strcmp(p, fields2[FIELD_CATEGORY]) != 0) | |
176 fatal("range with mismatched category"); | |
177 } | |
178 | |
179 /* | |
180 * set properties and conversions | |
181 */ | |
182 for (; code <= last; code++){ | |
183 if(p[0] == 'L') | |
184 myisalpha[code] = 1; | |
185 if(p[0] == 'Z') | |
186 myisspace[code] = 1; | |
187 | |
188 if(strcmp(p, "Lu") == 0) | |
189 myisupper[code] = 1; | |
190 if(strcmp(p, "Ll") == 0) | |
191 myislower[code] = 1; | |
192 | |
193 if(strcmp(p, "Lt") == 0) | |
194 myistitle[code] = 1; | |
195 | |
196 if(strcmp(p, "Nd") == 0) | |
197 myisdigit[code] = 1; | |
198 | |
199 /* | |
200 * when finding conversions, also need to mark | |
201 * upper/lower case, since some chars, like | |
202 * "III" (0x2162), aren't defined as letters but have a | |
203 * lower case mapping ("iii" (0x2172)). | |
204 */ | |
205 if(fields[FIELD_UPPER][0] != '\0'){ | |
206 mytoupper[code] = getcode(fields[FIELD_UPPER]); | |
207 } | |
208 if(fields[FIELD_LOWER][0] != '\0'){ | |
209 mytolower[code] = getcode(fields[FIELD_LOWER]); | |
210 } | |
211 if(fields[FIELD_TITLE][0] != '\0'){ | |
212 mytotitle[code] = getcode(fields[FIELD_TITLE]); | |
213 } | |
214 } | |
215 } | |
216 | |
217 fclose(in); | |
218 | |
219 /* | |
220 * check for codes with no totitle mapping but a toupper mapping. | |
221 * these appear in UnicodeData-2.0.14.txt, but are almost certainly | |
222 * erroneous. | |
223 */ | |
224 for(i = 0; i < NRUNES; i++){ | |
225 if(mytotitle[i] == i | |
226 && mytoupper[i] != i | |
227 && !myistitle[i]) | |
228 fprintf(stderr, "warning: code=%.4x not istitle, totitle is same, toupper=%.4x\n", i, mytoupper[i]); | |
229 } | |
230 | |
231 /* | |
232 * make sure isupper[c] is true if for some x toupper[x] == c | |
233 * ditto for islower and istitle | |
234 */ | |
235 for(i = 0; i < NRUNES; i++) { | |
236 if(mytoupper[i] != i) | |
237 myisupper[mytoupper[i]] = 1; | |
238 if(mytolower[i] != i) | |
239 myislower[mytolower[i]] = 1; | |
240 if(mytotitle[i] != i) | |
241 myistitle[mytotitle[i]] = 1; | |
242 } | |
243 | |
244 mktables(argv[0], usepairs); | |
245 exit(0); | |
246 } | |
247 | |
248 /* | |
249 * generate a properties array for ranges, clearing those cases covered. | |
250 * if force, generate one-entry ranges for singletons. | |
251 */ | |
252 static int | |
253 mkisrange(const char* label, char* prop, int force) | |
254 { | |
255 int start, stop, some; | |
256 | |
257 /* | |
258 * first, the ranges | |
259 */ | |
260 some = 0; | |
261 for(start = 0; start < NRUNES; ) { | |
262 if(!prop[start]){ | |
263 start++; | |
264 continue; | |
265 } | |
266 | |
267 for(stop = start + 1; stop < NRUNES; stop++){ | |
268 if(!prop[stop]){ | |
269 break; | |
270 } | |
271 prop[stop] = 0; | |
272 } | |
273 if(force || stop != start + 1){ | |
274 if(!some){ | |
275 printf("static char32_t is%sr[] = {\n", label); | |
276 some = 1; | |
277 } | |
278 prop[start] = 0; | |
279 printf("\t0x%.4x, 0x%.4x,\n", start, stop - 1); | |
280 } | |
281 | |
282 start = stop; | |
283 } | |
284 if(some) | |
285 printf("};\n\n"); | |
286 return some; | |
287 } | |
288 | |
289 /* | |
290 * generate a mapping array for pairs with a skip between, | |
291 * clearing those entries covered. | |
292 */ | |
293 static int | |
294 mkispair(const char *label, char *prop) | |
295 { | |
296 int start, stop, some; | |
297 | |
298 some = 0; | |
299 for(start = 0; start + 2 < NRUNES; ) { | |
300 if(!prop[start]){ | |
301 start++; | |
302 continue; | |
303 } | |
304 | |
305 for(stop = start + 2; stop < NRUNES; stop += 2){ | |
306 if(!prop[stop]){ | |
307 break; | |
308 } | |
309 prop[stop] = 0; | |
310 } | |
311 if(stop != start + 2){ | |
312 if(!some){ | |
313 printf("static char32_t is%sp[] = {\n", label); | |
314 some = 1; | |
315 } | |
316 prop[start] = 0; | |
317 printf("\t0x%.4x, 0x%.4x,\n", start, stop - 2); | |
318 } | |
319 | |
320 start = stop; | |
321 } | |
322 if(some) | |
323 printf("};\n\n"); | |
324 return some; | |
325 } | |
326 | |
327 /* | |
328 * generate a properties array for singletons, clearing those cases covered. | |
329 */ | |
330 static int | |
331 mkissingle(const char *label, char *prop) | |
332 { | |
333 int start, some; | |
334 | |
335 some = 0; | |
336 for(start = 0; start < NRUNES; start++) { | |
337 if(!prop[start]){ | |
338 continue; | |
339 } | |
340 | |
341 if(!some){ | |
342 printf("static char32_t is%ss[] = {\n", label); | |
343 some = 1; | |
344 } | |
345 prop[start] = 0; | |
346 printf("\t0x%.4x,\n", start); | |
347 } | |
348 if(some) | |
349 printf("};\n\n"); | |
350 return some; | |
351 } | |
352 | |
/*
 * Print the tables and the Unicode::is<label> function for one property.
 *
 * The range table is built first, then (only when usepairs is non-zero)
 * the pair table, then the singleton table with whatever is left.  The
 * generated function contains one lookup per table that was printed.
 */
static void
mkis(const char* label, char* prop, int usepairs)
{
	int haverange, havepair, havesingle;

	/* the order matters: each pass clears the entries it has covered */
	haverange = mkisrange(label, prop, 0);
	havepair = usepairs ? mkispair(label, prop) : 0;
	havesingle = mkissingle(label, prop);

	printf(
		"bool Unicode::is%s(char32_t c) noexcept\n"
		"{\n"
		" char32_t *p;\n"
		"\n",
		label);

	if(haverange){
		printf(
			" p = rbsearch(c, is%sr, nelem (is%sr)/2, 2);\n"
			" if (p && c >= p[0] && c <= p[1])\n"
			" return true;\n",
			label, label);
	}

	if(havepair){
		printf(
			"\n p = rbsearch(c, is%sp, nelem (is%sp)/2, 2);\n"
			" if (p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
			" return true;\n",
			label, label);
	}

	if(havesingle){
		printf(
			"\n p = rbsearch(c, is%ss, nelem (is%ss), 1);\n"
			" if (p && c == p[0])\n"
			" return true;\n",
			label, label);
	}

	fputs(
		"\n return false;\n"
		"}\n"
		"\n",
		stdout);
}
402 | |
403 /* | |
404 * generate a mapping array for ranges, clearing those entries covered. | |
405 * if force, generate one-entry ranges for singletons. | |
406 */ | |
407 static int | |
408 mktorange(const char* label, int* map, int force) | |
409 { | |
410 int start, stop, delta, some; | |
411 | |
412 some = 0; | |
413 for(start = 0; start < NRUNES; ) { | |
414 if(map[start] == start){ | |
415 start++; | |
416 continue; | |
417 } | |
418 | |
419 delta = TO_DELTA(map[start], start); | |
420 if(delta != (Rune)delta) | |
421 fatal("bad map delta %d", delta); | |
422 for(stop = start + 1; stop < NRUNES; stop++){ | |
423 if(TO_DELTA(map[stop], stop) != delta){ | |
424 break; | |
425 } | |
426 map[stop] = stop; | |
427 } | |
428 if(stop != start + 1){ | |
429 if(!some){ | |
430 printf("char32_t to%sr[] = {\n", label); | |
431 some = 1; | |
432 } | |
433 map[start] = start; | |
434 printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 1, delta); | |
435 } | |
436 | |
437 start = stop; | |
438 } | |
439 if(some) | |
440 printf("};\n\n"); | |
441 return some; | |
442 } | |
443 | |
444 /* | |
445 * generate a mapping array for pairs with a skip between, | |
446 * clearing those entries covered. | |
447 */ | |
448 static int | |
449 mktopair(const char* label, int* map) | |
450 { | |
451 int start, stop, delta, some; | |
452 | |
453 some = 0; | |
454 for(start = 0; start + 2 < NRUNES; ) { | |
455 if(map[start] == start){ | |
456 start++; | |
457 continue; | |
458 } | |
459 | |
460 delta = TO_DELTA(map[start], start); | |
461 if(delta != (Rune)delta) | |
462 fatal("bad map delta %d", delta); | |
463 for(stop = start + 2; stop < NRUNES; stop += 2){ | |
464 if(TO_DELTA(map[stop], stop) != delta){ | |
465 break; | |
466 } | |
467 map[stop] = stop; | |
468 } | |
469 if(stop != start + 2){ | |
470 if(!some){ | |
471 printf("static char32_t to%sp[] = {\n", label); | |
472 some = 1; | |
473 } | |
474 map[start] = start; | |
475 printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 2, delta); | |
476 } | |
477 | |
478 start = stop; | |
479 } | |
480 if(some) | |
481 printf("};\n\n"); | |
482 return some; | |
483 } | |
484 | |
485 /* | |
486 * generate a mapping array for singletons, clearing those entries covered. | |
487 */ | |
488 static int | |
489 mktosingle(const char* label, int* map) | |
490 { | |
491 int start, delta, some; | |
492 | |
493 some = 0; | |
494 for(start = 0; start < NRUNES; start++) { | |
495 if(map[start] == start){ | |
496 continue; | |
497 } | |
498 | |
499 delta = TO_DELTA(map[start], start); | |
500 if(delta != (Rune)delta) | |
501 fatal("bad map delta %d", delta); | |
502 if(!some){ | |
503 printf("static char32_t to%ss[] = {\n", label); | |
504 some = 1; | |
505 } | |
506 map[start] = start; | |
507 printf("\t0x%.4x, %d,\n", start, delta); | |
508 } | |
509 if(some) | |
510 printf("};\n\n"); | |
511 return some; | |
512 } | |
513 | |
514 /* | |
515 * generate tables and a function for to<label>rune | |
516 */ | |
517 static void | |
518 mkto(const char* label, int* map, int usepairs) | |
519 { | |
520 int tor, top, tos; | |
521 | |
522 tor = mktorange(label, map, 0); | |
523 top = 0; | |
524 if(usepairs) | |
525 top = mktopair(label, map); | |
526 tos = mktosingle(label, map); | |
527 | |
528 printf( | |
529 "char32_t Unicode::to%s(char32_t c) noexcept\n" | |
530 "{\n" | |
531 " char32_t *p;\n" | |
532 "\n", | |
533 label); | |
534 | |
535 if(tor) | |
536 printf( | |
537 " p = rbsearch(c, to%sr, nelem (to%sr)/3, 3);\n" | |
538 " if (p && c >= p[0] && c <= p[1])\n" | |
539 " return c + p[2] - %d;\n", | |
540 label, label, TO_OFFSET); | |
541 | |
542 if(top) | |
543 printf( | |
544 "\n p = rbsearch(c, to%sp, nelem (to%sp)/3, 3);\n" | |
545 " if (p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" | |
546 " return c + p[2] - %d;\n", | |
547 label, label, TO_OFFSET); | |
548 | |
549 if(tos) | |
550 printf( | |
551 "\n p = rbsearch(c, to%ss, nelem (to%ss)/2, 2);\n" | |
552 " if (p && c == p[0])\n" | |
553 " return c + p[1] - %d;\n", | |
554 label, label, TO_OFFSET); | |
555 | |
556 printf( | |
557 "\n return c;\n" | |
558 "}\n" | |
559 "\n" | |
560 ); | |
561 } | |
562 | |
/*
 * Print only a range table and the Unicode::is<label> function for a
 * property.  Single code points are forced into one-entry ranges, so no
 * pair or singleton table is needed.
 *
 * If the property has no member at all, no table is printed; the generated
 * function then simply returns false instead of referring to a table that
 * does not exist.
 */
static void
mkisronly(const char* label, char* prop)
{
	int isr;

	isr = mkisrange(label, prop, 1);

	printf(
		"bool Unicode::is%s(char32_t c) noexcept\n"
		"{\n",
		label);

	if(isr){
		printf(
			" char32_t *p;\n"
			"\n"
			" p = rbsearch(c, is%sr, nelem (is%sr)/2, 2);\n"
			" if (p && c >= p[0] && c <= p[1])\n"
			" return true;\n\n",
			label, label);
	}else{
		/* keep the generated code free of unused-parameter warnings */
		printf(" (void)c;\n\n");
	}

	printf(
		" return false;\n"
		"}\n"
		"\n");
}
581 | |
582 /* | |
583 * generate the body of runetype. | |
584 * assumes there is a function Rune* rbsearch(Rune c, Rune *t, int n, int ne); | |
585 */ | |
586 static void | |
587 mktables(char *src, int usepairs) | |
588 { | |
589 /* Add nelem macro */ | |
590 printf( | |
591 "#define nelem(x) (sizeof (x) / sizeof ((x)[0]))\n\n" | |
592 ); | |
593 | |
594 /* Add the rbsearch function */ | |
595 printf( | |
596 "char32_t *rbsearch(char32_t c, char32_t *t, int n, int ne) noexcept\n" | |
597 "{\n" | |
598 " char32_t *p;\n" | |
599 " int m;\n\n" | |
600 " while (n > 1) {\n" | |
601 " m = n >> 1;\n" | |
602 " p = t + m * ne;\n\n" | |
603 " if (c >= p[0]) {\n" | |
604 " t = p;\n" | |
605 " n = n - m;\n" | |
606 " } else {\n" | |
607 " n = m;\n" | |
608 " }\n" | |
609 " }\n\n" | |
610 " if (n && c >= t[0])\n" | |
611 " return t;\n\n" | |
612 " return nullptr;\n" | |
613 "}\n\n" | |
614 ); | |
615 | |
616 /* | |
617 * we special case the space and digit tables, since they are assumed | |
618 * to be small with several ranges. | |
619 */ | |
620 mkisronly("space", myisspace); | |
621 mkisronly("digit", myisdigit); | |
622 | |
623 mkis("alpha", myisalpha, 0); | |
624 mkis("upper", myisupper, usepairs); | |
625 mkis("lower", myislower, usepairs); | |
626 mkis("title", myistitle, usepairs); | |
627 | |
628 mkto("upper", mytoupper, usepairs); | |
629 mkto("lower", mytolower, usepairs); | |
630 mkto("title", mytotitle, usepairs); | |
631 } | |
632 | |
/*
 * Split str in place into at most nfields fields.
 *
 * Every character of str that appears in delim is overwritten with '\0'
 * and the following character starts the next field.  Splitting stops as
 * soon as nfields pointers have been stored, so the last field keeps any
 * remaining delimiters.  fields[0] is always set to str.
 *
 * Returns the number of pointers stored in fields.
 */
static int
mygetfields(char **fields, int nfields, char *str, const char *delim)
{
	int count;
	char *cur;

	fields[0] = str;
	count = 1;

	cur = str;
	while(count < nfields && *cur != '\0'){
		if(strchr(delim, *cur) != NULL){
			*cur = '\0';
			fields[count] = cur + 1;
			count++;
		}
		cur++;
	}
	return count;
}
653 | |
654 static int | |
655 getunicodeline(FILE *in, char **fields, char *buf) | |
656 { | |
657 char *p; | |
658 | |
659 if(fgets(buf, MAX_LINE, in) == NULL) | |
660 return 0; | |
661 | |
662 p = strchr(buf, '\n'); | |
663 if (p == NULL) | |
664 fatal("line too long"); | |
665 *p = '\0'; | |
666 | |
667 if (mygetfields(fields, NFIELDS + 1, buf, ";") != NFIELDS) | |
668 fatal("bad number of fields"); | |
669 | |
670 return 1; | |
671 } | |
672 | |
673 static int | |
674 getcode(char *s) | |
675 { | |
676 int i, code; | |
677 | |
678 code = 0; | |
679 i = 0; | |
680 /* Parse a hex number */ | |
681 while(s[i]) { | |
682 code <<= 4; | |
683 if(s[i] >= '0' && s[i] <= '9') | |
684 code += s[i] - '0'; | |
685 else if(s[i] >= 'A' && s[i] <= 'F') | |
686 code += s[i] - 'A' + 10; | |
687 else | |
688 fatal("bad code char '%c'", s[i]); | |
689 i++; | |
690 } | |
691 return code; | |
692 } | |
693 | |
/*
 * Report an unrecoverable error and terminate.
 *
 * Writes "mkunicode: fatal error: ", the printf-style message built from
 * fmt and its arguments, and a newline to stderr, then exits with
 * status 1.  This function does not return.
 */
static void
fatal(const char *fmt, ...)
{
	va_list ap;

	fputs("mkunicode: fatal error: ", stderr);

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);

	fputc('\n', stderr);

	exit(1);
}