comparison tools/mkunicode/src/mkunicode.c @ 352:7fe8d4094983

Utf8: - Fix invalid decoding from UTF-8 to UTF-32 - Add all files
author David Demelier <markand@malikania.fr>
date Wed, 08 Apr 2015 12:33:45 +0200
parents
children b78d6d8f2872
comparison
equal deleted inserted replaced
351:47a206e724f2 352:7fe8d4094983
1 /*
2 * Tool to create our Unicode.cpp and Unicode.h file.
3 *
4 * Current version: 7.0.0
5 *
6 * Based on mkrunetype from the Go language.
7 *
8 * Adapted to generate C++ code.
9 */
10
11 // Copyright 2009 The Go Authors. All rights reserved.
12 // Use of this source code is governed by a BSD-style
13 // license that can be found in the LICENSE file.
14
15 /*
16 * make is(upper|lower|title|space|alpha)rune and
17 * to(upper|lower|title)rune from a UnicodeData.txt file.
18 * these can be found at unicode.org
19 *
20 * with -c, runs a check of the existing runetype functions vs.
21 * those extracted from UnicodeData.
22 *
23 * with -p, generates tables for pairs of chars, as well as for ranges
24 * and singletons.
25 *
26 * UnicodeData defines 4 fields of interest:
27 * 1) a category
28 * 2) an upper case mapping
29 * 3) a lower case mapping
30 * 4) a title case mapping
31 *
32 * toupper, tolower, and totitle are defined directly from the mapping.
33 *
34 * isalpharune(c) is true iff c is a "letter" category
35 * isupperrune(c) is true iff c is the target of toupperrune,
36 * or is in the uppercase letter category
37 * similarly for islowerrune and istitlerune.
38 * isspacerune is true for space category chars, "C" locale white space chars,
39 * and two additions:
40 * 0085 "next line" control char
41 * feff "zero-width non-break space"
42 * isdigitrune is true iff c is a numeric-digit category.
43 */
44
45 #include <stdarg.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49
50 #include "utf.h"
51 #include "utfdef.h"
52
/* Element count of a true array (not of a pointer). */
#define nelem(x) (sizeof(x)/sizeof((x)[0]))

enum {
	/*
	 * Semicolon-separated columns of one UnicodeData.txt line, in file
	 * order.  NFIELDS is the number of columns a valid line must have.
	 */
	FIELD_CODE,
	FIELD_NAME,
	FIELD_CATEGORY,
	FIELD_COMBINING,
	FIELD_BIDIR,
	FIELD_DECOMP,
	FIELD_DECIMAL_DIG,
	FIELD_DIG,
	FIELD_NUMERIC_VAL,
	FIELD_MIRRORED,
	FIELD_UNICODE_1_NAME,
	FIELD_COMMENT,
	FIELD_UPPER,
	FIELD_LOWER,
	FIELD_TITLE,
	NFIELDS,

	/* Size of the buffer used to read one input line. */
	MAX_LINE = 1024,

	/* Bias added to every stored case-mapping delta (see TO_DELTA). */
	TO_OFFSET = 1 << 20,

	/* Number of code points covered by the tables below. */
	NRUNES = 1 << 21,
};

/*
 * Delta stored in the generated mapping tables: the distance from x to
 * its mapped value, biased by TO_OFFSET.  The generated lookup functions
 * subtract TO_OFFSET again.
 */
#define TO_DELTA(xmapped,x) (TO_OFFSET + (xmapped) - (x))

/* Per-code-point property flags (non-zero means "has the property"). */
static char myisspace[NRUNES];
static char myisalpha[NRUNES];
static char myisdigit[NRUNES];
static char myisupper[NRUNES];
static char myislower[NRUNES];
static char myistitle[NRUNES];

/* Per-code-point case mappings; main() initialises them to identity. */
static int mytoupper[NRUNES];
static int mytolower[NRUNES];
static int mytotitle[NRUNES];
95
/*
 * Forward declarations for helpers defined after their first use.
 * (The former check() prototype was dropped: nothing in this file
 * defines or calls it.)
 */
static void mktables(char *src, int usepairs);
static void fatal(const char *fmt, ...);
static int mygetfields(char **fields, int nfields, char *str, const char *delim);
static int getunicodeline(FILE *in, char **fields, char *buf);
static int getcode(char *s);
102
/*
 * Print the command-line synopsis on stderr and exit with status 1.
 *
 * main() accepts exactly one argument (the path to UnicodeData.txt) and
 * no options, so the synopsis lists only that argument.
 */
static void
usage(void)
{
	fprintf(stderr, "usage: mkunicode <UnicodeData.txt>\n");
	exit(1);
}
109
110 int
111 main(int argc, char *argv[])
112 {
113 FILE *in;
114 char buf[MAX_LINE], buf2[MAX_LINE];
115 char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
116 char *p;
117 int i, code, last, usepairs;
118
119 usepairs = 0;
120
121 --argc;
122 ++argv;
123
124 if (argc != 1){
125 usage();
126 }
127
128 in = fopen(argv[0], "r");
129 if (in == NULL){
130 fatal("can't open %s", argv[0]);
131 }
132
133 for(i = 0; i < NRUNES; i++){
134 mytoupper[i] = i;
135 mytolower[i] = i;
136 mytotitle[i] = i;
137 }
138
139 /*
140 * make sure isspace has all of the "C" locale whitespace chars
141 */
142 myisspace['\t'] = 1;
143 myisspace['\n'] = 1;
144 myisspace['\r'] = 1;
145 myisspace['\f'] = 1;
146 myisspace['\v'] = 1;
147
148 /*
149 * a couple of other exceptions
150 */
151 myisspace[0x85] = 1; /* control char, "next line" */
152 myisspace[0xfeff] = 1; /* zero-width non-break space */
153
154 last = -1;
155 while(getunicodeline(in, fields, buf)){
156 code = getcode(fields[FIELD_CODE]);
157 if (code >= NRUNES)
158 fatal("code-point value too big: %x", code);
159 if(code <= last)
160 fatal("bad code sequence: %x then %x", last, code);
161 last = code;
162
163 /*
164 * check for ranges
165 */
166 p = fields[FIELD_CATEGORY];
167 if(strstr(fields[FIELD_NAME], ", First>") != NULL){
168 if(!getunicodeline(in, fields2, buf2))
169 fatal("range start at eof");
170 if (strstr(fields2[FIELD_NAME], ", Last>") == NULL)
171 fatal("range start not followed by range end");
172 last = getcode(fields2[FIELD_CODE]);
173 if(last <= code)
174 fatal("range out of sequence: %x then %x", code, last);
175 if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
176 fatal("range with mismatched category");
177 }
178
179 /*
180 * set properties and conversions
181 */
182 for (; code <= last; code++){
183 if(p[0] == 'L')
184 myisalpha[code] = 1;
185 if(p[0] == 'Z')
186 myisspace[code] = 1;
187
188 if(strcmp(p, "Lu") == 0)
189 myisupper[code] = 1;
190 if(strcmp(p, "Ll") == 0)
191 myislower[code] = 1;
192
193 if(strcmp(p, "Lt") == 0)
194 myistitle[code] = 1;
195
196 if(strcmp(p, "Nd") == 0)
197 myisdigit[code] = 1;
198
199 /*
200 * when finding conversions, also need to mark
201 * upper/lower case, since some chars, like
202 * "III" (0x2162), aren't defined as letters but have a
203 * lower case mapping ("iii" (0x2172)).
204 */
205 if(fields[FIELD_UPPER][0] != '\0'){
206 mytoupper[code] = getcode(fields[FIELD_UPPER]);
207 }
208 if(fields[FIELD_LOWER][0] != '\0'){
209 mytolower[code] = getcode(fields[FIELD_LOWER]);
210 }
211 if(fields[FIELD_TITLE][0] != '\0'){
212 mytotitle[code] = getcode(fields[FIELD_TITLE]);
213 }
214 }
215 }
216
217 fclose(in);
218
219 /*
220 * check for codes with no totitle mapping but a toupper mapping.
221 * these appear in UnicodeData-2.0.14.txt, but are almost certainly
222 * erroneous.
223 */
224 for(i = 0; i < NRUNES; i++){
225 if(mytotitle[i] == i
226 && mytoupper[i] != i
227 && !myistitle[i])
228 fprintf(stderr, "warning: code=%.4x not istitle, totitle is same, toupper=%.4x\n", i, mytoupper[i]);
229 }
230
231 /*
232 * make sure isupper[c] is true if for some x toupper[x] == c
233 * ditto for islower and istitle
234 */
235 for(i = 0; i < NRUNES; i++) {
236 if(mytoupper[i] != i)
237 myisupper[mytoupper[i]] = 1;
238 if(mytolower[i] != i)
239 myislower[mytolower[i]] = 1;
240 if(mytotitle[i] != i)
241 myistitle[mytotitle[i]] = 1;
242 }
243
244 mktables(argv[0], usepairs);
245 exit(0);
246 }
247
248 /*
249 * generate a properties array for ranges, clearing those cases covered.
250 * if force, generate one-entry ranges for singletons.
251 */
252 static int
253 mkisrange(const char* label, char* prop, int force)
254 {
255 int start, stop, some;
256
257 /*
258 * first, the ranges
259 */
260 some = 0;
261 for(start = 0; start < NRUNES; ) {
262 if(!prop[start]){
263 start++;
264 continue;
265 }
266
267 for(stop = start + 1; stop < NRUNES; stop++){
268 if(!prop[stop]){
269 break;
270 }
271 prop[stop] = 0;
272 }
273 if(force || stop != start + 1){
274 if(!some){
275 printf("static char32_t is%sr[] = {\n", label);
276 some = 1;
277 }
278 prop[start] = 0;
279 printf("\t0x%.4x, 0x%.4x,\n", start, stop - 1);
280 }
281
282 start = stop;
283 }
284 if(some)
285 printf("};\n\n");
286 return some;
287 }
288
289 /*
290 * generate a mapping array for pairs with a skip between,
291 * clearing those entries covered.
292 */
293 static int
294 mkispair(const char *label, char *prop)
295 {
296 int start, stop, some;
297
298 some = 0;
299 for(start = 0; start + 2 < NRUNES; ) {
300 if(!prop[start]){
301 start++;
302 continue;
303 }
304
305 for(stop = start + 2; stop < NRUNES; stop += 2){
306 if(!prop[stop]){
307 break;
308 }
309 prop[stop] = 0;
310 }
311 if(stop != start + 2){
312 if(!some){
313 printf("static char32_t is%sp[] = {\n", label);
314 some = 1;
315 }
316 prop[start] = 0;
317 printf("\t0x%.4x, 0x%.4x,\n", start, stop - 2);
318 }
319
320 start = stop;
321 }
322 if(some)
323 printf("};\n\n");
324 return some;
325 }
326
327 /*
328 * generate a properties array for singletons, clearing those cases covered.
329 */
330 static int
331 mkissingle(const char *label, char *prop)
332 {
333 int start, some;
334
335 some = 0;
336 for(start = 0; start < NRUNES; start++) {
337 if(!prop[start]){
338 continue;
339 }
340
341 if(!some){
342 printf("static char32_t is%ss[] = {\n", label);
343 some = 1;
344 }
345 prop[start] = 0;
346 printf("\t0x%.4x,\n", start);
347 }
348 if(some)
349 printf("};\n\n");
350 return some;
351 }
352
/*
 * Print the tables for one property followed by the C++ function
 * Unicode::is<label>() that searches them.
 *
 * Ranges are generated first, then pairs (only when usepairs is
 * non-zero), then singletons; the generated function contains one
 * lookup per table that was actually printed.
 */
static void
mkis(const char* label, char* prop, int usepairs)
{
	int have_ranges, have_pairs, have_singles;

	have_ranges = mkisrange(label, prop, 0);
	have_pairs = usepairs ? mkispair(label, prop) : 0;
	have_singles = mkissingle(label, prop);

	/* function header */
	printf(
		"bool Unicode::is%s(char32_t c) noexcept\n"
		"{\n"
		" char32_t *p;\n"
		"\n",
		label);

	if (have_ranges) {
		printf(
			" p = rbsearch(c, is%sr, nelem (is%sr)/2, 2);\n"
			" if (p && c >= p[0] && c <= p[1])\n"
			" return true;\n",
			label, label);
	}

	if (have_pairs) {
		printf(
			"\n p = rbsearch(c, is%sp, nelem (is%sp)/2, 2);\n"
			" if (p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
			" return true;\n",
			label, label);
	}

	if (have_singles) {
		printf(
			"\n p = rbsearch(c, is%ss, nelem (is%ss), 1);\n"
			" if (p && c == p[0])\n"
			" return true;\n",
			label, label);
	}

	/* function trailer */
	printf(
		"\n return false;\n"
		"}\n"
		"\n"
	);
}
402
403 /*
404 * generate a mapping array for ranges, clearing those entries covered.
405 * if force, generate one-entry ranges for singletons.
406 */
407 static int
408 mktorange(const char* label, int* map, int force)
409 {
410 int start, stop, delta, some;
411
412 some = 0;
413 for(start = 0; start < NRUNES; ) {
414 if(map[start] == start){
415 start++;
416 continue;
417 }
418
419 delta = TO_DELTA(map[start], start);
420 if(delta != (Rune)delta)
421 fatal("bad map delta %d", delta);
422 for(stop = start + 1; stop < NRUNES; stop++){
423 if(TO_DELTA(map[stop], stop) != delta){
424 break;
425 }
426 map[stop] = stop;
427 }
428 if(stop != start + 1){
429 if(!some){
430 printf("char32_t to%sr[] = {\n", label);
431 some = 1;
432 }
433 map[start] = start;
434 printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 1, delta);
435 }
436
437 start = stop;
438 }
439 if(some)
440 printf("};\n\n");
441 return some;
442 }
443
444 /*
445 * generate a mapping array for pairs with a skip between,
446 * clearing those entries covered.
447 */
448 static int
449 mktopair(const char* label, int* map)
450 {
451 int start, stop, delta, some;
452
453 some = 0;
454 for(start = 0; start + 2 < NRUNES; ) {
455 if(map[start] == start){
456 start++;
457 continue;
458 }
459
460 delta = TO_DELTA(map[start], start);
461 if(delta != (Rune)delta)
462 fatal("bad map delta %d", delta);
463 for(stop = start + 2; stop < NRUNES; stop += 2){
464 if(TO_DELTA(map[stop], stop) != delta){
465 break;
466 }
467 map[stop] = stop;
468 }
469 if(stop != start + 2){
470 if(!some){
471 printf("static char32_t to%sp[] = {\n", label);
472 some = 1;
473 }
474 map[start] = start;
475 printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 2, delta);
476 }
477
478 start = stop;
479 }
480 if(some)
481 printf("};\n\n");
482 return some;
483 }
484
485 /*
486 * generate a mapping array for singletons, clearing those entries covered.
487 */
488 static int
489 mktosingle(const char* label, int* map)
490 {
491 int start, delta, some;
492
493 some = 0;
494 for(start = 0; start < NRUNES; start++) {
495 if(map[start] == start){
496 continue;
497 }
498
499 delta = TO_DELTA(map[start], start);
500 if(delta != (Rune)delta)
501 fatal("bad map delta %d", delta);
502 if(!some){
503 printf("static char32_t to%ss[] = {\n", label);
504 some = 1;
505 }
506 map[start] = start;
507 printf("\t0x%.4x, %d,\n", start, delta);
508 }
509 if(some)
510 printf("};\n\n");
511 return some;
512 }
513
514 /*
515 * generate tables and a function for to<label>rune
516 */
517 static void
518 mkto(const char* label, int* map, int usepairs)
519 {
520 int tor, top, tos;
521
522 tor = mktorange(label, map, 0);
523 top = 0;
524 if(usepairs)
525 top = mktopair(label, map);
526 tos = mktosingle(label, map);
527
528 printf(
529 "char32_t Unicode::to%s(char32_t c) noexcept\n"
530 "{\n"
531 " char32_t *p;\n"
532 "\n",
533 label);
534
535 if(tor)
536 printf(
537 " p = rbsearch(c, to%sr, nelem (to%sr)/3, 3);\n"
538 " if (p && c >= p[0] && c <= p[1])\n"
539 " return c + p[2] - %d;\n",
540 label, label, TO_OFFSET);
541
542 if(top)
543 printf(
544 "\n p = rbsearch(c, to%sp, nelem (to%sp)/3, 3);\n"
545 " if (p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
546 " return c + p[2] - %d;\n",
547 label, label, TO_OFFSET);
548
549 if(tos)
550 printf(
551 "\n p = rbsearch(c, to%ss, nelem (to%ss)/2, 2);\n"
552 " if (p && c == p[0])\n"
553 " return c + p[1] - %d;\n",
554 label, label, TO_OFFSET);
555
556 printf(
557 "\n return c;\n"
558 "}\n"
559 "\n"
560 );
561 }
562
/*
 * Print a range-only table for one property (single code points are
 * forced into one-entry ranges) followed by the C++ function
 * Unicode::is<label>() that searches it.
 */
static void
mkisronly(const char* label, char* prop)
{
	mkisrange(label, prop, 1);

	printf("bool Unicode::is%s(char32_t c) noexcept\n", label);
	fputs(
		"{\n"
		" char32_t *p;\n"
		"\n",
		stdout);
	printf(" p = rbsearch(c, is%sr, nelem (is%sr)/2, 2);\n", label, label);
	fputs(
		" if (p && c >= p[0] && c <= p[1])\n"
		" return true;\n\n"
		" return false;\n"
		"}\n"
		"\n",
		stdout);
}
581
582 /*
583 * generate the body of runetype.
584 * assumes there is a function Rune* rbsearch(Rune c, Rune *t, int n, int ne);
585 */
586 static void
587 mktables(char *src, int usepairs)
588 {
589 /* Add nelem macro */
590 printf(
591 "#define nelem(x) (sizeof (x) / sizeof ((x)[0]))\n\n"
592 );
593
594 /* Add the rbsearch function */
595 printf(
596 "char32_t *rbsearch(char32_t c, char32_t *t, int n, int ne) noexcept\n"
597 "{\n"
598 " char32_t *p;\n"
599 " int m;\n\n"
600 " while (n > 1) {\n"
601 " m = n >> 1;\n"
602 " p = t + m * ne;\n\n"
603 " if (c >= p[0]) {\n"
604 " t = p;\n"
605 " n = n - m;\n"
606 " } else {\n"
607 " n = m;\n"
608 " }\n"
609 " }\n\n"
610 " if (n && c >= t[0])\n"
611 " return t;\n\n"
612 " return nullptr;\n"
613 "}\n\n"
614 );
615
616 /*
617 * we special case the space and digit tables, since they are assumed
618 * to be small with several ranges.
619 */
620 mkisronly("space", myisspace);
621 mkisronly("digit", myisdigit);
622
623 mkis("alpha", myisalpha, 0);
624 mkis("upper", myisupper, usepairs);
625 mkis("lower", myislower, usepairs);
626 mkis("title", myistitle, usepairs);
627
628 mkto("upper", mytoupper, usepairs);
629 mkto("lower", mytolower, usepairs);
630 mkto("title", mytotitle, usepairs);
631 }
632
/*
 * Split str in place into at most nfields fields.
 *
 * Every character of str that occurs in delim is overwritten with '\0'
 * and ends the current field; fields[i] is set to the start of field i.
 * Once nfields fields have been started, the remainder of the string is
 * left untouched as part of the last field.  Adjacent delimiters produce
 * empty fields.
 *
 * Returns the number of fields stored.  When nfields is zero or negative
 * nothing is written to fields or str and 0 is returned.
 */
static int
mygetfields(char **fields, int nfields, char *str, const char *delim)
{
	int nf;

	/* no room for even one pointer: do not touch fields[0] */
	if (nfields <= 0)
		return 0;

	fields[0] = str;
	nf = 1;
	if (nf >= nfields)
		return nf;

	for (; *str != '\0'; str++) {
		if (strchr(delim, *str) != NULL) {
			*str = '\0';
			fields[nf++] = str + 1;
			if (nf >= nfields)
				break;
		}
	}
	return nf;
}
653
654 static int
655 getunicodeline(FILE *in, char **fields, char *buf)
656 {
657 char *p;
658
659 if(fgets(buf, MAX_LINE, in) == NULL)
660 return 0;
661
662 p = strchr(buf, '\n');
663 if (p == NULL)
664 fatal("line too long");
665 *p = '\0';
666
667 if (mygetfields(fields, NFIELDS + 1, buf, ";") != NFIELDS)
668 fatal("bad number of fields");
669
670 return 1;
671 }
672
673 static int
674 getcode(char *s)
675 {
676 int i, code;
677
678 code = 0;
679 i = 0;
680 /* Parse a hex number */
681 while(s[i]) {
682 code <<= 4;
683 if(s[i] >= '0' && s[i] <= '9')
684 code += s[i] - '0';
685 else if(s[i] >= 'A' && s[i] <= 'F')
686 code += s[i] - 'A' + 10;
687 else
688 fatal("bad code char '%c'", s[i]);
689 i++;
690 }
691 return code;
692 }
693
/*
 * Report an unrecoverable error and terminate.
 *
 * Writes "mkunicode: fatal error: ", the printf-style message built from
 * fmt and its arguments, and a newline to stderr, then exits with
 * status 1.  Never returns.
 */
static void
fatal(const char *fmt, ...)
{
	va_list ap;

	fputs("mkunicode: fatal error: ", stderr);

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);

	fputc('\n', stderr);
	exit(1);
}