Mercurial > code
diff tools/mkunicode/src/mkunicode.c @ 352:7fe8d4094983
Utf8:
- Fix invalid decoding from UTF-8 to UTF-32
- Add all files
author | David Demelier <markand@malikania.fr> |
---|---|
date | Wed, 08 Apr 2015 12:33:45 +0200 |
parents | |
children | b78d6d8f2872 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/mkunicode/src/mkunicode.c Wed Apr 08 12:33:45 2015 +0200 @@ -0,0 +1,706 @@ +/* + * Tool to create our Unicode.cpp and Unicode.h file. + * + * Current version: 7.0.0 + * + * Based on mkrunetype from the Go language. + * + * Adapted to generated C++ code. + */ + +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * make is(upper|lower|title|space|alpha)rune and + * to(upper|lower|title)rune from a UnicodeData.txt file. + * these can be found at unicode.org + * + * with -c, runs a check of the existing runetype functions vs. + * those extracted from UnicodeData. + * + * with -p, generates tables for pairs of chars, as well as for ranges + * and singletons. + * + * UnicodeData defines 4 fields of interest: + * 1) a category + * 2) an upper case mapping + * 3) a lower case mapping + * 4) a title case mapping + * + * toupper, tolower, and totitle are defined directly from the mapping. + * + * isalpharune(c) is true iff c is a "letter" category + * isupperrune(c) is true iff c is the target of toupperrune, + * or is in the uppercase letter category + * similarly for islowerrune and istitlerune. + * isspacerune is true for space category chars, "C" locale white space chars, + * and two additions: + * 0085 "next line" control char + * feff] "zero-width non-break space" + * isdigitrune is true iff c is a numeric-digit category. + */ + +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "utf.h" +#include "utfdef.h" + +#define nelem(x) (sizeof(x)/sizeof((x)[0])) + +enum { + /* + * fields in the unicode data file + */ + FIELD_CODE, + FIELD_NAME, + FIELD_CATEGORY, + FIELD_COMBINING, + FIELD_BIDIR, + FIELD_DECOMP, + FIELD_DECIMAL_DIG, + FIELD_DIG, + FIELD_NUMERIC_VAL, + FIELD_MIRRORED, + FIELD_UNICODE_1_NAME, + FIELD_COMMENT, + FIELD_UPPER, + FIELD_LOWER, + FIELD_TITLE, + NFIELDS, + + MAX_LINE = 1024, + + TO_OFFSET = 1 << 20, + + NRUNES = 1 << 21, +}; + +#define TO_DELTA(xmapped,x) (TO_OFFSET + (xmapped) - (x)) + +static char myisspace[NRUNES]; +static char myisalpha[NRUNES]; +static char myisdigit[NRUNES]; +static char myisupper[NRUNES]; +static char myislower[NRUNES]; +static char myistitle[NRUNES]; + +static int mytoupper[NRUNES]; +static int mytolower[NRUNES]; +static int mytotitle[NRUNES]; + +static void check(void); +static void mktables(char *src, int usepairs); +static void fatal(const char *fmt, ...); +static int mygetfields(char **fields, int nfields, char *str, const char *delim); +static int getunicodeline(FILE *in, char **fields, char *buf); +static int getcode(char *s); + +static void +usage(void) +{ + fprintf(stderr, "usage: mktables [-cp] <UnicodeData.txt>\n"); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + FILE *in; + char buf[MAX_LINE], buf2[MAX_LINE]; + char *fields[NFIELDS + 1], *fields2[NFIELDS + 1]; + char *p; + int i, code, last, usepairs; + + usepairs = 0; + + --argc; + ++argv; + + if (argc != 1){ + usage(); + } + + in = fopen(argv[0], "r"); + if (in == NULL){ + fatal("can't open %s", argv[0]); + } + + for(i = 0; i < NRUNES; i++){ + mytoupper[i] = i; + mytolower[i] = i; + mytotitle[i] = i; + } + + /* + * make sure isspace has all of the "C" locale whitespace chars + */ + myisspace['\t'] = 1; + myisspace['\n'] = 1; + myisspace['\r'] = 1; + myisspace['\f'] = 1; + myisspace['\v'] = 1; + + /* + * a couple of other exceptions + */ + myisspace[0x85] = 1; /* control char, "next line" */ + myisspace[0xfeff] = 1; /* zero-width non-break space */ + + last = -1; + while(getunicodeline(in, fields, buf)){ + code = getcode(fields[FIELD_CODE]); + if (code >= NRUNES) + fatal("code-point value too big: %x", code); + if(code <= last) + fatal("bad code sequence: %x then %x", last, code); + last = code; + + /* + * check for ranges + */ + p = fields[FIELD_CATEGORY]; + if(strstr(fields[FIELD_NAME], ", First>") != NULL){ + if(!getunicodeline(in, fields2, buf2)) + fatal("range start at eof"); + if (strstr(fields2[FIELD_NAME], ", Last>") == NULL) + fatal("range start not followed by range end"); + last = getcode(fields2[FIELD_CODE]); + if(last <= code) + fatal("range out of sequence: %x then %x", code, last); + if(strcmp(p, fields2[FIELD_CATEGORY]) != 0) + fatal("range with mismatched category"); + } + + /* + * set properties and conversions + */ + for (; code <= last; code++){ + if(p[0] == 'L') + myisalpha[code] = 1; + if(p[0] == 'Z') + myisspace[code] = 1; + + if(strcmp(p, "Lu") == 0) + myisupper[code] = 1; + if(strcmp(p, "Ll") == 0) + myislower[code] = 1; + + if(strcmp(p, "Lt") == 0) + myistitle[code] = 1; + + if(strcmp(p, "Nd") == 0) + myisdigit[code] = 1; + + /* + * when finding conversions, also need to mark + * upper/lower case, since some chars, like + * "III" (0x2162), aren't defined as letters but have a + * lower case mapping ("iii" (0x2172)). + */ + if(fields[FIELD_UPPER][0] != '\0'){ + mytoupper[code] = getcode(fields[FIELD_UPPER]); + } + if(fields[FIELD_LOWER][0] != '\0'){ + mytolower[code] = getcode(fields[FIELD_LOWER]); + } + if(fields[FIELD_TITLE][0] != '\0'){ + mytotitle[code] = getcode(fields[FIELD_TITLE]); + } + } + } + + fclose(in); + + /* + * check for codes with no totitle mapping but a toupper mapping. + * these appear in UnicodeData-2.0.14.txt, but are almost certainly + * erroneous. + */ + for(i = 0; i < NRUNES; i++){ + if(mytotitle[i] == i + && mytoupper[i] != i + && !myistitle[i]) + fprintf(stderr, "warning: code=%.4x not istitle, totitle is same, toupper=%.4x\n", i, mytoupper[i]); + } + + /* + * make sure isupper[c] is true if for some x toupper[x] == c + * ditto for islower and istitle + */ + for(i = 0; i < NRUNES; i++) { + if(mytoupper[i] != i) + myisupper[mytoupper[i]] = 1; + if(mytolower[i] != i) + myislower[mytolower[i]] = 1; + if(mytotitle[i] != i) + myistitle[mytotitle[i]] = 1; + } + + mktables(argv[0], usepairs); + exit(0); +} + +/* + * generate a properties array for ranges, clearing those cases covered. + * if force, generate one-entry ranges for singletons. + */ +static int +mkisrange(const char* label, char* prop, int force) +{ + int start, stop, some; + + /* + * first, the ranges + */ + some = 0; + for(start = 0; start < NRUNES; ) { + if(!prop[start]){ + start++; + continue; + } + + for(stop = start + 1; stop < NRUNES; stop++){ + if(!prop[stop]){ + break; + } + prop[stop] = 0; + } + if(force || stop != start + 1){ + if(!some){ + printf("static char32_t is%sr[] = {\n", label); + some = 1; + } + prop[start] = 0; + printf("\t0x%.4x, 0x%.4x,\n", start, stop - 1); + } + + start = stop; + } + if(some) + printf("};\n\n"); + return some; +} + +/* + * generate a mapping array for pairs with a skip between, + * clearing those entries covered. + */ +static int +mkispair(const char *label, char *prop) +{ + int start, stop, some; + + some = 0; + for(start = 0; start + 2 < NRUNES; ) { + if(!prop[start]){ + start++; + continue; + } + + for(stop = start + 2; stop < NRUNES; stop += 2){ + if(!prop[stop]){ + break; + } + prop[stop] = 0; + } + if(stop != start + 2){ + if(!some){ + printf("static char32_t is%sp[] = {\n", label); + some = 1; + } + prop[start] = 0; + printf("\t0x%.4x, 0x%.4x,\n", start, stop - 2); + } + + start = stop; + } + if(some) + printf("};\n\n"); + return some; +} + +/* + * generate a properties array for singletons, clearing those cases covered. + */ +static int +mkissingle(const char *label, char *prop) +{ + int start, some; + + some = 0; + for(start = 0; start < NRUNES; start++) { + if(!prop[start]){ + continue; + } + + if(!some){ + printf("static char32_t is%ss[] = {\n", label); + some = 1; + } + prop[start] = 0; + printf("\t0x%.4x,\n", start); + } + if(some) + printf("};\n\n"); + return some; +} + +/* + * generate tables and a function for is<label>rune + */ +static void +mkis(const char* label, char* prop, int usepairs) +{ + int isr, isp, iss; + + isr = mkisrange(label, prop, 0); + isp = 0; + if(usepairs) + isp = mkispair(label, prop); + iss = mkissingle(label, prop); + + printf( + "bool Unicode::is%s(char32_t c) noexcept\n" + "{\n" + " char32_t *p;\n" + "\n", + label); + + if(isr) + printf( + " p = rbsearch(c, is%sr, nelem (is%sr)/2, 2);\n" + " if (p && c >= p[0] && c <= p[1])\n" + " return true;\n", + label, label); + + if(isp) + printf( + "\n p = rbsearch(c, is%sp, nelem (is%sp)/2, 2);\n" + " if (p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" + " return true;\n", + label, label); + + if(iss) + printf( + "\n p = rbsearch(c, is%ss, nelem (is%ss), 1);\n" + " if (p && c == p[0])\n" + " return true;\n", + label, label); + + + printf( + "\n return false;\n" + "}\n" + "\n" + ); +} + +/* + * generate a mapping array for ranges, clearing those entries covered. + * if force, generate one-entry ranges for singletons. + */ +static int +mktorange(const char* label, int* map, int force) +{ + int start, stop, delta, some; + + some = 0; + for(start = 0; start < NRUNES; ) { + if(map[start] == start){ + start++; + continue; + } + + delta = TO_DELTA(map[start], start); + if(delta != (Rune)delta) + fatal("bad map delta %d", delta); + for(stop = start + 1; stop < NRUNES; stop++){ + if(TO_DELTA(map[stop], stop) != delta){ + break; + } + map[stop] = stop; + } + if(stop != start + 1){ + if(!some){ + printf("char32_t to%sr[] = {\n", label); + some = 1; + } + map[start] = start; + printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 1, delta); + } + + start = stop; + } + if(some) + printf("};\n\n"); + return some; +} + +/* + * generate a mapping array for pairs with a skip between, + * clearing those entries covered. + */ +static int +mktopair(const char* label, int* map) +{ + int start, stop, delta, some; + + some = 0; + for(start = 0; start + 2 < NRUNES; ) { + if(map[start] == start){ + start++; + continue; + } + + delta = TO_DELTA(map[start], start); + if(delta != (Rune)delta) + fatal("bad map delta %d", delta); + for(stop = start + 2; stop < NRUNES; stop += 2){ + if(TO_DELTA(map[stop], stop) != delta){ + break; + } + map[stop] = stop; + } + if(stop != start + 2){ + if(!some){ + printf("static char32_t to%sp[] = {\n", label); + some = 1; + } + map[start] = start; + printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 2, delta); + } + + start = stop; + } + if(some) + printf("};\n\n"); + return some; +} + +/* + * generate a mapping array for singletons, clearing those entries covered. + */ +static int +mktosingle(const char* label, int* map) +{ + int start, delta, some; + + some = 0; + for(start = 0; start < NRUNES; start++) { + if(map[start] == start){ + continue; + } + + delta = TO_DELTA(map[start], start); + if(delta != (Rune)delta) + fatal("bad map delta %d", delta); + if(!some){ + printf("static char32_t to%ss[] = {\n", label); + some = 1; + } + map[start] = start; + printf("\t0x%.4x, %d,\n", start, delta); + } + if(some) + printf("};\n\n"); + return some; +} + +/* + * generate tables and a function for to<label>rune + */ +static void +mkto(const char* label, int* map, int usepairs) +{ + int tor, top, tos; + + tor = mktorange(label, map, 0); + top = 0; + if(usepairs) + top = mktopair(label, map); + tos = mktosingle(label, map); + + printf( + "char32_t Unicode::to%s(char32_t c) noexcept\n" + "{\n" + " char32_t *p;\n" + "\n", + label); + + if(tor) + printf( + " p = rbsearch(c, to%sr, nelem (to%sr)/3, 3);\n" + " if (p && c >= p[0] && c <= p[1])\n" + " return c + p[2] - %d;\n", + label, label, TO_OFFSET); + + if(top) + printf( + "\n p = rbsearch(c, to%sp, nelem (to%sp)/3, 3);\n" + " if (p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" + " return c + p[2] - %d;\n", + label, label, TO_OFFSET); + + if(tos) + printf( + "\n p = rbsearch(c, to%ss, nelem (to%ss)/2, 2);\n" + " if (p && c == p[0])\n" + " return c + p[1] - %d;\n", + label, label, TO_OFFSET); + + printf( + "\n return c;\n" + "}\n" + "\n" + ); +} + +// Make only range tables and a function for is<label>rune. +static void +mkisronly(const char* label, char* prop) +{ + mkisrange(label, prop, 1); + printf( + "bool Unicode::is%s(char32_t c) noexcept\n" + "{\n" + " char32_t *p;\n" + "\n" + " p = rbsearch(c, is%sr, nelem (is%sr)/2, 2);\n" + " if (p && c >= p[0] && c <= p[1])\n" + " return true;\n\n" + " return false;\n" + "}\n" + "\n", + label, label, label); +} + +/* + * generate the body of runetype. + * assumes there is a function Rune* rbsearch(Rune c, Rune *t, int n, int ne); + */ +static void +mktables(char *src, int usepairs) +{ + /* Add nelem macro */ + printf( + "#define nelem(x) (sizeof (x) / sizeof ((x)[0]))\n\n" + ); + + /* Add the rbsearch function */ + printf( + "char32_t *rbsearch(char32_t c, char32_t *t, int n, int ne) noexcept\n" + "{\n" + " char32_t *p;\n" + " int m;\n\n" + " while (n > 1) {\n" + " m = n >> 1;\n" + " p = t + m * ne;\n\n" + " if (c >= p[0]) {\n" + " t = p;\n" + " n = n - m;\n" + " } else {\n" + " n = m;\n" + " }\n" + " }\n\n" + " if (n && c >= t[0])\n" + " return t;\n\n" + " return nullptr;\n" + "}\n\n" + ); + + /* + * we special case the space and digit tables, since they are assumed + * to be small with several ranges. + */ + mkisronly("space", myisspace); + mkisronly("digit", myisdigit); + + mkis("alpha", myisalpha, 0); + mkis("upper", myisupper, usepairs); + mkis("lower", myislower, usepairs); + mkis("title", myistitle, usepairs); + + mkto("upper", mytoupper, usepairs); + mkto("lower", mytolower, usepairs); + mkto("title", mytotitle, usepairs); +} + +static int +mygetfields(char **fields, int nfields, char *str, const char *delim) +{ + int nf; + + fields[0] = str; + nf = 1; + if(nf >= nfields) + return nf; + + for(; *str; str++){ + if(strchr(delim, *str) != NULL){ + *str = '\0'; + fields[nf++] = str + 1; + if(nf >= nfields) + break; + } + } + return nf; +} + +static int +getunicodeline(FILE *in, char **fields, char *buf) +{ + char *p; + + if(fgets(buf, MAX_LINE, in) == NULL) + return 0; + + p = strchr(buf, '\n'); + if (p == NULL) + fatal("line too long"); + *p = '\0'; + + if (mygetfields(fields, NFIELDS + 1, buf, ";") != NFIELDS) + fatal("bad number of fields"); + + return 1; +} + +static int +getcode(char *s) +{ + int i, code; + + code = 0; + i = 0; + /* Parse a hex number */ + while(s[i]) { + code <<= 4; + if(s[i] >= '0' && s[i] <= '9') + code += s[i] - '0'; + else if(s[i] >= 'A' && s[i] <= 'F') + code += s[i] - 'A' + 10; + else + fatal("bad code char '%c'", s[i]); + i++; + } + return code; +} + +static void +fatal(const char *fmt, ...) +{ + va_list arg; + + fprintf(stderr, "mkunicode: fatal error: "); + va_start(arg, fmt); + vfprintf(stderr, fmt, arg); + va_end(arg); + fprintf(stderr, "\n"); + + exit(1); +} \ No newline at end of file