diff tools/mkunicode/src/mkunicode.c @ 352:7fe8d4094983

Utf8: - Fix invalid decoding from UTF-8 to UTF-32 - Add all files
author David Demelier <markand@malikania.fr>
date Wed, 08 Apr 2015 12:33:45 +0200
parents
children b78d6d8f2872
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mkunicode/src/mkunicode.c	Wed Apr 08 12:33:45 2015 +0200
@@ -0,0 +1,706 @@
+/*
+ * Tool to create our Unicode.cpp and Unicode.h file.
+ *
+ * Current version: 7.0.0
+ *
+ * Based on mkrunetype from the Go language.
+ *
+ * Adapted to generate C++ code.
+ */
+
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+ * make is(upper|lower|title|space|alpha)rune and
+ * to(upper|lower|title)rune from a UnicodeData.txt file.
+ * these can be found at unicode.org
+ *
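+ * the original tool accepted -c (run a check of the existing runetype
+ * functions vs. those extracted from UnicodeData) and -p (generate
+ * tables for pairs of chars, as well as for ranges and singletons).
+ * this adaptation parses neither option: it takes the path to
+ * UnicodeData.txt as its only argument and never generates pair tables.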
+ *
+ * UnicodeData defines 4 fields of interest:
+ * 1) a category
+ * 2) an upper case mapping
+ * 3) a lower case mapping
+ * 4) a title case mapping
+ *
+ * toupper, tolower, and totitle are defined directly from the mapping.
+ *
+ * isalpharune(c) is true iff c is a "letter" category
+ * isupperrune(c) is true iff c is the target of toupperrune,
+ *	or is in the uppercase letter category
+ * similarly for islowerrune and istitlerune.
+ * isspacerune is true for space category chars, "C" locale white space chars,
+ *	and two additions:
+ *	0085	"next line" control char
+ *	feff	"zero-width non-break space"
+ * isdigitrune is true iff c is a numeric-digit category.
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "utf.h"
+#include "utfdef.h"
+
+#define nelem(x) (sizeof(x)/sizeof((x)[0]))
+
+enum {
+	/*
+	 * indices of the ';'-separated fields of a UnicodeData.txt line
+	 */
+	FIELD_CODE,
+	FIELD_NAME,
+	FIELD_CATEGORY,
+	FIELD_COMBINING,
+	FIELD_BIDIR,
+	FIELD_DECOMP,
+	FIELD_DECIMAL_DIG,
+	FIELD_DIG,
+	FIELD_NUMERIC_VAL,
+	FIELD_MIRRORED,
+	FIELD_UNICODE_1_NAME,
+	FIELD_COMMENT,
+	FIELD_UPPER,
+	FIELD_LOWER,
+	FIELD_TITLE,
+	NFIELDS,
+	/* size of the buffer each input line is read into */
+	MAX_LINE	= 1024,
+	/* bias added by TO_DELTA and subtracted again by the generated to*() code */
+	TO_OFFSET	= 1 << 20,
+	/* number of entries in each property and mapping table */
+	NRUNES		= 1 << 21,
+};
+/* biased difference between a mapped code point and its source */
+#define TO_DELTA(xmapped,x)	(TO_OFFSET + (xmapped) - (x))
+/* property flags, indexed by code point: nonzero when the property holds */
+static char	myisspace[NRUNES];
+static char	myisalpha[NRUNES];
+static char	myisdigit[NRUNES];
+static char	myisupper[NRUNES];
+static char	myislower[NRUNES];
+static char	myistitle[NRUNES];
+/* case mappings, indexed by code point; main() starts each one as identity */
+static int	mytoupper[NRUNES];
+static int	mytolower[NRUNES];
+static int	mytotitle[NRUNES];
+/* NOTE(review): check() is declared below but no definition is visible in this file */
+static void	check(void);
+static void	mktables(char *src, int usepairs);
+static void	fatal(const char *fmt, ...);
+static int	mygetfields(char **fields, int nfields, char *str, const char *delim);
+static int	getunicodeline(FILE *in, char **fields, char *buf);
+static int	getcode(char *s);
+/* print the command-line synopsis on stderr and exit with status 1. */
+static void
+usage(void)
+{
+	fputs("usage: mkunicode <UnicodeData.txt>\n", stderr);
+	exit(1);
+}
+
+/* read UnicodeData.txt (the only argument), fill the tables, print the C++ code. */
+int
+main(int argc, char *argv[])
+{
+	FILE *in;
+	char buf[MAX_LINE], buf2[MAX_LINE];
+	char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
+	char *p;
+	int i, code, last, upper, lower, title, usepairs;
+
+	usepairs = 0;	/* no option is parsed, so pair tables stay off */
+
+	--argc;
+	++argv;
+	if(argc != 1)
+		usage();
+
+	in = fopen(argv[0], "r");
+	if(in == NULL)
+		fatal("can't open %s", argv[0]);
+
+	/* every code point maps to itself until the data says otherwise */
+	for(i = 0; i < NRUNES; i++){
+		mytoupper[i] = i;
+		mytolower[i] = i;
+		mytotitle[i] = i;
+	}
+
+	/* make sure isspace has all of the "C" locale whitespace chars */
+	myisspace['\t'] = 1;
+	myisspace['\n'] = 1;
+	myisspace['\r'] = 1;
+	myisspace['\f'] = 1;
+	myisspace['\v'] = 1;
+
+	/* a couple of other exceptions */
+	myisspace[0x85] = 1;	/* control char, "next line" */
+	myisspace[0xfeff] = 1;	/* zero-width non-break space */
+
+	last = -1;
+	while(getunicodeline(in, fields, buf)){
+		code = getcode(fields[FIELD_CODE]);
+		if(code >= NRUNES)
+			fatal("code-point value too big: %x", code);
+		if(code <= last)
+			fatal("bad code sequence: %x then %x", last, code);
+		last = code;
+
+		/* check for ranges: a "First" line is paired with a "Last" line */
+		p = fields[FIELD_CATEGORY];
+		if(strstr(fields[FIELD_NAME], ", First>") != NULL){
+			if(!getunicodeline(in, fields2, buf2))
+				fatal("range start at eof");
+			if(strstr(fields2[FIELD_NAME], ", Last>") == NULL)
+				fatal("range start not followed by range end");
+			last = getcode(fields2[FIELD_CODE]);
+			if(last <= code)
+				fatal("range out of sequence: %x then %x", code, last);
+			/* the range end indexes the tables as well: bound it */
+			if(last >= NRUNES)
+				fatal("code-point value too big: %x", last);
+			if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
+				fatal("range with mismatched category");
+		}
+
+		/*
+		 * parse the case mappings once per line; -1 means an empty
+		 * field.  targets index the tables below, so bound them too.
+		 */
+		upper = lower = title = -1;
+		if(fields[FIELD_UPPER][0] != '\0')
+			upper = getcode(fields[FIELD_UPPER]);
+		if(fields[FIELD_LOWER][0] != '\0')
+			lower = getcode(fields[FIELD_LOWER]);
+		if(fields[FIELD_TITLE][0] != '\0')
+			title = getcode(fields[FIELD_TITLE]);
+		if(upper >= NRUNES || lower >= NRUNES || title >= NRUNES)
+			fatal("case mapping too big for code point %x", code);
+
+		/* set properties and conversions */
+		for(; code <= last; code++){
+			if(p[0] == 'L')
+				myisalpha[code] = 1;
+			if(p[0] == 'Z')
+				myisspace[code] = 1;
+			if(strcmp(p, "Lu") == 0)
+				myisupper[code] = 1;
+			if(strcmp(p, "Ll") == 0)
+				myislower[code] = 1;
+			if(strcmp(p, "Lt") == 0)
+				myistitle[code] = 1;
+			if(strcmp(p, "Nd") == 0)
+				myisdigit[code] = 1;
+
+			if(upper >= 0)
+				mytoupper[code] = upper;
+			if(lower >= 0)
+				mytolower[code] = lower;
+			if(title >= 0)
+				mytotitle[code] = title;
+		}
+	}
+
+	fclose(in);
+
+	/*
+	 * check for codes with no totitle mapping but a toupper mapping.
+	 * these appear in UnicodeData-2.0.14.txt, but are almost certainly
+	 * erroneous.
+	 */
+	for(i = 0; i < NRUNES; i++){
+		if(mytotitle[i] == i
+		&& mytoupper[i] != i
+		&& !myistitle[i])
+			fprintf(stderr, "warning: code=%.4x not istitle, totitle is same, toupper=%.4x\n", i, mytoupper[i]);
+	}
+
+	/*
+	 * make sure isupper[c] is true if for some x toupper[x]  == c
+	 * ditto for islower and istitle.  some chars, like "III" (0x2162),
+	 * aren't defined as letters but have a lower case mapping
+	 * ("iii" (0x2172)), so the categories alone are not enough.
+	 */
+	for(i = 0; i < NRUNES; i++) {
+		if(mytoupper[i] != i)
+			myisupper[mytoupper[i]] = 1;
+		if(mytolower[i] != i)
+			myislower[mytolower[i]] = 1;
+		if(mytotitle[i] != i)
+			myistitle[mytotitle[i]] = 1;
+	}
+
+	mktables(argv[0], usepairs);
+	if(fflush(stdout) != 0 || ferror(stdout))
+		fatal("can't write the generated tables");
+	return 0;
+}
+
+/*
+ * emit the is<label>r table: one "first, last" row per run of
+ * consecutive code points set in prop, clearing each run it emits.
+ * a run of a single code point is only emitted (and cleared) when
+ * force is set.  returns 1 if a table was printed, else 0.
+ */
+static int
+mkisrange(const char* label, char* prop, int force)
+{
+	int first, next, emitted;
+
+	emitted = 0;
+	first = 0;
+	while(first < NRUNES){
+		if(!prop[first]){
+			first++;
+			continue;
+		}
+
+		/* extend the run, clearing every member after the first */
+		next = first + 1;
+		while(next < NRUNES && prop[next]){
+			prop[next] = 0;
+			next++;
+		}
+
+		if(force || next - first > 1){
+			if(!emitted)
+				printf("static char32_t is%sr[] = {\n", label);
+			emitted = 1;
+			prop[first] = 0;
+			printf("\t0x%.4x, 0x%.4x,\n", first, next - 1);
+		}
+		first = next;
+	}
+
+	if(emitted)
+		printf("};\n\n");
+	return emitted;
+}
+
+/*
+ * emit the is<label>p table: one "first, last" row per run of set code
+ * points spaced two apart, clearing each run it emits from prop.
+ * returns 1 if a table was printed, else 0.
+ */
+static int
+mkispair(const char *label, char *prop)
+{
+	int first, next, emitted;
+
+	emitted = 0;
+	first = 0;
+	while(first + 2 < NRUNES){
+		if(!prop[first]){
+			first++;
+			continue;
+		}
+
+		/* walk every other code point, clearing each member found */
+		next = first + 2;
+		while(next < NRUNES && prop[next]){
+			prop[next] = 0;
+			next += 2;
+		}
+		if(next - first > 2){
+			if(!emitted)
+				printf("static char32_t is%sp[] = {\n", label);
+			emitted = 1;
+			prop[first] = 0;
+			printf("\t0x%.4x, 0x%.4x,\n", first, next - 2);
+		}
+		first = next;
+	}
+	if(emitted)
+		printf("};\n\n");
+	return emitted;
+}
+
+/*
+ * emit the is<label>s table: one row per code point still set in prop,
+ * clearing each as it goes.  returns 1 if a table was printed, else 0.
+ */
+static int
+mkissingle(const char *label, char *prop)
+{
+	int c, emitted;
+
+	emitted = 0;
+	for(c = 0; c < NRUNES; c++){
+		if(prop[c]){
+			if(!emitted){
+				printf("static char32_t is%ss[] = {\n", label);
+				emitted = 1;
+			}
+			prop[c] = 0;
+			printf("\t0x%.4x,\n", c);
+		}
+	}
+
+	if(emitted)
+		printf("};\n\n");
+	return emitted;
+}
+
+/*
+ * generate tables and a function for is<label>rune: the range table,
+ * the pair table when usepairs is set, then the singleton table,
+ * followed by Unicode::is<label>() which searches only the tables
+ * that were actually emitted.
+ */
+static void
+mkis(const char* label, char* prop, int usepairs)
+{
+	int ranges, pairs, singles;
+
+	/* order matters: each pass clears what it covered from prop */
+	ranges = mkisrange(label, prop, 0);
+	pairs = usepairs ? mkispair(label, prop) : 0;
+	singles = mkissingle(label, prop);
+
+	printf(
+		"bool Unicode::is%s(char32_t c) noexcept\n"
+		"{\n"
+		"	char32_t *p;\n"
+		"\n",
+		label);
+
+	if(ranges)
+		printf(
+			"	p = rbsearch(c, is%sr, nelem (is%sr)/2, 2);\n"
+			"	if (p && c >= p[0] && c <= p[1])\n"
+			"		return true;\n",
+			label, label);
+
+	if(pairs)
+		printf(
+			"\n	p = rbsearch(c, is%sp, nelem (is%sp)/2, 2);\n"
+			"	if (p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
+			"		return true;\n",
+			label, label);
+
+	if(singles)
+		printf(
+			"\n	p = rbsearch(c, is%ss, nelem (is%ss), 1);\n"
+			"	if (p && c == p[0])\n"
+			"		return true;\n",
+			label, label);
+
+	fputs(
+		"\n	return false;\n"
+		"}\n\n",
+		stdout);
+}
+
+/*
+ * generate a mapping array for ranges, clearing those entries covered.
+ * each row is "first, last, delta" for a run of consecutive code points
+ * that share one biased delta (see TO_DELTA).  if force, generate
+ * one-entry ranges for singletons.  returns 1 if a table was printed.
+ */
+static int
+mktorange(const char* label, int* map, int force)
+{
+	int start, stop, delta, some;
+
+	some = 0;
+	for(start = 0; start < NRUNES; start = stop){
+		stop = start + 1;
+		if(map[start] == start)
+			continue;
+
+		delta = TO_DELTA(map[start], start);
+		if(delta != (Rune)delta)
+			fatal("bad map delta %d", delta);
+		/* extend the run while the delta holds, resetting what it covers */
+		for(; stop < NRUNES; stop++){
+			if(TO_DELTA(map[stop], stop) != delta)
+				break;
+			map[stop] = stop;
+		}
+		if(force || stop != start + 1){
+			if(!some){
+				/* static, like every other generated table */
+				printf("static char32_t to%sr[] = {\n", label);
+				some = 1;
+			}
+			map[start] = start;
+			printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 1, delta);
+		}
+	}
+	if(some)
+		printf("};\n\n");
+	return some;
+}
+
+/*
+ * emit the to<label>p table: one "first, last, delta" row per run of
+ * code points two apart that share one biased delta (see TO_DELTA).
+ * covered entries are reset to identity.  returns 1 if a table was
+ * printed, else 0.
+ */
+static int
+mktopair(const char* label, int* map)
+{
+	int first, next, bias, emitted;
+
+	emitted = 0;
+	first = 0;
+	while(first + 2 < NRUNES){
+		if(map[first] == first){
+			first++;
+			continue;
+		}
+
+		bias = TO_DELTA(map[first], first);
+		if(bias != (Rune)bias)
+			fatal("bad map delta %d", bias);
+		next = first + 2;
+		while(next < NRUNES && TO_DELTA(map[next], next) == bias){
+			map[next] = next;
+			next += 2;
+		}
+		if(next - first > 2){
+			if(!emitted)
+				printf("static char32_t to%sp[] = {\n", label);
+			emitted = 1;
+			map[first] = first;
+			printf("\t0x%.4x, 0x%.4x, %d,\n", first, next - 2, bias);
+		}
+		first = next;
+	}
+	if(emitted)
+		printf("};\n\n");
+	return emitted;
+}
+
+/*
+ * emit the to<label>s table: one "code, delta" row per entry of map
+ * that is still not the identity, resetting each one as it is emitted.
+ * returns 1 if a table was printed, else 0.
+ */
+static int
+mktosingle(const char* label, int* map)
+{
+	int c, bias, emitted;
+
+	emitted = 0;
+	for(c = 0; c < NRUNES; c++){
+		if(map[c] != c){
+			bias = TO_DELTA(map[c], c);
+			if(bias != (Rune)bias)
+				fatal("bad map delta %d", bias);
+			if(!emitted)
+				printf("static char32_t to%ss[] = {\n", label);
+			emitted = 1;
+			map[c] = c;
+			printf("\t0x%.4x, %d,\n", c, bias);
+		}
+	}
+
+	if(emitted)
+		printf("};\n\n");
+	return emitted;
+}
+
+/*
+ * generate tables and a function for to<label>rune: range, pair (only
+ * when usepairs is set) and singleton tables, then Unicode::to<label>()
+ * which searches the emitted tables and removes the TO_OFFSET bias.
+ */
+static void
+mkto(const char* label, int* map, int usepairs)
+{
+	int ranges, pairs, singles;
+
+	/* order matters: each pass resets what it covered in map */
+	ranges = mktorange(label, map, 0);
+	pairs = usepairs ? mktopair(label, map) : 0;
+	singles = mktosingle(label, map);
+
+	printf(
+		"char32_t Unicode::to%s(char32_t c) noexcept\n"
+		"{\n"
+		"	char32_t *p;\n"
+		"\n",
+		label);
+
+	if(ranges)
+		printf(
+			"	p = rbsearch(c, to%sr, nelem (to%sr)/3, 3);\n"
+			"	if (p && c >= p[0] && c <= p[1])\n"
+			"		return c + p[2] - %d;\n",
+			label, label, TO_OFFSET);
+
+	if(pairs)
+		printf(
+			"\n	p = rbsearch(c, to%sp, nelem (to%sp)/3, 3);\n"
+			"	if (p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
+			"		return c + p[2] - %d;\n",
+			label, label, TO_OFFSET);
+
+	if(singles)
+		printf(
+			"\n	p = rbsearch(c, to%ss, nelem (to%ss)/2, 2);\n"
+			"	if (p && c == p[0])\n"
+			"		return c + p[1] - %d;\n",
+			label, label, TO_OFFSET);
+
+	fputs(
+		"\n	return c;\n"
+		"}\n\n",
+		stdout);
+}
+
+// Make only range tables (singletons forced into one-entry ranges)
+// and a function for is<label>rune.
+static void
+mkisronly(const char* label, char* prop)
+{
+	mkisrange(label, prop, 1);
+
+	printf("bool Unicode::is%s(char32_t c) noexcept\n", label);
+	printf(
+		"{\n"
+		"	char32_t *p;\n\n"
+		"	p = rbsearch(c, is%sr, nelem (is%sr)/2, 2);\n"
+		"	if (p && c >= p[0] && c <= p[1])\n"
+		"		return true;\n\n"
+		"	return false;\n"
+		"}\n\n",
+		label, label);
+}
+
+/*
+ * generate the whole output: the nelem macro, the rbsearch helper that
+ * the generated lookup functions call, then every is* and to* table
+ * together with its function.  src is accepted but not used.
+ */
+static void
+mktables(char *src, int usepairs)
+{
+	/* Add nelem macro */
+	fputs("#define nelem(x) (sizeof (x) / sizeof ((x)[0]))\n\n", stdout);
+
+	/* Add the rbsearch function */
+	fputs(
+		"char32_t *rbsearch(char32_t c, char32_t *t, int n, int ne) noexcept\n"
+		"{\n"
+		"	char32_t *p;\n"
+		"	int m;\n\n"
+		"	while (n > 1) {\n"
+		"		m = n >> 1;\n"
+		"		p = t + m * ne;\n\n"
+		"		if (c >= p[0]) {\n"
+		"			t = p;\n"
+		"			n = n - m;\n"
+		"		} else {\n"
+		"			n = m;\n"
+		"		}\n"
+		"	}\n\n"
+		"	if (n && c >= t[0])\n"
+		"		return t;\n\n"
+		"	return nullptr;\n"
+		"}\n\n",
+		stdout);
+
+	/*
+	 * we special case the space and digit tables, since they are assumed
+	 * to be small with several ranges.
+	 */
+	mkisronly("space", myisspace);
+	mkisronly("digit", myisdigit);
+
+	/* the alpha tables are always generated without pairs */
+	mkis("alpha", myisalpha, 0);
+	mkis("upper", myisupper, usepairs);
+	mkis("lower", myislower, usepairs);
+	mkis("title", myistitle, usepairs);
+
+	mkto("upper", mytoupper, usepairs);
+	mkto("lower", mytolower, usepairs);
+	mkto("title", mytotitle, usepairs);
+}
+
+/*
+ * split str in place at each character of delim, storing the start of
+ * every piece in fields; stops at nfields pieces.  returns the count.
+ */
+static int
+mygetfields(char **fields, int nfields, char *str, const char *delim)
+{
+	int count = 0;
+	char *cursor = str;
+
+	fields[count++] = cursor;
+	while(count < nfields && *cursor != '\0'){
+		if(strchr(delim, *cursor) != NULL){
+			*cursor = '\0';
+			fields[count++] = cursor + 1;
+		}
+		cursor++;
+	}
+	return count;
+}
+
+/*
+ * read the next line of in into buf and split it at ';' into fields.
+ * returns 0 at end of file, 1 otherwise; a line that does not fit in
+ * MAX_LINE bytes or does not have exactly NFIELDS fields is fatal.
+ */
+static int
+getunicodeline(FILE *in, char **fields, char *buf)
+{
+	if(fgets(buf, MAX_LINE, in) == NULL)
+		return 0;
+	/* only a final line may lack its newline; anything else was cut short */
+	if(strchr(buf, '\n') == NULL && !feof(in))
+		fatal("line too long");
+	buf[strcspn(buf, "\n")] = '\0';
+	if(mygetfields(fields, NFIELDS + 1, buf, ";") != NFIELDS)
+		fatal("bad number of fields");
+	return 1;
+}
+
+/* parse s as upper-case hex; empty, malformed or oversized input is fatal. */
+static int
+getcode(char *s)
+{
+	const char *hex = "0123456789ABCDEF", *d;
+	int code;
+
+	if(*s == '\0')
+		fatal("empty code");
+	for(code = 0; *s != '\0'; s++){
+		d = strchr(hex, *s);
+		if(d == NULL)
+			fatal("bad code char '%c'", *s);
+		/* stop before code * 16 could overflow an int */
+		if(code >= NRUNES)
+			fatal("code value too big");
+		code = code * 16 + (int)(d - hex);
+	}
+	return code;
+}
+
+/* report a printf-style message on stderr and exit with status 1. */
+static void
+fatal(const char *fmt, ...)
+{
+	va_list ap;
+
+	fputs("mkunicode: fatal error: ", stderr);
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	fputc('\n', stderr);
+	exit(1);
+}
\ No newline at end of file