changeset 23:4983392b356f

unicode: add totitle conversions
author David Demelier <markand@malikania.fr>
date Wed, 23 Mar 2022 11:46:41 +0100
parents 9cfc559a57c2
children 23ceab03a393
files gen/mkutf.awk tests/test-unicode.c unicode.c
diffstat 3 files changed, 50 insertions(+), 23 deletions(-) [+]
line wrap: on
line diff
--- a/gen/mkutf.awk	Tue Mar 22 20:28:17 2022 +0100
+++ b/gen/mkutf.awk	Wed Mar 23 11:46:41 2022 +0100
@@ -81,7 +81,7 @@
 $3 == "Cc" { cntrlv[cntrlc++] = $1; }
 $3 == "Lu" { upperv[upperc++] = $1; tolowerv[uppercc++] = ($14 == "") ? $1 : $14; }
 $3 == "Ll" { lowerv[lowerc++] = $1; toupperv[lowercc++] = ($13 == "") ? $1 : $13; }
-$3 == "Lt" { titlev[titlec++] = $1; }
+$3 == "Lt" { titlev[titlec++] = $1; totitlev[lotitlecc++] = ($13 == "") ? $1 : $13;} # NOTE(review): $13 is the field toupperv reads (uppercase mapping), so totitlev pairs each Lt letter with its uppercase form; the titlecase mapping is presumably $15 -- confirm which one is intended
 $3 == "Nd" { digitv[digitc++] = $1; }
 
 END {
@@ -90,7 +90,7 @@
 	mkis("control", cntrlv, cntrlc, q, "");
 	mkis("upper", upperv, upperc, tolowerv, "lower");
 	mkis("lower", lowerv, lowerc, toupperv, "upper");
-	mkis("title", titlev, titlec, q, "");
+	mkis("title", titlev, titlec, totitlev, "title");
 	mkis("digit", digitv, digitc, q, "");
 }
 
@@ -104,7 +104,7 @@
 	return x;
 }
 
-# generate 'is<name>rune' unicode lookup function
+# generate 'uni_is<name>' unicode lookup function
 function mkis(name, runev, runec, casev, casename) {
 	rune1c = 0;
 	rune2c = 0;
@@ -258,27 +258,27 @@
 
 	# generate case conversion function
 	if(length(casev) > 0) {
-		print "\nuint32_t\nuni_to"casename"(uint32_t r)\n{\n\tuint32_t *match;\n";
+		print "uint32_t\nuni_to"casename"(uint32_t r)\n{\n\tuint32_t *match;\n";
 		if(rune4c > 0) {
-			print "\tmatch = bsearch(&r, "name"4, nelem("name"4), sizeof *"name"4, &cmp2);";
+			print "\tmatch = bsearch(&r, "name"4, nelem("name"4), sizeof *"name"4, &cmp2);\n";
 			print "\tif (match)";
 			print "\t\treturn ((r - match[0]) % 2) ? r : r - 1;";
 		}
 		if(rune3c > 0) {
-			print "\tmatch = bsearch(&r, "name"3, nelem("name"3), sizeof *"name"3, &cmp2);";
+			print "\tmatch = bsearch(&r, "name"3, nelem("name"3), sizeof *"name"3, &cmp2);\n";
 			print "\tif (match)";
 			print "\t\treturn ((r - match[0]) % 2) ? r : r + 1;";
 		}
 		if(rune2c > 0) {
-			print "\tmatch = bsearch(&r, "name"2, nelem("name"2), sizeof *"name"2, &cmp2);";
+			print "\tmatch = bsearch(&r, "name"2, nelem("name"2), sizeof *"name"2, &cmp2);\n";
 			print "\tif (match)";
 			print "\t\treturn match[2] + (r - match[0]);";
 		}
 		if(rune1c > 0) {
-			print "\tmatch = bsearch(&r, "name"1, nelem("name"1), sizeof *"name"1, &cmp1);";
+			print "\tmatch = bsearch(&r, "name"1, nelem("name"1), sizeof *"name"1, &cmp1);\n";
 			print "\tif (match)";
 			print "\t\treturn match[1];";
 		}
-		print "\treturn r;\n}";
+		print "\treturn r;\n}\n";
 	}
 }
--- a/tests/test-unicode.c	Tue Mar 22 20:28:17 2022 +0100
+++ b/tests/test-unicode.c	Wed Mar 23 11:46:41 2022 +0100
@@ -322,6 +322,11 @@
 	RX_REQUIRE(uni_isupper(U'É'));
 }
 
+RX_TEST_CASE(misc, totitle)
+{
+	RX_INT_REQUIRE_EQUAL(uni_totitle(U's'), 'S'); /* lowercase letter -> titlecase */
+}
+
 int
 main(int argc, char **argv)
 {
--- a/unicode.c	Tue Mar 22 20:28:17 2022 +0100
+++ b/unicode.c	Wed Mar 23 11:46:41 2022 +0100
@@ -1141,23 +1141,26 @@
 	return 0;
 }
 
-
 uint32_t
 uni_tolower(uint32_t r)
 {
 	uint32_t *match;
 
 	match = bsearch(&r, upper3, nelem(upper3), sizeof *upper3, &cmp2);
+
 	if (match)
 		return ((r - match[0]) % 2) ? r : r + 1;
 	match = bsearch(&r, upper2, nelem(upper2), sizeof *upper2, &cmp2);
+
 	if (match)
 		return match[2] + (r - match[0]);
 	match = bsearch(&r, upper1, nelem(upper1), sizeof *upper1, &cmp1);
+
 	if (match)
 		return match[1];
 	return r;
 }
+
 static uint32_t lower4[][2] = {
 	{ 0x0101, 0x012F },
 	{ 0x0133, 0x0137 },
@@ -1492,37 +1495,40 @@
 	return 0;
 }
 
-
 uint32_t
 uni_toupper(uint32_t r)
 {
 	uint32_t *match;
 
 	match = bsearch(&r, lower4, nelem(lower4), sizeof *lower4, &cmp2);
+
 	if (match)
 		return ((r - match[0]) % 2) ? r : r - 1;
 	match = bsearch(&r, lower2, nelem(lower2), sizeof *lower2, &cmp2);
+
 	if (match)
 		return match[2] + (r - match[0]);
 	match = bsearch(&r, lower1, nelem(lower1), sizeof *lower1, &cmp1);
+
 	if (match)
 		return match[1];
 	return r;
 }
-static uint32_t title2[][2] = {
-	{ 0x1F88, 0x1F8F },
-	{ 0x1F98, 0x1F9F },
-	{ 0x1FA8, 0x1FAF },
+
+static uint32_t title2[][3] = {
+	{ 0x1F88, 0x1F8F, 0x1F88 },
+	{ 0x1F98, 0x1F9F, 0x1F98 },
+	{ 0x1FA8, 0x1FAF, 0x1FA8 },
 };
 
-static uint32_t title1[] = {
-	0x01C5,
-	0x01C8,
-	0x01CB,
-	0x01F2,
-	0x1FBC,
-	0x1FCC,
-	0x1FFC,
+static uint32_t title1[][2] = {
+	{ 0x01C5, 0x01C4 },
+	{ 0x01C8, 0x01C7 },
+	{ 0x01CB, 0x01CA },
+	{ 0x01F2, 0x01F1 },
+	{ 0x1FBC, 0x1FBC },
+	{ 0x1FCC, 0x1FCC },
+	{ 0x1FFC, 0x1FFC },
 };
 
 int
@@ -1536,6 +1542,43 @@
 	return 0;
 }
 
+/*
+ * Convert the rune r to its simple titlecase form.
+ *
+ * Returns r unchanged when no conversion applies.
+ *
+ * NOTE(review): this file appears to be emitted by gen/mkutf.awk; the
+ * generator has to produce the same logic or it is lost on regeneration.
+ */
+uint32_t
+uni_totitle(uint32_t r)
+{
+	uint32_t up;
+	size_t i;
+
+	/* Titlecase letters already are their own titlecase form. */
+	if (bsearch(&r, title2, nelem(title2), sizeof *title2, &cmp2) ||
+	    bsearch(&r, title1, nelem(title1), sizeof *title1, &cmp1))
+		return r;
+
+	/*
+	 * Georgian Mkhedruli letters may carry an uppercase (Mtavruli)
+	 * mapping but remain their own titlecase form.
+	 */
+	if ((r >= 0x10D0 && r <= 0x10FA) || (r >= 0x10FD && r <= 0x10FF))
+		return r;
+
+	/* Everywhere else titlecase equals uppercase, except the digraphs. */
+	up = uni_toupper(r);
+
+	/* title1[i][1] holds the uppercase form of the Lt letter title1[i][0]. */
+	for (i = 0; i < nelem(title1); ++i)
+		if (title1[i][1] == up)
+			return title1[i][0];
+
+	return up;
+}
+
 static uint32_t digit2[][2] = {
 	{ 0x0030, 0x0039 },
 	{ 0x0660, 0x0669 },