diff gen/mkutf.awk @ 20:496cd52a50ec

unicode: switch to sbase's mkrunetype.awk. While here, update to the latest version of UnicodeData.txt and add the uni_iscontrol function as well.
author David Demelier <markand@malikania.fr>
date Mon, 21 Mar 2022 09:00:42 +0100
parents
children 4983392b356f
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gen/mkutf.awk	Mon Mar 21 09:00:42 2022 +0100
@@ -0,0 +1,284 @@
+#
+# This file comes from sbase (https://git.suckless.org/sbase/file/libutf/Makefile.html)
+# and has been modified to match libunicode's API.
+#
+# The original license is as follows:
+#
+# MIT/X Consortium License
+#
+# © 2011 Connor Lane Smith <cls@lubutu.com>
+# © 2011-2016 Dimitris Papastamos <sin@2f30.org>
+# © 2014-2016 Laslo Hunhold <dev@frign.de>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+# Authors/contributors include:
+#
+# © 2011 Kamil Cholewiński <harry666t@gmail.com>
+# © 2011 Rob Pilling <robpilling@gmail.com>
+# © 2011 Hiltjo Posthuma <hiltjo@codemadness.org>
+# © 2011 pancake <pancake@youterm.com>
+# © 2011 Random832 <random832@fastmail.us>
+# © 2012 William Haddon <william@haddonthethird.net>
+# © 2012 Kurt H. Maier <khm@sciops.net>
+# © 2012 Christoph Lohmann <20h@r-36.net>
+# © 2012 David Galos <galosd83@students.rowan.edu>
+# © 2012 Robert Ransom <rransom.8774@gmail.com>
+# © 2013 Jakob Kramer <jakob.kramer@gmx.de>
+# © 2013 Anselm R Garbe <anselm@garbe.us>
+# © 2013 Truls Becken <truls.becken@gmail.com>
+# © 2013 dsp <dsp@2f30.org>
+# © 2013 Markus Teich <markus.teich@stusta.mhn.de>
+# © 2013 Jesse Ogle <jesse.p.ogle@gmail.com>
+# © 2013 Lorenzo Cogotti <miciamail@hotmail.it>
+# © 2013 Federico G. Benavento <benavento@gmail.com>
+# © 2013 Roberto E. Vargas Caballero <k0ga@shike2.com>
+# © 2013 Christian Hesse <mail@eworm.de>
+# © 2013 Markus Wichmann <nullplan@gmx.net>
+# © 2014 Silvan Jegen <s.jegen@gmail.com>
+# © 2014 Daniel Bainton <dpb@driftaway.org>
+# © 2014 Tuukka Kataja <stuge@xor.fi>
+# © 2014 Jeffrey Picard <jeff@jeffreypicard.com>
+# © 2014 Evan Gates <evan.gates@gmail.com>
+# © 2014 Michael Forney <mforney@mforney.org>
+# © 2014 Ari Malinen <ari.malinen@gmail.com>
+# © 2014 Brandon Mulcahy <brandon@jangler.info>
+# © 2014 Adria Garriga <rhaps0dy@installgentoo.com>
+# © 2014-2015 Greg Reagle <greg.reagle@umbc.edu>
+# © 2015 Tai Chi Minh Ralph Eastwood <tcmreastwood@gmail.com>
+# © 2015 Quentin Rameau <quinq@quinq.eu.org>
+# © 2015 Dionysis Grigoropoulos <info@erethon.com>
+# © 2015 Wolfgang Corcoran-Mathe <wcm@sigwinch.xyz>
+# © 2016 Mattias Andrée <maandree@kth.se>
+# © 2016 Eivind Uggedal <eivind@uggedal.com>
+#
+
+# Runs once before any input is read: configure field splitting and build
+# the table used by code() to convert hexadecimal digits.
+BEGIN {
+	# input fields are separated by semicolons
+	FS = ";";
+
+	# map every uppercase hexadecimal digit ("0".."F") to its numeric value
+	i = 0;
+	while (i < 16) {
+		hex[sprintf("%X", i)] = i;
+		i++;
+	}
+}
+
+# Classify every input record: field 1 (the code point) is appended to the
+# array of each class the record belongs to, based on fields 3 and 5.
+# Each test is independent, so one record may land in several arrays.
+{
+	# field 3 starting with "L"
+	if ($3 ~ /^L/)
+		alphav[alphac++] = $1;
+
+	# field 3 starting with "Z", or field 5 equal to WS, S or B
+	if ($3 ~ /^Z/ || $5 == "WS" || $5 == "S" || $5 == "B")
+		spacev[spacec++] = $1;
+
+	if ($3 == "Cc")
+		cntrlv[cntrlc++] = $1;
+
+	if ($3 == "Lu") {
+		upperv[upperc++] = $1;
+		# field 14 is the mapping; an empty field maps the rune to itself
+		if ($14 == "")
+			tolowerv[uppercc++] = $1;
+		else
+			tolowerv[uppercc++] = $14;
+	}
+
+	if ($3 == "Ll") {
+		lowerv[lowerc++] = $1;
+		# field 13 is the mapping; an empty field maps the rune to itself
+		if ($13 == "")
+			toupperv[lowercc++] = $1;
+		else
+			toupperv[lowercc++] = $13;
+	}
+
+	if ($3 == "Lt")
+		titlev[titlec++] = $1;
+
+	if ($3 == "Nd")
+		digitv[digitc++] = $1;
+}
+
+# Runs once after all input has been read: emit the tables and functions
+# for every collected class, in a fixed order.
+END {
+	# 'nocase' is never assigned anywhere; it is passed as the (empty)
+	# case-mapping array for classes that have no case conversion.
+	mkis("alpha",   alphav, alphac, nocase, "");
+	mkis("space",   spacev, spacec, nocase, "");
+	mkis("control", cntrlv, cntrlc, nocase, "");
+
+	# upper and lower also get a conversion function to the other case
+	mkis("upper",   upperv, upperc, tolowerv, "lower");
+	mkis("lower",   lowerv, lowerc, toupperv, "upper");
+
+	mkis("title",   titlev, titlec, nocase, "");
+	mkis("digit",   digitv, digitc, nocase, "");
+}
+
+# Parse a hexadecimal rune index (a string of uppercase hex digits) to an
+# integer.
+#
+# s - the hexadecimal string; an empty string yields 0
+#
+# Digits are looked up in the global 'hex' table filled in by the BEGIN rule.
+# x, i, c and n are declared as extra parameters so that they are local to
+# the function and do not overwrite globals of the same name; callers pass
+# only 's'.
+function code(s,    x, i, c, n) {
+	x = 0;
+	n = length(s);
+	for(i = 1; i <= n; i++) {
+		c = substr(s, i, 1);
+		x = (x*16) + hex[c];
+	}
+	return x;
+}
+
+# Generate the C lookup tables and the 'uni_is<name>' function for one
+# character class, plus the 'uni_to<casename>' conversion function when a
+# case mapping is supplied.
+#
+# name     - class name used in the generated identifiers
+# runev    - array of hexadecimal code point strings, indexed 0..runec-1
+# runec    - number of entries in runev
+# casev    - array parallel to runev holding the case-mapped code points;
+#            an empty array means the class has no case mapping
+# casename - suffix of the generated conversion function (only used when
+#            casev is not empty)
+#
+# The code points are partitioned into four groups, tracked by 'mode':
+#   1: singletons
+#   2: ranges of consecutive code points (case mappings consecutive too)
+#   3: lace 1, every second code point, case mapping is rune + 1
+#   4: lace 2, every second code point, case mapping is rune - 1
+#
+# For groups 2-4, rune<N>v0[k] is the first and rune<N>v1[k] the last rune
+# of run k. All working variables (mode, j, rune<N>c, rune<N>v*, case<N>v)
+# are globals; the counters are reset at the start of every call.
+function mkis(name, runev, runec, casev, casename) {
+	rune1c = 0;
+	rune2c = 0;
+	rune3c = 0;
+	rune4c = 0;
+	mode = 1;
+
+	# sort rune groups into singletons, ranges and laces
+	for(j = 0; j < runec; j++) {
+		# range
+		# (the bound check comes first so that runev[j+1] is never
+		# looked up past the end of the array)
+		if(j+1 < runec && code(runev[j+1]) == code(runev[j])+1 &&
+		   ((length(casev) == 0) || code(casev[j+1]) == code(casev[j])+1)) {
+			if (mode == 2) {
+				continue;
+			} else if (mode == 3) {
+				# close the running lace 1 before opening the range
+				rune3v1[rune3c] = runev[j];
+				rune3c++;
+			} else if (mode == 4) {
+				# close the running lace 2 before opening the range
+				rune4v1[rune4c] = runev[j];
+				rune4c++;
+			}
+			mode = 2;
+			rune2v0[rune2c] = runev[j];
+			if(length(casev) > 0) {
+				case2v[rune2c] = casev[j];
+			}
+			continue;
+		}
+		# lace 1
+		if(j+1 < runec && code(runev[j+1]) == code(runev[j])+2 &&
+		   ((length(casev) == 0) ||
+		    (code(casev[j+1]) == code(runev[j+1])+1 && code(casev[j]) == code(runev[j])+1))) {
+			if (mode == 3) {
+				continue;
+			} else if (mode == 2) {
+				rune2v1[rune2c] = runev[j];
+				rune2c++;
+			} else if (mode == 4) {
+				# close the running lace 2; the slot must be indexed
+				# by the lace 2 counter that is incremented below
+				rune4v1[rune4c] = runev[j];
+				rune4c++;
+			}
+			mode = 3;
+			rune3v0[rune3c] = runev[j];
+			continue;
+		}
+		# lace 2
+		if(j+1 < runec && code(runev[j+1]) == code(runev[j])+2 &&
+		   ((length(casev) == 0) ||
+		    (code(casev[j+1]) == code(runev[j+1])-1 && code(casev[j]) == code(runev[j])-1))) {
+			if (mode == 4) {
+				continue;
+			} else if (mode == 2) {
+				rune2v1[rune2c] = runev[j];
+				rune2c++;
+			} else if (mode == 3) {
+				# close the running lace 1; the slot must be indexed
+				# by the lace 1 counter that is incremented below
+				rune3v1[rune3c] = runev[j];
+				rune3c++;
+			}
+			mode = 4;
+			rune4v0[rune4c] = runev[j];
+			continue;
+		}
+		# terminating case: the current rune ends the running group, or
+		# is a singleton when no group is open
+		if (mode == 1) {
+			rune1v[rune1c] = runev[j];
+			if (length(casev) > 0) {
+				case1v[rune1c] = casev[j];
+			}
+			rune1c++;
+		} else if (mode == 2) {
+			rune2v1[rune2c] = runev[j];
+			rune2c++;
+		} else if (mode == 3) {
+			rune3v1[rune3c] = runev[j];
+			rune3c++;
+		} else { #lace 2
+			rune4v1[rune4c] = runev[j];
+			rune4c++;
+		}
+		mode = 1;
+	}
+
+	# generate list of laces 1
+	if(rune3c > 0) {
+		print "static uint32_t "name"3[][2] = {";
+		for(j = 0; j < rune3c; j++) {
+			print "\t{ 0x"rune3v0[j]", 0x"rune3v1[j]" },";
+		}
+		print "};\n";
+	}
+
+	# generate list of laces 2
+	if(rune4c > 0) {
+		print "static uint32_t "name"4[][2] = {";
+		for(j = 0; j < rune4c; j++) {
+			print "\t{ 0x"rune4v0[j]", 0x"rune4v1[j]" },";
+		}
+		print "};\n";
+	}
+
+	# generate list of ranges
+	# (with a case mapping, a third column holds the mapping of the
+	# first rune of the range)
+	if(rune2c > 0) {
+		if(length(casev) > 0) {
+			print "static uint32_t "name"2[][3] = {";
+			for(j = 0; j < rune2c; j++) {
+				print "\t{ 0x"rune2v0[j]", 0x"rune2v1[j]", 0x"case2v[j]" },";
+			}
+		} else {
+			print "static uint32_t "name"2[][2] = {";
+			for(j = 0; j < rune2c; j++) {
+				print "\t{ 0x"rune2v0[j]", 0x"rune2v1[j]" },";
+			}
+		}
+		print "};\n";
+	}
+
+	# generate list of singletons
+	# (with a case mapping, each entry is a { rune, mapping } pair)
+	if(rune1c > 0) {
+		if(length(casev) > 0) {
+			print "static uint32_t "name"1[][2] = {";
+			for(j = 0; j < rune1c; j++) {
+				print "\t{ 0x"rune1v[j]", 0x"case1v[j]" },";
+			}
+		} else {
+			print "static uint32_t "name"1[] = {";
+			for(j = 0; j < rune1c; j++) {
+				print "\t0x"rune1v[j]",";
+			}
+		}
+		print "};\n";
+	}
+
+	# generate lookup function
+	# (inside a lace only every second rune, counted from the start of
+	# the lace, is a member)
+	print "int\nuni_is"name"(uint32_t r)\n{";
+	if(rune4c > 0 || rune3c > 0)
+		print "\tconst uint32_t *match;\n";
+	if(rune4c > 0) {
+		print "\tif ((match = bsearch(&r, "name"4, nelem("name"4), sizeof *"name"4, &cmp2)))";
+		print "\t\treturn !((r - match[0]) % 2);";
+	}
+	if(rune3c > 0) {
+		print "\tif ((match = bsearch(&r, "name"3, nelem("name"3), sizeof *"name"3, &cmp2)))";
+		print "\t\treturn !((r - match[0]) % 2);";
+	}
+	if(rune2c > 0) {
+		print "\tif (bsearch(&r, "name"2, nelem("name"2), sizeof *"name"2, &cmp2))\n\t\treturn 1;";
+	}
+	if(rune1c > 0) {
+		print "\tif (bsearch(&r, "name"1, nelem("name"1), sizeof *"name"1, &cmp1))\n\t\treturn 1;";
+	}
+	print "\n\treturn 0;\n}\n";
+
+	# generate case conversion function
+	if(length(casev) > 0) {
+		print "\nuint32_t\nuni_to"casename"(uint32_t r)\n{\n\tuint32_t *match;\n";
+		if(rune4c > 0) {
+			# lace 2: members map to rune - 1
+			print "\tmatch = bsearch(&r, "name"4, nelem("name"4), sizeof *"name"4, &cmp2);";
+			print "\tif (match)";
+			print "\t\treturn ((r - match[0]) % 2) ? r : r - 1;";
+		}
+		if(rune3c > 0) {
+			# lace 1: members map to rune + 1
+			print "\tmatch = bsearch(&r, "name"3, nelem("name"3), sizeof *"name"3, &cmp2);";
+			print "\tif (match)";
+			print "\t\treturn ((r - match[0]) % 2) ? r : r + 1;";
+		}
+		if(rune2c > 0) {
+			# range: offset from the mapping of the first rune
+			print "\tmatch = bsearch(&r, "name"2, nelem("name"2), sizeof *"name"2, &cmp2);";
+			print "\tif (match)";
+			print "\t\treturn match[2] + (r - match[0]);";
+		}
+		if(rune1c > 0) {
+			print "\tmatch = bsearch(&r, "name"1, nelem("name"1), sizeof *"name"1, &cmp1);";
+			print "\tif (match)";
+			print "\t\treturn match[1];";
+		}
+		print "\treturn r;\n}";
+	}
+}