Mercurial > libunicode
changeset 23:4983392b356f
unicode: add totitle conversions
author | David Demelier <markand@malikania.fr> |
---|---|
date | Wed, 23 Mar 2022 11:46:41 +0100 |
parents | 9cfc559a57c2 |
children | 23ceab03a393 |
files | gen/mkutf.awk tests/test-unicode.c unicode.c |
diffstat | 3 files changed, 50 insertions(+), 23 deletions(-) [+] |
line wrap: on
line diff
--- a/gen/mkutf.awk Tue Mar 22 20:28:17 2022 +0100 +++ b/gen/mkutf.awk Wed Mar 23 11:46:41 2022 +0100 @@ -81,7 +81,7 @@ $3 == "Cc" { cntrlv[cntrlc++] = $1; } $3 == "Lu" { upperv[upperc++] = $1; tolowerv[uppercc++] = ($14 == "") ? $1 : $14; } $3 == "Ll" { lowerv[lowerc++] = $1; toupperv[lowercc++] = ($13 == "") ? $1 : $13; } -$3 == "Lt" { titlev[titlec++] = $1; } +$3 == "Lt" { titlev[titlec++] = $1; totitlev[lotitlecc++] = ($13 == "") ? $1 : $13;} $3 == "Nd" { digitv[digitc++] = $1; } END { @@ -90,7 +90,7 @@ mkis("control", cntrlv, cntrlc, q, ""); mkis("upper", upperv, upperc, tolowerv, "lower"); mkis("lower", lowerv, lowerc, toupperv, "upper"); - mkis("title", titlev, titlec, q, ""); + mkis("title", titlev, titlec, totitlev, "title"); mkis("digit", digitv, digitc, q, ""); } @@ -104,7 +104,7 @@ return x; } -# generate 'is<name>rune' unicode lookup function +# generate 'uni_is<name>' unicode lookup function function mkis(name, runev, runec, casev, casename) { rune1c = 0; rune2c = 0; @@ -258,27 +258,27 @@ # generate case conversion function if(length(casev) > 0) { - print "\nuint32_t\nuni_to"casename"(uint32_t r)\n{\n\tuint32_t *match;\n"; + print "uint32_t\nuni_to"casename"(uint32_t r)\n{\n\tuint32_t *match;\n"; if(rune4c > 0) { - print "\tmatch = bsearch(&r, "name"4, nelem("name"4), sizeof *"name"4, &cmp2);"; + print "\tmatch = bsearch(&r, "name"4, nelem("name"4), sizeof *"name"4, &cmp2);\n"; print "\tif (match)"; print "\t\treturn ((r - match[0]) % 2) ? r : r - 1;"; } if(rune3c > 0) { - print "\tmatch = bsearch(&r, "name"3, nelem("name"3), sizeof *"name"3, &cmp2);"; + print "\tmatch = bsearch(&r, "name"3, nelem("name"3), sizeof *"name"3, &cmp2);\n"; print "\tif (match)"; print "\t\treturn ((r - match[0]) % 2) ? r : r + 1;"; } if(rune2c > 0) { - print "\tmatch = bsearch(&r, "name"2, nelem("name"2), sizeof *"name"2, &cmp2);"; + print "\tmatch = bsearch(&r, "name"2, nelem("name"2), sizeof *"name"2, &cmp2);\n"; print "\tif (match)"; print "\t\treturn match[2] + (r - match[0]);"; } if(rune1c > 0) { - print "\tmatch = bsearch(&r, "name"1, nelem("name"1), sizeof *"name"1, &cmp1);"; + print "\tmatch = bsearch(&r, "name"1, nelem("name"1), sizeof *"name"1, &cmp1);\n"; print "\tif (match)"; print "\t\treturn match[1];"; } - print "\treturn r;\n}"; + print "\treturn r;\n}\n"; } }
--- a/tests/test-unicode.c Tue Mar 22 20:28:17 2022 +0100 +++ b/tests/test-unicode.c Wed Mar 23 11:46:41 2022 +0100 @@ -322,6 +322,11 @@ RX_REQUIRE(uni_isupper(U'É')); } +RX_TEST_CASE(misc, toupper) +{ + RX_INT_REQUIRE_EQUAL(uni_totitle(U's'), 'S'); +} + int main(int argc, char **argv) {
--- a/unicode.c Tue Mar 22 20:28:17 2022 +0100 +++ b/unicode.c Wed Mar 23 11:46:41 2022 +0100 @@ -1141,23 +1141,26 @@ return 0; } - uint32_t uni_tolower(uint32_t r) { uint32_t *match; match = bsearch(&r, upper3, nelem(upper3), sizeof *upper3, &cmp2); + if (match) return ((r - match[0]) % 2) ? r : r + 1; match = bsearch(&r, upper2, nelem(upper2), sizeof *upper2, &cmp2); + if (match) return match[2] + (r - match[0]); match = bsearch(&r, upper1, nelem(upper1), sizeof *upper1, &cmp1); + if (match) return match[1]; return r; } + static uint32_t lower4[][2] = { { 0x0101, 0x012F }, { 0x0133, 0x0137 }, @@ -1492,37 +1495,40 @@ return 0; } - uint32_t uni_toupper(uint32_t r) { uint32_t *match; match = bsearch(&r, lower4, nelem(lower4), sizeof *lower4, &cmp2); + if (match) return ((r - match[0]) % 2) ? r : r - 1; match = bsearch(&r, lower2, nelem(lower2), sizeof *lower2, &cmp2); + if (match) return match[2] + (r - match[0]); match = bsearch(&r, lower1, nelem(lower1), sizeof *lower1, &cmp1); + if (match) return match[1]; return r; } -static uint32_t title2[][2] = { - { 0x1F88, 0x1F8F }, - { 0x1F98, 0x1F9F }, - { 0x1FA8, 0x1FAF }, + +static uint32_t title2[][3] = { + { 0x1F88, 0x1F8F, 0x1F88 }, + { 0x1F98, 0x1F9F, 0x1F98 }, + { 0x1FA8, 0x1FAF, 0x1FA8 }, }; -static uint32_t title1[] = { - 0x01C5, - 0x01C8, - 0x01CB, - 0x01F2, - 0x1FBC, - 0x1FCC, - 0x1FFC, +static uint32_t title1[][2] = { + { 0x01C5, 0x01C4 }, + { 0x01C8, 0x01C7 }, + { 0x01CB, 0x01CA }, + { 0x01F2, 0x01F1 }, + { 0x1FBC, 0x1FBC }, + { 0x1FCC, 0x1FCC }, + { 0x1FFC, 0x1FFC }, }; int @@ -1536,6 +1542,22 @@ return 0; } +uint32_t +uni_totitle(uint32_t r) +{ + uint32_t *match; + + match = bsearch(&r, title2, nelem(title2), sizeof *title2, &cmp2); + + if (match) + return match[2] + (r - match[0]); + match = bsearch(&r, title1, nelem(title1), sizeof *title1, &cmp1); + + if (match) + return match[1]; + return r; +} + static uint32_t digit2[][2] = { { 0x0030, 0x0039 }, { 0x0660, 0x0669 },