Mercurial > libunicode
changeset 11:43a9d763656b
unicode: improve C API, removing dynamic allocations
author | David Demelier <markand@malikania.fr> |
---|---|
date | Wed, 25 Mar 2020 14:33:03 +0100 |
parents | ae1003c2a284 |
children | 083f11d2536f |
files | .hgignore Doxyfile Makefile doc/mainpage.cpp gen/unicode-after.c test/unicode.c unicode.c unicode.h |
diffstat | 8 files changed, 769 insertions(+), 415 deletions(-) [+] |
line wrap: on
line diff
--- a/.hgignore Wed Mar 25 09:56:05 2020 +0100 +++ b/.hgignore Wed Mar 25 14:33:03 2020 +0100 @@ -3,6 +3,9 @@ \.swp$ \.swo$ +# Doxygen. +^doxygen/ + # Generator files. ^gen/mkunicode-c$ ^gen/mkunicode-cpp$
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Doxyfile Wed Mar 25 14:33:03 2020 +0100 @@ -0,0 +1,37 @@ +# +# Doxyfile -- generate API documentation for libunicode +# +# Copyright (c) 2013-2020 David Demelier <markand@malikania.fr> +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# + +DOXYFILE_ENCODING = UTF-8 +PROJECT_NAME = "libunicode" +PROJECT_NUMBER = "0.1.0" +PROJECT_BRIEF = "UTF-8 to UTF-32 conversions and various operations" +PROJECT_LOGO = +OUTPUT_DIRECTORY = doxygen +ALLOW_UNICODE_NAMES = YES +STRIP_FROM_PATH = ./ +TAB_SIZE = 8 +OPTIMIZE_OUTPUT_FOR_C = YES +AUTOLINK_SUPPORT = NO +QUIET = YES +WARNINGS = YES +INPUT = unicode.h unicode.hpp +INPUT_ENCODING = UTF-8 +RECURSIVE = NO +GENERATE_LATEX = NO +GENERATE_MAN = NO +MAX_INITIALIZER_LINES = 0
--- a/Makefile Wed Mar 25 09:56:05 2020 +0100 +++ b/Makefile Wed Mar 25 14:33:03 2020 +0100 @@ -14,6 +14,7 @@ # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# .POSIX: @@ -50,10 +51,12 @@ tests: test/unicode test/unicode++ test/unicode - test/unicode++ + +doxygen: + doxygen Doxyfile clean: rm -f gen/mkunicode-c gen/mkunicode-cpp rm -f test/unicode test/unicode++ -.PHONY: all clean tests +.PHONY: all clean doxygen tests
--- a/doc/mainpage.cpp Wed Mar 25 09:56:05 2020 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ -/** - * \mainpage - * - * Welcome to the unicode library. - * - * ## Introduction - * - * This library provides UTF-8 to UTF-32 conversions and routines to test - * category of characters. It works on std::string and std::u32string. - * - * With C++17, you can also use std::string_view and std::u32string_view. - * - * ## Installation (C++ variant) - * - * Just copy the files unicode.cpp and unicode.hpp and add them to your project. - * - * ## Installation (C variant) - * - * Copy the files unicode.c and unicode.h and add them to your project. - */
--- a/gen/unicode-after.c Wed Mar 25 09:56:05 2020 +0100 +++ b/gen/unicode-after.c Wed Mar 25 14:33:03 2020 +0100 @@ -1,101 +1,103 @@ -static size_t -requires(const uint32_t *src) +size_t +uni8_encode(uint8_t dst[], size_t dstsz, uint32_t point) { - size_t size = 0; - int nb; + assert(dst); + + size_t written; - while (*src) { - if ((nb = uni_requires(*src++)) == -1) { - errno = EILSEQ; - return -1; - } + switch ((written = uni32_sizeof(point))) { + case 1: + if (dstsz < 1) + goto erange; - if (nb > SIZE_MAX - size) + dst[0] = (uint8_t)point; + break; + case 2: + if (dstsz < 2) goto erange; - size += nb; - } - - /* If SIZE_MAX -> no space for '\0' */ - if (size == SIZE_MAX) - goto erange; - - return size; - -erange: - errno = ERANGE; - return -1; -} - -void -uni_encode(uint32_t c, char *dst) -{ - assert(uni_requires(c) != -1); - assert(dst); - - switch (uni_requires(c)) { - case 1: - dst[0] = (char)c; - dst[1] = '\0'; - break; - case 2: - dst[0] = 0xC0 | ((c >> 6) & 0x1F); - dst[1] = 0x80 | (c & 0x3F); - dst[2] = '\0'; + dst[0] = 0xC0 | ((point >> 6) & 0x1F); + dst[1] = 0x80 | (point & 0x3F); break; case 3: - dst[0] = 0xE0 | ((c >> 12) & 0xF ); - dst[1] = 0x80 | ((c >> 6) & 0x3F); - dst[2] = 0x80 | (c & 0x3F); - dst[3] = '\0'; + if (dstsz < 3) + goto erange; + + dst[0] = 0xE0 | ((point >> 12) & 0xF ); + dst[1] = 0x80 | ((point >> 6) & 0x3F); + dst[2] = 0x80 | (point & 0x3F); break; case 4: - dst[0] = 0xF0 | ((c >> 18) & 0x7 ); - dst[1] = 0x80 | ((c >> 12) & 0x3F); - dst[2] = 0x80 | ((c >> 6) & 0x3F); - dst[3] = 0x80 | (c & 0x3F); - dst[4] = '\0'; + if (dstsz < 4) + goto erange; + + dst[0] = 0xF0 | ((point >> 18) & 0x7 ); + dst[1] = 0x80 | ((point >> 12) & 0x3F); + dst[2] = 0x80 | ((point >> 6) & 0x3F); + dst[3] = 0x80 | (point & 0x3F); break; default: break; } + + return written; + +erange: + errno = ERANGE; + + return -1; } -uint32_t -uni_decode(const char *src) +size_t +uni8_decode(const uint8_t src[], uint32_t *point) { assert(src); - assert(uni_sizeof(*src) != 
-1); + assert(point); - uint32_t c = 0; + size_t parsed; - switch (uni_sizeof(*src)) { + switch ((parsed = uni8_sizeof(*src))) { case 1: - c = src[0]; + *point = src[0]; break; case 2: - c = (src[0] & 0x1f) << 6; - c |= (src[1] & 0x3f); + if (!src[1]) + goto eilseq; + + *point = (src[0] & 0x1f) << 6; + *point |= (src[1] & 0x3f); break; case 3: - c = (src[0] & 0x0f) << 12; - c |= (src[1] & 0x3f) << 6; - c |= (src[2] & 0x3f); + if (!src[1] || !src[2]) + goto eilseq; + + *point = (src[0] & 0x0f) << 12; + *point |= (src[1] & 0x3f) << 6; + *point |= (src[2] & 0x3f); break; case 4: - c = (src[0] & 0x07) << 16; - c |= (src[1] & 0x3f) << 12; - c |= (src[2] & 0x3f) << 6; - c |= (src[3] & 0x3f); + if (!src[1] || !src[2] || !src[3]) + goto eilseq; + + *point = (src[0] & 0x07) << 16; + *point |= (src[1] & 0x3f) << 12; + *point |= (src[2] & 0x3f) << 6; + *point |= (src[3] & 0x3f); + break; default: break; } - return c; + return parsed; + +eilseq: + errno = EILSEQ; + + return -1; } -int -uni_sizeof(unsigned char c) +size_t +uni8_sizeof(uint8_t c) { if (c <= 127) return 1; @@ -106,11 +108,57 @@ if ((c & 0xF8) == 0xF0) return 4; + errno = EILSEQ; return -1; } -int -uni_requires(uint32_t c) +size_t +uni8_length(const uint8_t src[]) +{ + assert(src); + + size_t total = 0, gap; + + while (*src) { + if ((gap = uni8_sizeof(*src)) == (size_t)-1) + return -1; + + total += gap; + src += gap; + } + + return total; +} + +size_t +uni8_to32(const uint8_t src[], uint32_t dst[], size_t dstsz) +{ + assert(src); + assert(dst); + + size_t nwritten = 0, gap; + + for (; *src && dstsz; --dstsz) { + if ((gap = uni8_decode(src, dst++)) == (size_t)-1) + return -1; + + src += gap; + ++nwritten; + } + + /* No more space to store NUL. 
*/ + if (dstsz == 0) { + errno = ERANGE; + return -1; + } + + *dst = 0; + + return nwritten; +} + +size_t +uni32_sizeof(uint32_t c) { if (c <= 0x7F) return 1; @@ -121,82 +169,67 @@ if (c <= 0x1FFFFF) return 4; + errno = EILSEQ; return -1; } size_t -uni_length(const char *src) +uni32_length(const uint32_t src[]) { + assert(src); + size_t total = 0; - int gap; + + while (*src++) + total++; + + return total; +} + +size_t +uni32_requires(const uint32_t src[]) +{ + assert(src); + + size_t total = 0, gap; while (*src) { - if ((gap = uni_sizeof(*src)) == -1) { - errno = EILSEQ; + if ((gap = uni32_sizeof(*src++)) == (size_t)-1) + return -1; + if (gap >= SIZE_MAX - total) { + errno = ERANGE; return -1; } total += gap; - src += gap; } return total; } -char * -uni_toutf8(const uint32_t *src) -{ - assert(src); - - size_t total; - char *out, *ptr; - int nb; - - if ((total = requires(src)) == -1) - return NULL; - if (!(out = malloc(total + 1))) - return NULL; - - ptr = out; - - while (*src) { - nb = uni_requires(*src); - uni_encode(*src++, ptr); - ptr += nb; - } - - return out; -} - -uint32_t * -uni_toutf32(const char *src) +size_t +uni32_to8(const uint32_t src[], uint8_t dst[], size_t dstsz) { assert(src); + assert(dst); - size_t length; - uint32_t *out, *ptr; - int nb; + size_t nwritten = 0, gap; - if ((length = uni_length(src)) == -1) - return NULL; - if (length == SIZE_MAX) { - errno = ERANGE; - return NULL; - } - if (!(out = malloc(sizeof (uint32_t) * length + 1))) - return NULL; + while (*src && dstsz) { + if ((gap = uni8_encode(dst, dstsz, *src++)) == (size_t)-1) + return -1; - ptr = out; - - while (*src) { - /* No checks needed, uni_length already did it for us. */ - int nb = uni_sizeof(*src); - - *ptr++ = uni_decode(src); - src += nb; + dst += gap; + dstsz -= gap; + nwritten += gap; } - *ptr = 0; + if (dstsz == 0) { + errno = ERANGE; + return -1; + } - return out; + *dst = 0; + + return nwritten; }
--- a/test/unicode.c Wed Mar 25 09:56:05 2020 +0100 +++ b/test/unicode.c Wed Mar 25 14:33:03 2020 +0100 @@ -47,99 +47,346 @@ return l1 == l2 && memcmp(s1, s2, l1) == 0; } -/* - * Conversion UTF32 -> UTF8 - * ------------------------------------------------------------------ - */ +GREATEST_TEST +test_uni8_encode_basic(void) +{ + size_t r; + + /* a -> 1 bytes. */ + { + uint8_t buffer[5] = { 0 }; + + r = uni8_encode(buffer, sizeof (buffer), U'a'); + GREATEST_ASSERT_EQ(r, 1); + GREATEST_ASSERT_STR_EQ(buffer, u8"a"); + } + + /* é -> 2 bytes. */ + { + uint8_t buffer[5] = { 0 }; + + r = uni8_encode(buffer, sizeof (buffer), U'é'); + GREATEST_ASSERT_EQ(r, 2); + GREATEST_ASSERT_STR_EQ(buffer, u8"é"); + } + + GREATEST_PASS(); +} GREATEST_TEST -utf32_to_utf8_ascii(void) +test_uni8_encode_invalid(void) { - const uint32_t u32[] = { U'a', U'b', U'c', 0 }; - char *s = uni_toutf8(u32); + size_t r; + uint8_t buffer[5] = { 0 }; - GREATEST_ASSERT_STR_EQ(s, "abc"); + r = uni8_encode(buffer, sizeof (buffer), 0xffffffff); + GREATEST_ASSERT_EQ(r, (size_t)-1); + GREATEST_ASSERT_EQ(errno, EILSEQ); GREATEST_PASS(); } GREATEST_TEST -utf32_to_utf8_valid(void) +test_uni8_encode_toosmall(void) +{ + size_t r; + uint8_t buffer[1] = { 0 }; + + r = uni8_encode(buffer, sizeof (buffer), U'é'); + GREATEST_ASSERT_EQ(r, (size_t)-1); + GREATEST_ASSERT_EQ(errno, ERANGE); + GREATEST_PASS(); +} + +GREATEST_SUITE(suite_uni8_encode) +{ + GREATEST_RUN_TEST(test_uni8_encode_basic); + GREATEST_RUN_TEST(test_uni8_encode_invalid); + GREATEST_RUN_TEST(test_uni8_encode_toosmall); +} + +GREATEST_TEST +test_uni8_decode_basic(void) { - const uint32_t u32[] = { 'a', U'é', 'c', U'𠀀', 0 }; - char *s = uni_toutf8(u32); + size_t r; + + /* a -> 1 bytes. */ + { + uint32_t code = -1; - GREATEST_ASSERT_STR_EQ(s, u8"aéc𠀀"); + r = uni8_decode(u8"a", &code); + GREATEST_ASSERT_EQ(r, 1); + GREATEST_ASSERT_EQ(code, 'a'); + } + + /* é -> 2 bytes. 
*/ + { + uint32_t code = -1; + + r = uni8_decode(u8"é", &code); + GREATEST_ASSERT_EQ(r, 2); + GREATEST_ASSERT_EQ(code, U'é'); + } + GREATEST_PASS(); } GREATEST_TEST -utf32_to_utf8_invalid(void) +test_uni8_decode_invalid(void) +{ + size_t r; + + /* Invalid UTF-8 sequence. */ + { + uint32_t code = -1; + + r = uni8_decode(u8"\xff""a", &code); + GREATEST_ASSERT_EQ(r, (size_t)-1); + GREATEST_ASSERT_EQ(code, (uint32_t)-1); + GREATEST_ASSERT_EQ(errno, EILSEQ); + } + + /* Valid "€" but unfinished sequence. */ + { + uint32_t code = -1; + + r = uni8_decode((const uint8_t []){ -30, 0 }, &code); + GREATEST_ASSERT_EQ(r, (size_t)-1); + GREATEST_ASSERT_EQ(code, (uint32_t)-1); + GREATEST_ASSERT_EQ(errno, EILSEQ); + } + + GREATEST_PASS(); +} + +GREATEST_SUITE(suite_uni8_decode) { - const uint32_t u32[] = { 'a', 0xFFFFFFFF, 'c', 0 }; - char *s = uni_toutf8(u32); + GREATEST_RUN_TEST(test_uni8_decode_basic); + GREATEST_RUN_TEST(test_uni8_decode_invalid); +} + +GREATEST_TEST +test_uni8_sizeof_basic(void) +{ + GREATEST_ASSERT_EQ(1, uni8_sizeof(u8"a"[0])); + GREATEST_ASSERT_EQ(2, uni8_sizeof(u8"é"[0])); + GREATEST_ASSERT_EQ(3, uni8_sizeof(u8"€"[0])); + GREATEST_ASSERT_EQ(4, uni8_sizeof(u8"𐍈"[0])); + GREATEST_PASS(); +} - GREATEST_ASSERT(!s); +GREATEST_TEST +test_uni8_sizeof_invalid(void) +{ + GREATEST_ASSERT_EQ((size_t)-1, uni8_sizeof(u8"\xff"[0])); + GREATEST_ASSERT_EQ(errno, EILSEQ); + GREATEST_PASS(); +} + +GREATEST_SUITE(suite_uni8_sizeof) +{ + GREATEST_RUN_TEST(test_uni8_sizeof_basic); + GREATEST_RUN_TEST(test_uni8_sizeof_invalid); +} + +GREATEST_TEST +test_uni8_length_basic(void) +{ + GREATEST_ASSERT_EQ(3, uni8_length("abc")); + GREATEST_ASSERT_EQ(4, uni8_length("5€")); + GREATEST_PASS(); +} + +GREATEST_TEST +test_uni8_length_invalid(void) +{ + GREATEST_ASSERT_EQ((size_t)-1, uni8_length("a""\xff""b")); GREATEST_ASSERT_EQ(errno, EILSEQ); GREATEST_PASS(); } -GREATEST_SUITE(utf32_to_utf8) +GREATEST_SUITE(suite_uni8_length) +{ + GREATEST_RUN_TEST(test_uni8_length_basic); + 
GREATEST_RUN_TEST(test_uni8_length_invalid); +} + +GREATEST_TEST +test_uni8_to32_basic(void) { - GREATEST_RUN_TEST(utf32_to_utf8_ascii); - GREATEST_RUN_TEST(utf32_to_utf8_valid); - GREATEST_RUN_TEST(utf32_to_utf8_invalid); + size_t r; + + { + uint32_t buffer[10] = { 0 }; + uint32_t expected[] = { U'a', U'b', U'c', 0 }; + + r = uni8_to32("abc", buffer, 10); + GREATEST_ASSERT_EQ(r, 3); + GREATEST_ASSERT(u32cmp(buffer, expected)); + } + + { + uint32_t buffer[10] = { 0 }; + uint32_t expected[] = { U'a', U'é', U'c', 0 }; + + r = uni8_to32("aéc", buffer, 10); + GREATEST_ASSERT_EQ(r, 3); + GREATEST_ASSERT(u32cmp(buffer, expected)); + } + + GREATEST_PASS(); } -/* - * Conversion UTF8 -> UTF32 - * ------------------------------------------------------------------ - */ +GREATEST_TEST +test_uni8_to32_invalid(void) +{ + size_t r; + uint32_t buffer[10] = { 0 }; + + /* Invalid UTF-8 sequence. */ + r = uni8_to32(u8"\xff""a", buffer, 10); + GREATEST_ASSERT_EQ(r, (size_t)-1); + GREATEST_ASSERT_EQ(errno, EILSEQ); + + /* Valid "€" but unfinished sequence. 
*/ + r = uni8_to32((const uint8_t []){ -30, 0 }, buffer, 10); + GREATEST_ASSERT_EQ(r, (size_t)-1); + GREATEST_ASSERT_EQ(errno, EILSEQ); + + GREATEST_PASS(); +} GREATEST_TEST -utf8_to_utf32_ascii(void) +test_uni8_to32_toosmall(void) +{ + size_t r; + uint32_t buffer[4] = { 0 }; + + r = uni8_to32(u8"bonjour à tous", buffer, 1); + GREATEST_ASSERT_EQ(r, (size_t)-1); + GREATEST_ASSERT_EQ(errno, ERANGE); + GREATEST_PASS(); +} + +GREATEST_SUITE(suite_uni8_to32) { - const char *s = "abc"; - const uint32_t expected[] = { U'a', U'b', U'c', 0 }; + GREATEST_RUN_TEST(test_uni8_to32_basic); + GREATEST_RUN_TEST(test_uni8_to32_invalid); + GREATEST_RUN_TEST(test_uni8_to32_toosmall); +} - GREATEST_ASSERT(u32cmp(expected, uni_toutf32(s))); +GREATEST_TEST +test_uni32_sizeof_basic(void) +{ + GREATEST_ASSERT_EQ(1, uni32_sizeof(U'a')); + GREATEST_ASSERT_EQ(2, uni32_sizeof(U'é')); + GREATEST_ASSERT_EQ(3, uni32_sizeof(U'€')); + GREATEST_ASSERT_EQ(4, uni32_sizeof(U'𐍈')); GREATEST_PASS(); } GREATEST_TEST -utf8_to_utf32_valid(void) +test_uni32_sizeof_invalid(void) +{ + GREATEST_ASSERT_EQ((size_t)-1, uni32_sizeof(0xffffffff)); + GREATEST_ASSERT_EQ(errno, EILSEQ); + GREATEST_PASS(); +} + +GREATEST_SUITE(suite_uni32_sizeof) { - const char *s = u8"aéc𠀀"; - const uint32_t expected[] = { U'a', U'é', U'c', U'𠀀', 0 }; + GREATEST_RUN_TEST(test_uni32_sizeof_basic); + GREATEST_RUN_TEST(test_uni32_sizeof_invalid); +} + +GREATEST_TEST +test_uni32_length(void) +{ + GREATEST_ASSERT_EQ(3, uni32_length((const uint32_t []){ U'a', U'é', U'c', 0 })); + GREATEST_PASS(); +} - GREATEST_ASSERT(u32cmp(expected, uni_toutf32(s))); +GREATEST_SUITE(suite_uni32_length) +{ + GREATEST_RUN_TEST(test_uni32_length); +} + +GREATEST_TEST +test_uni32_requires_basic(void) +{ + GREATEST_ASSERT_EQ(3, uni32_requires(U"abc")); + GREATEST_ASSERT_EQ(9, uni32_requires(U"é€𐍈")); GREATEST_PASS(); } GREATEST_TEST -utf8_to_utf32_invalid(void) +test_uni32_requires_invalid(void) { - const char *s = "a" "\xff""b"; - const uint32_t *result = 
uni_toutf32(s); - - GREATEST_ASSERT(!result); + GREATEST_ASSERT_EQ((size_t)-1, uni32_requires(U"\xffffffff")); GREATEST_ASSERT_EQ(errno, EILSEQ); GREATEST_PASS(); } -GREATEST_SUITE(utf8_to_utf32) +GREATEST_SUITE(suite_uni32_requires) +{ + GREATEST_RUN_TEST(test_uni32_requires_basic); + GREATEST_RUN_TEST(test_uni32_requires_invalid); +} + +GREATEST_TEST +test_uni32_to8_basic(void) { - GREATEST_RUN_TEST(utf8_to_utf32_ascii); - GREATEST_RUN_TEST(utf8_to_utf32_valid); - GREATEST_RUN_TEST(utf8_to_utf32_invalid); + size_t r; + + { + uint8_t buffer[10] = { 0 }; + + r = uni32_to8(U"abc", buffer, sizeof (buffer)); + GREATEST_ASSERT_EQ(r, 3); + GREATEST_ASSERT_STR_EQ(buffer, u8"abc"); + } + + { + uint8_t buffer[20] = { 0 }; + + r = uni32_to8(U"ça va, 5€ ?", buffer, sizeof (buffer)); + GREATEST_ASSERT_EQ(r, 14); + GREATEST_ASSERT_STR_EQ(buffer, u8"ça va, 5€ ?"); + } + + GREATEST_PASS(); } -/* - * Checks functions - * ------------------------------------------------------------------ - */ +GREATEST_TEST +test_uni32_to8_invalid(void) +{ + uint8_t buffer[10] = { 0 }; + + GREATEST_ASSERT_EQ(uni32_to8(U"\xffffffff", buffer, sizeof (buffer)), (size_t)-1); + GREATEST_ASSERT_EQ(errno, EILSEQ); + GREATEST_PASS(); +} GREATEST_TEST -checks_isalpha(void) +test_uni32_to8_toosmall(void) +{ + size_t r; + uint8_t buffer[3] = { 0 }; + + r = uni32_to8(U"ça va ?", buffer, sizeof (buffer)); + GREATEST_ASSERT_EQ(r, (size_t)-1); + GREATEST_ASSERT_EQ(errno, ERANGE); + GREATEST_PASS(); +} + +GREATEST_SUITE(suite_uni32_to8) +{ + GREATEST_RUN_TEST(test_uni32_to8_basic); + GREATEST_RUN_TEST(test_uni32_to8_invalid); + GREATEST_RUN_TEST(test_uni32_to8_toosmall); +} + +GREATEST_TEST +test_misc_isalpha(void) { GREATEST_ASSERT(uni_isalpha(U'é')); GREATEST_ASSERT(!uni_isalpha(U'€')); @@ -147,7 +394,7 @@ } GREATEST_TEST -checks_isdigit(void) +test_misc_isdigit(void) { GREATEST_ASSERT(uni_isdigit(U'۱')); GREATEST_ASSERT(!uni_isdigit(U'€')); @@ -155,7 +402,7 @@ } GREATEST_TEST -checks_islower(void) 
+test_misc_islower(void) { GREATEST_ASSERT(uni_islower(U'a')); GREATEST_ASSERT(uni_islower(U'é')); @@ -165,7 +412,7 @@ } GREATEST_TEST -checks_isspace(void) +test_misc_isspace(void) { GREATEST_ASSERT(uni_isspace(U' ')); GREATEST_ASSERT(!uni_isspace(U'é')); @@ -173,7 +420,7 @@ } GREATEST_TEST -checks_istitle(void) +test_misc_istitle(void) { GREATEST_ASSERT(uni_istitle(U'Dž')); GREATEST_ASSERT(!uni_istitle(U'€')); @@ -181,7 +428,7 @@ } GREATEST_TEST -checks_isupper(void) +test_misc_isupper(void) { GREATEST_ASSERT(!uni_isupper('a')); GREATEST_ASSERT(!uni_isupper(U'é')); @@ -190,50 +437,14 @@ GREATEST_PASS(); } -GREATEST_SUITE(checks) -{ - GREATEST_RUN_TEST(checks_isalpha); - GREATEST_RUN_TEST(checks_isdigit); - GREATEST_RUN_TEST(checks_islower); - GREATEST_RUN_TEST(checks_isspace); - GREATEST_RUN_TEST(checks_istitle); - GREATEST_RUN_TEST(checks_isupper); -} - -/* - * Miscellaneous - * ------------------------------------------------------------------ - */ - -GREATEST_TEST -misc_requires(void) +GREATEST_SUITE(suite_misc) { - GREATEST_ASSERT(uni_requires('a') == 1); - GREATEST_ASSERT(uni_requires(U'é') == 2); - GREATEST_ASSERT(uni_requires(U'€') == 3); - GREATEST_ASSERT(uni_requires(U'𠀀') == 4); - GREATEST_PASS(); -} - -GREATEST_TEST -misc_sizeof(void) -{ - const char *s1 = u8"a"; - const char *s2 = u8"é"; - const char *s3 = u8"€"; - const char *s4 = u8"𠀀"; - - GREATEST_ASSERT(uni_sizeof(s1[0]) == 1); - GREATEST_ASSERT(uni_sizeof(s2[0]) == 2); - GREATEST_ASSERT(uni_sizeof(s3[0]) == 3); - GREATEST_ASSERT(uni_sizeof(s4[0]) == 4); - GREATEST_PASS(); -} - -GREATEST_SUITE(misc) -{ - GREATEST_RUN_TEST(misc_requires); - GREATEST_RUN_TEST(misc_sizeof); + GREATEST_RUN_TEST(test_misc_isalpha); + GREATEST_RUN_TEST(test_misc_isdigit); + GREATEST_RUN_TEST(test_misc_islower); + GREATEST_RUN_TEST(test_misc_isspace); + GREATEST_RUN_TEST(test_misc_istitle); + GREATEST_RUN_TEST(test_misc_isupper); } GREATEST_MAIN_DEFS(); @@ -242,9 +453,15 @@ main(int argc, char **argv) { 
GREATEST_MAIN_BEGIN(); - GREATEST_RUN_SUITE(utf32_to_utf8); - GREATEST_RUN_SUITE(utf8_to_utf32); - GREATEST_RUN_SUITE(checks); - GREATEST_RUN_SUITE(misc); + GREATEST_RUN_SUITE(suite_uni8_encode); + GREATEST_RUN_SUITE(suite_uni8_decode); + GREATEST_RUN_SUITE(suite_uni8_sizeof); + GREATEST_RUN_SUITE(suite_uni8_length); + GREATEST_RUN_SUITE(suite_uni8_to32); + GREATEST_RUN_SUITE(suite_uni32_sizeof); + GREATEST_RUN_SUITE(suite_uni32_length); + GREATEST_RUN_SUITE(suite_uni32_requires); + GREATEST_RUN_SUITE(suite_uni32_to8); + GREATEST_RUN_SUITE(suite_misc); GREATEST_MAIN_END(); }
--- a/unicode.c Wed Mar 25 09:56:05 2020 +0100 +++ b/unicode.c Wed Mar 25 14:33:03 2020 +0100 @@ -4608,104 +4608,106 @@ return c; } -static size_t -requires(const uint32_t *src) +size_t +uni8_encode(uint8_t dst[], size_t dstsz, uint32_t point) { - size_t size = 0; - int nb; + assert(dst); + + size_t written; - while (*src) { - if ((nb = uni_requires(*src++)) == -1) { - errno = EILSEQ; - return -1; - } + switch ((written = uni32_sizeof(point))) { + case 1: + if (dstsz < 1) + goto erange; - if (nb > SIZE_MAX - size) + dst[0] = (uint8_t)point; + break; + case 2: + if (dstsz < 2) goto erange; - size += nb; - } - - /* If SIZE_MAX -> no space for '\0' */ - if (size == SIZE_MAX) - goto erange; - - return size; - -erange: - errno = ERANGE; - return -1; -} - -void -uni_encode(uint32_t c, char *dst) -{ - assert(uni_requires(c) != -1); - assert(dst); - - switch (uni_requires(c)) { - case 1: - dst[0] = (char)c; - dst[1] = '\0'; - break; - case 2: - dst[0] = 0xC0 | ((c >> 6) & 0x1F); - dst[1] = 0x80 | (c & 0x3F); - dst[2] = '\0'; + dst[0] = 0xC0 | ((point >> 6) & 0x1F); + dst[1] = 0x80 | (point & 0x3F); break; case 3: - dst[0] = 0xE0 | ((c >> 12) & 0xF ); - dst[1] = 0x80 | ((c >> 6) & 0x3F); - dst[2] = 0x80 | (c & 0x3F); - dst[3] = '\0'; + if (dstsz < 3) + goto erange; + + dst[0] = 0xE0 | ((point >> 12) & 0xF ); + dst[1] = 0x80 | ((point >> 6) & 0x3F); + dst[2] = 0x80 | (point & 0x3F); break; case 4: - dst[0] = 0xF0 | ((c >> 18) & 0x7 ); - dst[1] = 0x80 | ((c >> 12) & 0x3F); - dst[2] = 0x80 | ((c >> 6) & 0x3F); - dst[3] = 0x80 | (c & 0x3F); - dst[4] = '\0'; + if (dstsz < 4) + goto erange; + + dst[0] = 0xF0 | ((point >> 18) & 0x7 ); + dst[1] = 0x80 | ((point >> 12) & 0x3F); + dst[2] = 0x80 | ((point >> 6) & 0x3F); + dst[3] = 0x80 | (point & 0x3F); break; default: break; } + + return written; + +erange: + errno = ERANGE; + + return -1; } -uint32_t -uni_decode(const char *src) +size_t +uni8_decode(const uint8_t src[], uint32_t *point) { assert(src); - assert(uni_sizeof(*src) != 
-1); + assert(point); - uint32_t c = 0; + size_t parsed; - switch (uni_sizeof(*src)) { + switch ((parsed = uni8_sizeof(*src))) { case 1: - c = src[0]; + *point = src[0]; break; case 2: - c = (src[0] & 0x1f) << 6; - c |= (src[1] & 0x3f); + if (!src[1]) + goto eilseq; + + *point = (src[0] & 0x1f) << 6; + *point |= (src[1] & 0x3f); break; case 3: - c = (src[0] & 0x0f) << 12; - c |= (src[1] & 0x3f) << 6; - c |= (src[2] & 0x3f); + if (!src[1] || !src[2]) + goto eilseq; + + *point = (src[0] & 0x0f) << 12; + *point |= (src[1] & 0x3f) << 6; + *point |= (src[2] & 0x3f); break; case 4: - c = (src[0] & 0x07) << 16; - c |= (src[1] & 0x3f) << 12; - c |= (src[2] & 0x3f) << 6; - c |= (src[3] & 0x3f); + if (!src[1] || !src[2] || !src[3]) + goto eilseq; + + *point = (src[0] & 0x07) << 16; + *point |= (src[1] & 0x3f) << 12; + *point |= (src[2] & 0x3f) << 6; + *point |= (src[3] & 0x3f); + break; default: break; } - return c; + return parsed; + +eilseq: + errno = EILSEQ; + + return -1; } -int -uni_sizeof(unsigned char c) +size_t +uni8_sizeof(uint8_t c) { if (c <= 127) return 1; @@ -4716,11 +4718,57 @@ if ((c & 0xF8) == 0xF0) return 4; + errno = EILSEQ; return -1; } -int -uni_requires(uint32_t c) +size_t +uni8_length(const uint8_t src[]) +{ + assert(src); + + size_t total = 0, gap; + + while (*src) { + if ((gap = uni8_sizeof(*src)) == (size_t)-1) + return -1; + + total += gap; + src += gap; + } + + return total; +} + +size_t +uni8_to32(const uint8_t src[], uint32_t dst[], size_t dstsz) +{ + assert(src); + assert(dst); + + size_t nwritten = 0, gap; + + for (; *src && dstsz; --dstsz) { + if ((gap = uni8_decode(src, dst++)) == (size_t)-1) + return -1; + + src += gap; + ++nwritten; + } + + /* No more space to store NUL. 
*/ + if (dstsz == 0) { + errno = ERANGE; + return -1; + } + + *dst = 0; + + return nwritten; +} + +size_t +uni32_sizeof(uint32_t c) { if (c <= 0x7F) return 1; @@ -4731,82 +4779,67 @@ if (c <= 0x1FFFFF) return 4; + errno = EILSEQ; return -1; } size_t -uni_length(const char *src) +uni32_length(const uint32_t src[]) { + assert(src); + size_t total = 0; - int gap; + + while (*src++) + total++; + + return total; +} + +size_t +uni32_requires(const uint32_t src[]) +{ + assert(src); + + size_t total = 0, gap; while (*src) { - if ((gap = uni_sizeof(*src)) == -1) { - errno = EILSEQ; + if ((gap = uni32_sizeof(*src++)) == (size_t)-1) + return -1; + if (gap >= SIZE_MAX - total) { + errno = ERANGE; return -1; } total += gap; - src += gap; } return total; } -char * -uni_toutf8(const uint32_t *src) -{ - assert(src); - - size_t total; - char *out, *ptr; - int nb; - - if ((total = requires(src)) == -1) - return NULL; - if (!(out = malloc(total + 1))) - return NULL; - - ptr = out; - - while (*src) { - nb = uni_requires(*src); - uni_encode(*src++, ptr); - ptr += nb; - } - - return out; -} - -uint32_t * -uni_toutf32(const char *src) +size_t +uni32_to8(const uint32_t src[], uint8_t dst[], size_t dstsz) { assert(src); + assert(dst); - size_t length; - uint32_t *out, *ptr; - int nb; + size_t nwritten = 0, gap; - if ((length = uni_length(src)) == -1) - return NULL; - if (length == SIZE_MAX) { - errno = ERANGE; - return NULL; - } - if (!(out = malloc(sizeof (uint32_t) * length + 1))) - return NULL; + while (*src && dstsz) { + if ((gap = uni8_encode(dst, dstsz, *src++)) == (size_t)-1) + return -1; - ptr = out; - - while (*src) { - /* No checks needed, uni_length already did it for us. */ - int nb = uni_sizeof(*src); - - *ptr++ = uni_decode(src); - src += nb; + dst += gap; + dstsz -= gap; + nwritten += gap; } - *ptr = 0; + if (dstsz == 0) { + errno = ERANGE; + return -1; + } - return out; + *dst = 0; + + return nwritten; }
--- a/unicode.h Wed Mar 25 09:56:05 2020 +0100 +++ b/unicode.h Wed Mar 25 14:33:03 2020 +0100 @@ -32,34 +32,31 @@ /** * Encode the unicode code point into multibyte string. * - * \pre point must be valid - * \pre destination must have space for at least 5 bytes - * \param point the unicode code point - * \param res the output buffer - * \see \ref uni_requires + * To make sure that buffer is always large enough, you may pass a buffer of + * size 4 as it's the largest UTF-8 string for now. + * + * \pre dst != NULL + * \param dst the UTF-8 buffer destination + * \param dstsz the size available in dst + * \param point the unicode character + * \return The number of bytes written (excluding the null terminator) or -1 on + * error and sets errno. + * \warning The destination is **not** NUL terminated. */ -void -uni_encode(uint32_t point, char *res); +size_t +uni8_encode(uint8_t dst[], size_t dstsz, uint32_t point); /** * Decode the multibyte buffer into an unicode code point. * - * \pre src must be a valid UTF-8 string - * \param src the source string - * \return the converted code point - * \see \ref uni_sizeof + * \pre src != NULL + * \pre point != NULL + * \param src the UTF-8 source string + * \param point the unicode character destination + * \return The number of bytes parsed in src or -1 on error and sets errno. */ -uint32_t -uni_decode(const char *src); - -/** - * Get the number of bytes required for the unicode point. - * - * \param point the unicode point - * \return the number of bytes [1-4] or -1 if invalid - */ -int -uni_requires(uint32_t point); +size_t +uni8_decode(const uint8_t src[], uint32_t *point); /** * Get the number of bytes that follow this UTF-8 character. @@ -68,48 +65,99 @@ * character. * * \param c the first multi byte character - * \return the number of bytes [1-4] or -1 if invalid + * \return The number of bytes [1-4] or -1 if invalid and sets errno. 
+ * \warning You may still need to verify that following characters are valid as + * this function only returns the number of bytes that *should* + * exist after this one. */ -int -uni_sizeof(unsigned char c); +size_t +uni8_sizeof(uint8_t c); + +/** + * Get the real number of unicode characters in a string. + * + * \pre src != NULL + * \param src the UTF-8 string + * \return The number of unicode characters or -1 on error and sets errno. + */ +size_t +uni8_length(const uint8_t src[]); /** - * Get real number of character in a string. + * Convert a UTF-8 string to UTF-32 string. * + * This function will write at most dstsz code points in dst including the NUL + * terminator. Caller is responsible to provide an area large enough to store + * the required number of unicode characters plus the NUL terminator. + * + * Use \ref uni8_length to determine the number of characters required. + * + * \pre src != NULL + * \pre dst != NULL * \param src the UTF-8 string - * \return the number of unicode codepoints or -1 on error and sets errno - * accordingly. + * \param dst the UTF-32 destination + * \param dstsz the number of code points available in dst + * \return The number of code points written (excluding the NUL terminator) or -1 on + * error and sets errno. + * \see \ref uni8_length */ size_t -uni_length(const char *src); +uni8_to32(const uint8_t src[], uint32_t dst[], size_t dstsz); + +/** + * Get the number of bytes required for the unicode point. + * + * \param point the unicode point + * \return The number of bytes [1-4] or -1 on error and sets errno. + */ +size_t +uni32_sizeof(uint32_t point); + +/** + * Get the number of characters in src. + * + * \pre src != NULL + * \param src the NUL terminated UTF-32 string + * \return The number of unicode characters. + */ +size_t +uni32_length(const uint32_t src[]); + +/** + * Determine the number of UTF-8 bytes excluding the NUL terminator that + * are needed to convert this UTF-32 string to UTF-8. 
+ * + * \pre src != NULL + * \param src the UTF-32 source string + * \return The number of bytes required excluding the NUL terminator or -1 on + * error and sets errno. + */ +size_t +uni32_requires(const uint32_t src[]); /** * Convert a UTF-32 string to UTF-8 string. * - * \pre src != NULL - * \param src the UTF-32 string - * \return a nul-terminated string or NULL on error and sets errno accordingly - * \note The returned string must be free'ed by the caller - */ -char * -uni_toutf8(const uint32_t *src); - -/** - * Convert a UTF-8 string to UTF-32 string. + * The output buffer will be filled with at most `dstsz` bytes including the + * NUL terminator. The function \ref uni32_requires can be used to determine + * the number of bytes required. * * \pre src != NULL - * \param src the UTF-8 string - * \return a nul-terminated string or NULL on error and sets errno accordingly - * \note The returned string must be free'ed by the caller + * \pre dst != NULL + * \param src the UTF-32 string + * \param dst the string destination + * \param dstsz the number of bytes available in dst + * \return the number of bytes written (excluding the NUL terminator) or -1 on error and sets errno + * accordingly. */ -uint32_t * -uni_toutf32(const char *src); +size_t +uni32_to8(const uint32_t src[], uint8_t dst[], size_t dstsz); /** * Check if the unicode character is alpha category. * * \param c the character - * \return true if alpha + * \return True if alpha. */ bool uni_isalpha(uint32_t c); @@ -118,7 +166,7 @@ * Check if the unicode character is digit. * * \param c the character - * \return true if digit + * \return True if digit. */ bool uni_isdigit(uint32_t c); @@ -127,7 +175,7 @@ * Check if the unicode character is lower case. * * \param c the character - * \return true if lower case + * \return True if lower case. */ bool uni_islower(uint32_t c); @@ -136,7 +184,7 @@ * Check if the unicode character is space. * * \param c the character - * \return true if space + * \return True if space. 
*/ bool uni_isspace(uint32_t c); @@ -145,7 +193,7 @@ * Check if the unicode character is title case. * * \param c the character - * \return true if title case + * \return True if title case. */ bool uni_istitle(uint32_t c); @@ -154,7 +202,7 @@ * Check if the unicode character is upper case. * * \param c the character - * \return true if upper case + * \return True if upper case. */ bool uni_isupper(uint32_t c); @@ -163,7 +211,7 @@ * Convert to upper case. * * \param c the character - * \return the upper case character + * \return The upper case character. */ uint32_t uni_toupper(uint32_t c); @@ -172,7 +220,7 @@ * Convert to lower case. * * \param c the character - * \return the lower case character + * \return The lower case character. */ uint32_t uni_tolower(uint32_t c); @@ -181,7 +229,7 @@ * Convert to title case. * * \param c the character - * \return the title case character + * \return The title case character. */ uint32_t uni_totitle(uint32_t c);