unicode: improve C API, removing dynamic allocations

Wed, 25 Mar 2020 14:33:03 +0100

author
David Demelier <markand@malikania.fr>
date
Wed, 25 Mar 2020 14:33:03 +0100
changeset 11
43a9d763656b
parent 10
ae1003c2a284
child 12
083f11d2536f

unicode: improve C API, removing dynamic allocations

.hgignore file | annotate | diff | comparison | revisions
Doxyfile file | annotate | diff | comparison | revisions
Makefile file | annotate | diff | comparison | revisions
doc/mainpage.cpp file | annotate | diff | comparison | revisions
gen/unicode-after.c file | annotate | diff | comparison | revisions
test/unicode.c file | annotate | diff | comparison | revisions
unicode.c file | annotate | diff | comparison | revisions
unicode.h file | annotate | diff | comparison | revisions
--- a/.hgignore	Wed Mar 25 09:56:05 2020 +0100
+++ b/.hgignore	Wed Mar 25 14:33:03 2020 +0100
@@ -3,6 +3,9 @@
 \.swp$
 \.swo$
 
+# Doxygen.
+^doxygen/
+
 # Generator files.
 ^gen/mkunicode-c$
 ^gen/mkunicode-cpp$
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Doxyfile	Wed Mar 25 14:33:03 2020 +0100
@@ -0,0 +1,37 @@
+#
+# Doxyfile -- generate API documentation for libunicode
+#
+# Copyright (c) 2013-2020 David Demelier <markand@malikania.fr>
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+
+DOXYFILE_ENCODING      = UTF-8
+PROJECT_NAME           = "libunicode"
+PROJECT_NUMBER         = "0.1.0"
+PROJECT_BRIEF          = "UTF-8 to UTF-32 conversions and various operations"
+PROJECT_LOGO           =
+OUTPUT_DIRECTORY       = doxygen
+ALLOW_UNICODE_NAMES    = YES
+STRIP_FROM_PATH        = ./
+TAB_SIZE               = 8
+OPTIMIZE_OUTPUT_FOR_C  = YES
+AUTOLINK_SUPPORT       = NO
+QUIET                  = YES
+WARNINGS               = YES
+INPUT                  = unicode.h unicode.hpp
+INPUT_ENCODING         = UTF-8
+RECURSIVE              = NO
+GENERATE_LATEX         = NO
+GENERATE_MAN           = NO
+MAX_INITIALIZER_LINES  = 0
--- a/Makefile	Wed Mar 25 09:56:05 2020 +0100
+++ b/Makefile	Wed Mar 25 14:33:03 2020 +0100
@@ -14,6 +14,7 @@
 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
 
 .POSIX:
 
@@ -50,10 +51,12 @@
 
 tests: test/unicode test/unicode++
 	test/unicode
-	test/unicode++
+
+doxygen:
+	doxygen Doxyfile
 
 clean:
 	rm -f gen/mkunicode-c gen/mkunicode-cpp
 	rm -f test/unicode test/unicode++
 
-.PHONY: all clean tests
+.PHONY: all clean doxygen tests
--- a/doc/mainpage.cpp	Wed Mar 25 09:56:05 2020 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,20 +0,0 @@
-/**
- * \mainpage
- *
- * Welcome to the unicode library.
- *
- * ## Introduction
- *
- * This library provides UTF-8 to UTF-32 conversions and routines to test
- * category of characters. It works on std::string and std::u32string.
- *
- * With C++17, you can also use std::string_view and std::u32string_view.
- *
- * ## Installation (C++ variant)
- *
- * Just copy the files unicode.cpp and unicode.hpp and add them to your project.
- *
- * ## Installation (C variant)
- *
- * Copy the files unicode.c and unicode.h and add them to your project.
- */
--- a/gen/unicode-after.c	Wed Mar 25 09:56:05 2020 +0100
+++ b/gen/unicode-after.c	Wed Mar 25 14:33:03 2020 +0100
@@ -1,101 +1,103 @@
-static size_t
-requires(const uint32_t *src)
+size_t
+uni8_encode(uint8_t dst[], size_t dstsz, uint32_t point)
 {
-	size_t size = 0;
-	int nb;
+	assert(dst);
+
+	size_t written;
 
-	while (*src) {
-		if ((nb = uni_requires(*src++)) == -1) {
-			errno = EILSEQ;
-			return -1;
-		}
+	switch ((written = uni32_sizeof(point))) {
+	case 1:
+		if (dstsz < 1)
+			goto erange;
 
-		if (nb > SIZE_MAX - size)
+		dst[0] = (uint8_t)point;
+		break;
+	case 2:
+		if (dstsz < 2)
 			goto erange;
 
-		size += nb;
-	}
-
-	/* If SIZE_MAX -> no space for '\0' */
-	if (size == SIZE_MAX)
-		goto erange;
-
-	return size;
-
-erange:
-	errno = ERANGE;
-	return -1;
-}
-
-void
-uni_encode(uint32_t c, char *dst)
-{
-	assert(uni_requires(c) != -1);
-	assert(dst);
-
-	switch (uni_requires(c)) {
-	case 1:
-		dst[0] = (char)c;
-		dst[1] = '\0';
-		break;
-	case 2:
-		dst[0] = 0xC0 | ((c >> 6)  & 0x1F);
-		dst[1] = 0x80 | (c & 0x3F);
-		dst[2] = '\0';
+		dst[0] = 0xC0 | ((point >> 6)  & 0x1F);
+		dst[1] = 0x80 | (point & 0x3F);
 		break;
 	case 3:
-		dst[0] = 0xE0 | ((c >> 12) & 0xF );
-		dst[1] = 0x80 | ((c >> 6)  & 0x3F);
-		dst[2] = 0x80 | (c & 0x3F);
-		dst[3] = '\0';
+		if (dstsz < 3)
+			goto erange;
+
+		dst[0] = 0xE0 | ((point >> 12) & 0xF );
+		dst[1] = 0x80 | ((point >> 6)  & 0x3F);
+		dst[2] = 0x80 | (point & 0x3F);
 		break;
 	case 4:
-		dst[0] = 0xF0 | ((c >> 18) & 0x7 );
-		dst[1] = 0x80 | ((c >> 12) & 0x3F);
-		dst[2] = 0x80 | ((c >> 6)  & 0x3F);
-		dst[3] = 0x80 | (c & 0x3F);
-		dst[4] = '\0';
+		if (dstsz < 4)
+			goto erange;
+
+		dst[0] = 0xF0 | ((point >> 18) & 0x7 );
+		dst[1] = 0x80 | ((point >> 12) & 0x3F);
+		dst[2] = 0x80 | ((point >> 6)  & 0x3F);
+		dst[3] = 0x80 | (point & 0x3F);
 		break;
 	default:
 		break;
 	}
+
+	return written;
+
+erange:
+	errno = ERANGE;
+
+	return -1;
 }
 
-uint32_t
-uni_decode(const char *src)
+size_t
+uni8_decode(const uint8_t src[], uint32_t *point)
 {
 	assert(src);
-	assert(uni_sizeof(*src) != -1);
+	assert(point);
 
-	uint32_t c = 0;
+	size_t parsed;
 
-	switch (uni_sizeof(*src)) {
+	switch ((parsed = uni8_sizeof(*src))) {
 	case 1:
-		c = src[0];
+		*point = src[0];
 		break;
 	case 2:
-		c =  (src[0] & 0x1f) << 6;
-		c |= (src[1] & 0x3f);
+		if (!src[1])
+			goto eilseq;
+
+		*point =  (src[0] & 0x1f) << 6;
+		*point |= (src[1] & 0x3f);
 		break;
 	case 3:
-		c =  (src[0] & 0x0f) << 12;
-		c |= (src[1] & 0x3f) << 6;
-		c |= (src[2] & 0x3f);
+		if (!src[1] || !src[2])
+			goto eilseq;
+
+		*point =  (src[0] & 0x0f) << 12;
+		*point |= (src[1] & 0x3f) << 6;
+		*point |= (src[2] & 0x3f);
 		break;
 	case 4:
-		c =  (src[0] & 0x07) << 16;
-		c |= (src[1] & 0x3f) << 12;
-		c |= (src[2] & 0x3f) << 6;
-		c |= (src[3] & 0x3f);
+		if (!src[1] || !src[2] || !src[3])
+			goto eilseq;
+
+		*point =  (src[0] & 0x07) << 16;
+		*point |= (src[1] & 0x3f) << 12;
+		*point |= (src[2] & 0x3f) << 6;
+		*point |= (src[3] & 0x3f);
+		break;
 	default:
 		break;
 	}
 
-	return c;
+	return parsed;
+
+eilseq:
+	errno = EILSEQ;
+
+	return -1;
 }
 
-int
-uni_sizeof(unsigned char c)
+size_t
+uni8_sizeof(uint8_t c)
 {
 	if (c <= 127)
 		return 1;
@@ -106,11 +108,57 @@
 	if ((c & 0xF8) == 0xF0)
 		return 4;
 
+	errno = EILSEQ;
 	return -1;
 }
 
-int
-uni_requires(uint32_t c)
+size_t
+uni8_length(const uint8_t src[])
+{
+	assert(src);
+
+	size_t total = 0, gap;
+
+	while (*src) {
+		if ((gap = uni8_sizeof(*src)) == (size_t)-1)
+			return -1;
+
+		total += gap;
+		src += gap;
+	}
+
+	return total;
+}
+
+size_t
+uni8_to32(const uint8_t src[], uint32_t dst[], size_t dstsz)
+{
+	assert(src);
+	assert(dst);
+
+	size_t nwritten = 0, gap;
+
+	for (; *src && dstsz; --dstsz) {
+		if ((gap = uni8_decode(src, dst++)) == (size_t)-1)
+			return -1;
+
+		src += gap;
+		++nwritten;
+	}
+
+	/* No more space to store NUL. */
+	if (dstsz == 0) {
+		errno = ERANGE;
+		return -1;
+	}
+
+	*dst = 0;
+
+	return nwritten;
+}
+
+size_t
+uni32_sizeof(uint32_t c)
 {
 	if (c <= 0x7F)
 		return 1;
@@ -121,82 +169,67 @@
 	if (c <= 0x1FFFFF)
 		return 4;
 
+	errno = EILSEQ;
 	return -1;
 }
 
 size_t
-uni_length(const char *src)
+uni32_length(const uint32_t src[])
 {
+	assert(src);
+
 	size_t total = 0;
-	int gap;
+
+	while (*src++)
+		total++;
+
+	return total;
+}
+
+size_t
+uni32_requires(const uint32_t src[])
+{
+	assert(src);
+
+	size_t total = 0, gap;
 
 	while (*src) {
-		if ((gap = uni_sizeof(*src)) == -1) {
-			errno = EILSEQ;
+		if ((gap = uni32_sizeof(*src++)) == (size_t)-1)
+			return -1;
+		if (gap >= SIZE_MAX - total) {
+			errno = ERANGE;
 			return -1;
 		}
 
 		total += gap;
-		src += gap;
 	}
 
 	return total;
 }
 
-char *
-uni_toutf8(const uint32_t *src)
-{
-	assert(src);
-
-	size_t total;
-	char *out, *ptr;
-	int nb;
-
-	if ((total = requires(src)) == -1)
-		return NULL;
-	if (!(out = malloc(total + 1)))
-		return NULL;
-
-	ptr = out;
-
-	while (*src) {
-		nb = uni_requires(*src);
-		uni_encode(*src++, ptr);
-		ptr += nb;
-	}
-
-	return out;
-}
-
-uint32_t *
-uni_toutf32(const char *src)
+size_t
+uni32_to8(const uint32_t src[], uint8_t dst[], size_t dstsz)
 {
 	assert(src);
+	assert(dst);
 
-	size_t length;
-	uint32_t *out, *ptr;
-	int nb;
+	size_t nwritten = 0, gap;
 
-	if ((length = uni_length(src)) == -1)
-		return NULL;
-	if (length == SIZE_MAX) {
-		errno = ERANGE;
-		return NULL;
-	}
-	if (!(out = malloc(sizeof (uint32_t) * length + 1)))
-		return NULL;
+	while (*src && dstsz) {
+		if ((gap = uni8_encode(dst, dstsz, *src++)) == (size_t)-1)
+			return -1;
 
-	ptr = out;
-
-	while (*src) {
-		/* No checks needed, uni_length already did it for us. */
-		int nb = uni_sizeof(*src);
-
-		*ptr++ = uni_decode(src);
-		src += nb;
+		dst += gap;
+		dstsz -= gap;
+		nwritten += gap;
 	}
 
-	*ptr = 0;
+	if (dstsz == 0) {
+		errno = ERANGE;
+		return -1;
+	}
 
-	return out;
+	*dst = 0;
+
+	return nwritten;
 }
--- a/test/unicode.c	Wed Mar 25 09:56:05 2020 +0100
+++ b/test/unicode.c	Wed Mar 25 14:33:03 2020 +0100
@@ -47,99 +47,346 @@
 	return l1 == l2 && memcmp(s1, s2, l1) == 0;
 }
 
-/*
- * Conversion UTF32 -> UTF8
- * ------------------------------------------------------------------
- */
+GREATEST_TEST
+test_uni8_encode_basic(void)
+{
+	size_t r;
+
+	/* a -> 1 bytes. */
+	{
+		uint8_t buffer[5] = { 0 };
+
+		r = uni8_encode(buffer, sizeof (buffer), U'a');
+		GREATEST_ASSERT_EQ(r, 1);
+		GREATEST_ASSERT_STR_EQ(buffer, u8"a");
+	}
+
+	/* é -> 2 bytes. */
+	{
+		uint8_t buffer[5] = { 0 };
+
+		r = uni8_encode(buffer, sizeof (buffer), U'é');
+		GREATEST_ASSERT_EQ(r, 2);
+		GREATEST_ASSERT_STR_EQ(buffer, u8"é");
+	}
+
+	GREATEST_PASS();
+}
 
 GREATEST_TEST
-utf32_to_utf8_ascii(void)
+test_uni8_encode_invalid(void)
 {
-	const uint32_t u32[] = { U'a', U'b', U'c', 0 };
-	char *s = uni_toutf8(u32);
+	size_t r;
+	uint8_t buffer[5] = { 0 };
 
-	GREATEST_ASSERT_STR_EQ(s, "abc");
+	r = uni8_encode(buffer, sizeof (buffer), 0xffffffff);
+	GREATEST_ASSERT_EQ(r, (size_t)-1);
+	GREATEST_ASSERT_EQ(errno, EILSEQ);
 	GREATEST_PASS();
 }
 
 GREATEST_TEST
-utf32_to_utf8_valid(void)
+test_uni8_encode_toosmall(void)
+{
+	size_t r;
+	uint8_t buffer[1] = { 0 };
+
+	r = uni8_encode(buffer, sizeof (buffer), U'é');
+	GREATEST_ASSERT_EQ(r, (size_t)-1);
+	GREATEST_ASSERT_EQ(errno, ERANGE);
+	GREATEST_PASS();
+}
+
+GREATEST_SUITE(suite_uni8_encode)
+{
+	GREATEST_RUN_TEST(test_uni8_encode_basic);
+	GREATEST_RUN_TEST(test_uni8_encode_invalid);
+	GREATEST_RUN_TEST(test_uni8_encode_toosmall);
+}
+
+GREATEST_TEST
+test_uni8_decode_basic(void)
 {
-	const uint32_t u32[] = { 'a', U'é', 'c', U'𠀀', 0 };
-	char *s = uni_toutf8(u32);
+	size_t r;
+
+	/* a -> 1 bytes. */
+	{
+		uint32_t code = -1;
 
-	GREATEST_ASSERT_STR_EQ(s, u8"aéc𠀀");
+		r = uni8_decode(u8"a", &code);
+		GREATEST_ASSERT_EQ(r, 1);
+		GREATEST_ASSERT_EQ(code, 'a');
+	}
+
+	/* é -> 2 bytes. */
+	{
+		uint32_t code = -1;
+
+		r = uni8_decode(u8"é", &code);
+		GREATEST_ASSERT_EQ(r, 2);
+		GREATEST_ASSERT_EQ(code, U'é');
+	}
+
 	GREATEST_PASS();
 }
 
 GREATEST_TEST
-utf32_to_utf8_invalid(void)
+test_uni8_decode_invalid(void)
+{
+	size_t r;
+
+	/* Invalid UTF-8 sequence. */
+	{
+		uint32_t code = -1;
+
+		r = uni8_decode(u8"\xff""a", &code);
+		GREATEST_ASSERT_EQ(r, (size_t)-1);
+		GREATEST_ASSERT_EQ(code, (uint32_t)-1);
+		GREATEST_ASSERT_EQ(errno, EILSEQ);
+	}
+
+	/* Valid "€" but unfinished sequence. */
+	{
+		uint32_t code = -1;
+
+		r = uni8_decode((const uint8_t []){ -30, 0 }, &code);
+		GREATEST_ASSERT_EQ(r, (size_t)-1);
+		GREATEST_ASSERT_EQ(code, (uint32_t)-1);
+		GREATEST_ASSERT_EQ(errno, EILSEQ);
+	}
+
+	GREATEST_PASS();
+}
+
+GREATEST_SUITE(suite_uni8_decode)
 {
-	const uint32_t u32[] = { 'a', 0xFFFFFFFF, 'c', 0 };
-	char *s = uni_toutf8(u32);
+	GREATEST_RUN_TEST(test_uni8_decode_basic);
+	GREATEST_RUN_TEST(test_uni8_decode_invalid);
+}
+
+GREATEST_TEST
+test_uni8_sizeof_basic(void)
+{
+	GREATEST_ASSERT_EQ(1, uni8_sizeof(u8"a"[0]));
+	GREATEST_ASSERT_EQ(2, uni8_sizeof(u8"é"[0]));
+	GREATEST_ASSERT_EQ(3, uni8_sizeof(u8"€"[0]));
+	GREATEST_ASSERT_EQ(4, uni8_sizeof(u8"𐍈"[0]));
+	GREATEST_PASS();
+}
 
-	GREATEST_ASSERT(!s);
+GREATEST_TEST
+test_uni8_sizeof_invalid(void)
+{
+	GREATEST_ASSERT_EQ((size_t)-1, uni8_sizeof(u8"\xff"[0]));
+	GREATEST_ASSERT_EQ(errno, EILSEQ);
+	GREATEST_PASS();
+}
+
+GREATEST_SUITE(suite_uni8_sizeof)
+{
+	GREATEST_RUN_TEST(test_uni8_sizeof_basic);
+	GREATEST_RUN_TEST(test_uni8_sizeof_invalid);
+}
+
+GREATEST_TEST
+test_uni8_length_basic(void)
+{
+	GREATEST_ASSERT_EQ(3, uni8_length("abc"));
+	GREATEST_ASSERT_EQ(4, uni8_length("5€"));
+	GREATEST_PASS();
+}
+
+GREATEST_TEST
+test_uni8_length_invalid(void)
+{
+	GREATEST_ASSERT_EQ((size_t)-1, uni8_length("a""\xff""b"));
 	GREATEST_ASSERT_EQ(errno, EILSEQ);
 	GREATEST_PASS();
 }
 
-GREATEST_SUITE(utf32_to_utf8)
+GREATEST_SUITE(suite_uni8_length)
+{
+	GREATEST_RUN_TEST(test_uni8_length_basic);
+	GREATEST_RUN_TEST(test_uni8_length_invalid);
+}
+
+GREATEST_TEST
+test_uni8_to32_basic(void)
 {
-	GREATEST_RUN_TEST(utf32_to_utf8_ascii);
-	GREATEST_RUN_TEST(utf32_to_utf8_valid);
-	GREATEST_RUN_TEST(utf32_to_utf8_invalid);
+	size_t r;
+
+	{
+		uint32_t buffer[10] = { 0 };
+		uint32_t expected[] = { U'a', U'b', U'c', 0 };
+
+		r = uni8_to32("abc", buffer, 10);
+		GREATEST_ASSERT_EQ(r, 3);
+		GREATEST_ASSERT(u32cmp(buffer, expected));
+	}
+
+	{
+		uint32_t buffer[10] = { 0 };
+		uint32_t expected[] = { U'a', U'é', U'c', 0 };
+
+		r = uni8_to32("aéc", buffer, 10);
+		GREATEST_ASSERT_EQ(r, 3);
+		GREATEST_ASSERT(u32cmp(buffer, expected));
+	}
+
+	GREATEST_PASS();
 }
 
-/*
- * Conversion UTF8 -> UTF32
- * ------------------------------------------------------------------
- */
+GREATEST_TEST
+test_uni8_to32_invalid(void)
+{
+	size_t r;
+	uint32_t buffer[10] = { 0 };
+
+	/* Invalid UTF-8 sequence. */
+	r = uni8_to32(u8"\xff""a", buffer, 10);
+	GREATEST_ASSERT_EQ(r, (size_t)-1);
+	GREATEST_ASSERT_EQ(errno, EILSEQ);
+
+	/* Valid "€" but unfinished sequence. */
+	r = uni8_to32((const uint8_t []){ -30, 0 }, buffer, 10);
+	GREATEST_ASSERT_EQ(r, (size_t)-1);
+	GREATEST_ASSERT_EQ(errno, EILSEQ);
+
+	GREATEST_PASS();
+}
 
 GREATEST_TEST
-utf8_to_utf32_ascii(void)
+test_uni8_to32_toosmall(void)
+{
+	size_t r;
+	uint32_t buffer[4] = { 0 };
+
+	r = uni8_to32(u8"bonjour à tous", buffer, 1);
+	GREATEST_ASSERT_EQ(r, (size_t)-1);
+	GREATEST_ASSERT_EQ(errno, ERANGE);
+	GREATEST_PASS();
+}
+
+GREATEST_SUITE(suite_uni8_to32)
 {
-	const char *s = "abc";
-	const uint32_t expected[] = { U'a', U'b', U'c', 0 };
+	GREATEST_RUN_TEST(test_uni8_to32_basic);
+	GREATEST_RUN_TEST(test_uni8_to32_invalid);
+	GREATEST_RUN_TEST(test_uni8_to32_toosmall);
+}
 
-	GREATEST_ASSERT(u32cmp(expected, uni_toutf32(s)));
+GREATEST_TEST
+test_uni32_sizeof_basic(void)
+{
+	GREATEST_ASSERT_EQ(1, uni32_sizeof(U'a'));
+	GREATEST_ASSERT_EQ(2, uni32_sizeof(U'é'));
+	GREATEST_ASSERT_EQ(3, uni32_sizeof(U'€'));
+	GREATEST_ASSERT_EQ(4, uni32_sizeof(U'𐍈'));
 	GREATEST_PASS();
 }
 
 GREATEST_TEST
-utf8_to_utf32_valid(void)
+test_uni32_sizeof_invalid(void)
+{
+	GREATEST_ASSERT_EQ((size_t)-1, uni32_sizeof(0xffffffff));
+	GREATEST_ASSERT_EQ(errno, EILSEQ);
+	GREATEST_PASS();
+}
+
+GREATEST_SUITE(suite_uni32_sizeof)
 {
-	const char *s = u8"aéc𠀀";
-	const uint32_t expected[] = { U'a', U'é', U'c', U'𠀀', 0 };
+	GREATEST_RUN_TEST(test_uni32_sizeof_basic);
+	GREATEST_RUN_TEST(test_uni32_sizeof_invalid);
+}
+
+GREATEST_TEST
+test_uni32_length(void)
+{
+	GREATEST_ASSERT_EQ(3, uni32_length((const uint32_t []){ U'a', U'é', U'c', 0 }));
+	GREATEST_PASS();
+}
 
-	GREATEST_ASSERT(u32cmp(expected, uni_toutf32(s)));
+GREATEST_SUITE(suite_uni32_length)
+{
+	GREATEST_RUN_TEST(test_uni32_length);
+}
+
+GREATEST_TEST
+test_uni32_requires_basic(void)
+{
+	GREATEST_ASSERT_EQ(3, uni32_requires(U"abc"));
+	GREATEST_ASSERT_EQ(9, uni32_requires(U"é€𐍈"));
 	GREATEST_PASS();
 }
 
 GREATEST_TEST
-utf8_to_utf32_invalid(void)
+test_uni32_requires_invalid(void)
 {
-	const char *s = "a" "\xff""b";
-	const uint32_t *result = uni_toutf32(s);
-
-	GREATEST_ASSERT(!result);
+	GREATEST_ASSERT_EQ((size_t)-1, uni32_requires(U"\xffffffff"));
 	GREATEST_ASSERT_EQ(errno, EILSEQ);
 	GREATEST_PASS();
 }
 
-GREATEST_SUITE(utf8_to_utf32)
+GREATEST_SUITE(suite_uni32_requires)
+{
+	GREATEST_RUN_TEST(test_uni32_requires_basic);
+	GREATEST_RUN_TEST(test_uni32_requires_invalid);
+}
+
+GREATEST_TEST
+test_uni32_to8_basic(void)
 {
-	GREATEST_RUN_TEST(utf8_to_utf32_ascii);
-	GREATEST_RUN_TEST(utf8_to_utf32_valid);
-	GREATEST_RUN_TEST(utf8_to_utf32_invalid);
+	size_t r;
+
+	{
+		uint8_t buffer[10] = { 0 };
+
+		r = uni32_to8(U"abc", buffer, sizeof (buffer));
+		GREATEST_ASSERT_EQ(r, 3);
+		GREATEST_ASSERT_STR_EQ(buffer, u8"abc");
+	}
+
+	{
+		uint8_t buffer[20] = { 0 };
+
+		r = uni32_to8(U"ça va, 5€ ?", buffer, sizeof (buffer));
+		GREATEST_ASSERT_EQ(r, 14);
+		GREATEST_ASSERT_STR_EQ(buffer, u8"ça va, 5€ ?");
+	}
+
+	GREATEST_PASS();
 }
 
-/*
- * Checks functions
- * ------------------------------------------------------------------
- */
+GREATEST_TEST
+test_uni32_to8_invalid(void)
+{
+	uint8_t buffer[10] = { 0 };
+
+	GREATEST_ASSERT_EQ(uni32_to8(U"\xffffffff", buffer, sizeof (buffer)), (size_t)-1);
+	GREATEST_ASSERT_EQ(errno, EILSEQ);
+	GREATEST_PASS();
+}
 
 GREATEST_TEST
-checks_isalpha(void)
+test_uni32_to8_toosmall(void)
+{
+	size_t r;
+	uint8_t buffer[3] = { 0 };
+
+	r = uni32_to8(U"ça va ?", buffer, sizeof (buffer));
+	GREATEST_ASSERT_EQ(r, (size_t)-1);
+	GREATEST_ASSERT_EQ(errno, ERANGE);
+	GREATEST_PASS();
+}
+
+GREATEST_SUITE(suite_uni32_to8)
+{
+	GREATEST_RUN_TEST(test_uni32_to8_basic);
+	GREATEST_RUN_TEST(test_uni32_to8_invalid);
+	GREATEST_RUN_TEST(test_uni32_to8_toosmall);
+}
+
+GREATEST_TEST
+test_misc_isalpha(void)
 {
 	GREATEST_ASSERT(uni_isalpha(U'é'));
 	GREATEST_ASSERT(!uni_isalpha(U'€'));
@@ -147,7 +394,7 @@
 }
 
 GREATEST_TEST
-checks_isdigit(void)
+test_misc_isdigit(void)
 {
 	GREATEST_ASSERT(uni_isdigit(U'۱'));
 	GREATEST_ASSERT(!uni_isdigit(U'€'));
@@ -155,7 +402,7 @@
 }
 
 GREATEST_TEST
-checks_islower(void)
+test_misc_islower(void)
 {
 	GREATEST_ASSERT(uni_islower(U'a'));
 	GREATEST_ASSERT(uni_islower(U'é'));
@@ -165,7 +412,7 @@
 }
 
 GREATEST_TEST
-checks_isspace(void)
+test_misc_isspace(void)
 {
 	GREATEST_ASSERT(uni_isspace(U' '));
 	GREATEST_ASSERT(!uni_isspace(U'é'));
@@ -173,7 +420,7 @@
 }
 
 GREATEST_TEST
-checks_istitle(void)
+test_misc_istitle(void)
 {
 	GREATEST_ASSERT(uni_istitle(U'Dž'));
 	GREATEST_ASSERT(!uni_istitle(U'€'));
@@ -181,7 +428,7 @@
 }
 
 GREATEST_TEST
-checks_isupper(void)
+test_misc_isupper(void)
 {
 	GREATEST_ASSERT(!uni_isupper('a'));
 	GREATEST_ASSERT(!uni_isupper(U'é'));
@@ -190,50 +437,14 @@
 	GREATEST_PASS();
 }
 
-GREATEST_SUITE(checks)
-{
-	GREATEST_RUN_TEST(checks_isalpha);
-	GREATEST_RUN_TEST(checks_isdigit);
-	GREATEST_RUN_TEST(checks_islower);
-	GREATEST_RUN_TEST(checks_isspace);
-	GREATEST_RUN_TEST(checks_istitle);
-	GREATEST_RUN_TEST(checks_isupper);
-}
-
-/*
- * Miscellaneous
- * ------------------------------------------------------------------
- */
-
-GREATEST_TEST
-misc_requires(void)
+GREATEST_SUITE(suite_misc)
 {
-	GREATEST_ASSERT(uni_requires('a') == 1);
-	GREATEST_ASSERT(uni_requires(U'é') == 2);
-	GREATEST_ASSERT(uni_requires(U'€') == 3);
-	GREATEST_ASSERT(uni_requires(U'𠀀') == 4);
-	GREATEST_PASS();
-}
-
-GREATEST_TEST
-misc_sizeof(void)
-{
-	const char *s1 = u8"a";
-	const char *s2 = u8"é";
-	const char *s3 = u8"€";
-	const char *s4 = u8"𠀀";
-
-	GREATEST_ASSERT(uni_sizeof(s1[0]) == 1);
-	GREATEST_ASSERT(uni_sizeof(s2[0]) == 2);
-	GREATEST_ASSERT(uni_sizeof(s3[0]) == 3);
-	GREATEST_ASSERT(uni_sizeof(s4[0]) == 4);
-	GREATEST_PASS();
-}
-
-GREATEST_SUITE(misc)
-{
-	GREATEST_RUN_TEST(misc_requires);
-	GREATEST_RUN_TEST(misc_sizeof);
+	GREATEST_RUN_TEST(test_misc_isalpha);
+	GREATEST_RUN_TEST(test_misc_isdigit);
+	GREATEST_RUN_TEST(test_misc_islower);
+	GREATEST_RUN_TEST(test_misc_isspace);
+	GREATEST_RUN_TEST(test_misc_istitle);
+	GREATEST_RUN_TEST(test_misc_isupper);
 }
 
 GREATEST_MAIN_DEFS();
@@ -242,9 +453,15 @@
 main(int argc, char **argv)
 {
 	GREATEST_MAIN_BEGIN();
-	GREATEST_RUN_SUITE(utf32_to_utf8);
-	GREATEST_RUN_SUITE(utf8_to_utf32);
-	GREATEST_RUN_SUITE(checks);
-	GREATEST_RUN_SUITE(misc);
+	GREATEST_RUN_SUITE(suite_uni8_encode);
+	GREATEST_RUN_SUITE(suite_uni8_decode);
+	GREATEST_RUN_SUITE(suite_uni8_sizeof);
+	GREATEST_RUN_SUITE(suite_uni8_length);
+	GREATEST_RUN_SUITE(suite_uni8_to32);
+	GREATEST_RUN_SUITE(suite_uni32_sizeof);
+	GREATEST_RUN_SUITE(suite_uni32_length);
+	GREATEST_RUN_SUITE(suite_uni32_requires);
+	GREATEST_RUN_SUITE(suite_uni32_to8);
+	GREATEST_RUN_SUITE(suite_misc);
 	GREATEST_MAIN_END();
 }
--- a/unicode.c	Wed Mar 25 09:56:05 2020 +0100
+++ b/unicode.c	Wed Mar 25 14:33:03 2020 +0100
@@ -4608,104 +4608,106 @@
 	return c;
 }
 
-static size_t
-requires(const uint32_t *src)
+size_t
+uni8_encode(uint8_t dst[], size_t dstsz, uint32_t point)
 {
-	size_t size = 0;
-	int nb;
+	assert(dst);
+
+	size_t written;
 
-	while (*src) {
-		if ((nb = uni_requires(*src++)) == -1) {
-			errno = EILSEQ;
-			return -1;
-		}
+	switch ((written = uni32_sizeof(point))) {
+	case 1:
+		if (dstsz < 1)
+			goto erange;
 
-		if (nb > SIZE_MAX - size)
+		dst[0] = (uint8_t)point;
+		break;
+	case 2:
+		if (dstsz < 2)
 			goto erange;
 
-		size += nb;
-	}
-
-	/* If SIZE_MAX -> no space for '\0' */
-	if (size == SIZE_MAX)
-		goto erange;
-
-	return size;
-
-erange:
-	errno = ERANGE;
-	return -1;
-}
-
-void
-uni_encode(uint32_t c, char *dst)
-{
-	assert(uni_requires(c) != -1);
-	assert(dst);
-
-	switch (uni_requires(c)) {
-	case 1:
-		dst[0] = (char)c;
-		dst[1] = '\0';
-		break;
-	case 2:
-		dst[0] = 0xC0 | ((c >> 6)  & 0x1F);
-		dst[1] = 0x80 | (c & 0x3F);
-		dst[2] = '\0';
+		dst[0] = 0xC0 | ((point >> 6)  & 0x1F);
+		dst[1] = 0x80 | (point & 0x3F);
 		break;
 	case 3:
-		dst[0] = 0xE0 | ((c >> 12) & 0xF );
-		dst[1] = 0x80 | ((c >> 6)  & 0x3F);
-		dst[2] = 0x80 | (c & 0x3F);
-		dst[3] = '\0';
+		if (dstsz < 3)
+			goto erange;
+
+		dst[0] = 0xE0 | ((point >> 12) & 0xF );
+		dst[1] = 0x80 | ((point >> 6)  & 0x3F);
+		dst[2] = 0x80 | (point & 0x3F);
 		break;
 	case 4:
-		dst[0] = 0xF0 | ((c >> 18) & 0x7 );
-		dst[1] = 0x80 | ((c >> 12) & 0x3F);
-		dst[2] = 0x80 | ((c >> 6)  & 0x3F);
-		dst[3] = 0x80 | (c & 0x3F);
-		dst[4] = '\0';
+		if (dstsz < 4)
+			goto erange;
+
+		dst[0] = 0xF0 | ((point >> 18) & 0x7 );
+		dst[1] = 0x80 | ((point >> 12) & 0x3F);
+		dst[2] = 0x80 | ((point >> 6)  & 0x3F);
+		dst[3] = 0x80 | (point & 0x3F);
 		break;
 	default:
 		break;
 	}
+
+	return written;
+
+erange:
+	errno = ERANGE;
+
+	return -1;
 }
 
-uint32_t
-uni_decode(const char *src)
+size_t
+uni8_decode(const uint8_t src[], uint32_t *point)
 {
 	assert(src);
-	assert(uni_sizeof(*src) != -1);
+	assert(point);
 
-	uint32_t c = 0;
+	size_t parsed;
 
-	switch (uni_sizeof(*src)) {
+	switch ((parsed = uni8_sizeof(*src))) {
 	case 1:
-		c = src[0];
+		*point = src[0];
 		break;
 	case 2:
-		c =  (src[0] & 0x1f) << 6;
-		c |= (src[1] & 0x3f);
+		if (!src[1])
+			goto eilseq;
+
+		*point =  (src[0] & 0x1f) << 6;
+		*point |= (src[1] & 0x3f);
 		break;
 	case 3:
-		c =  (src[0] & 0x0f) << 12;
-		c |= (src[1] & 0x3f) << 6;
-		c |= (src[2] & 0x3f);
+		if (!src[1] || !src[2])
+			goto eilseq;
+
+		*point =  (src[0] & 0x0f) << 12;
+		*point |= (src[1] & 0x3f) << 6;
+		*point |= (src[2] & 0x3f);
 		break;
 	case 4:
-		c =  (src[0] & 0x07) << 16;
-		c |= (src[1] & 0x3f) << 12;
-		c |= (src[2] & 0x3f) << 6;
-		c |= (src[3] & 0x3f);
+		if (!src[1] || !src[2] || !src[3])
+			goto eilseq;
+
+		*point =  (src[0] & 0x07) << 16;
+		*point |= (src[1] & 0x3f) << 12;
+		*point |= (src[2] & 0x3f) << 6;
+		*point |= (src[3] & 0x3f);
+		break;
 	default:
 		break;
 	}
 
-	return c;
+	return parsed;
+
+eilseq:
+	errno = EILSEQ;
+
+	return -1;
 }
 
-int
-uni_sizeof(unsigned char c)
+size_t
+uni8_sizeof(uint8_t c)
 {
 	if (c <= 127)
 		return 1;
@@ -4716,11 +4718,57 @@
 	if ((c & 0xF8) == 0xF0)
 		return 4;
 
+	errno = EILSEQ;
 	return -1;
 }
 
-int
-uni_requires(uint32_t c)
+size_t
+uni8_length(const uint8_t src[])
+{
+	assert(src);
+
+	size_t total = 0, gap;
+
+	while (*src) {
+		if ((gap = uni8_sizeof(*src)) == (size_t)-1)
+			return -1;
+
+		total += gap;
+		src += gap;
+	}
+
+	return total;
+}
+
+size_t
+uni8_to32(const uint8_t src[], uint32_t dst[], size_t dstsz)
+{
+	assert(src);
+	assert(dst);
+
+	size_t nwritten = 0, gap;
+
+	for (; *src && dstsz; --dstsz) {
+		if ((gap = uni8_decode(src, dst++)) == (size_t)-1)
+			return -1;
+
+		src += gap;
+		++nwritten;
+	}
+
+	/* No more space to store NUL. */
+	if (dstsz == 0) {
+		errno = ERANGE;
+		return -1;
+	}
+
+	*dst = 0;
+
+	return nwritten;
+}
+
+size_t
+uni32_sizeof(uint32_t c)
 {
 	if (c <= 0x7F)
 		return 1;
@@ -4731,82 +4779,67 @@
 	if (c <= 0x1FFFFF)
 		return 4;
 
+	errno = EILSEQ;
 	return -1;
 }
 
 size_t
-uni_length(const char *src)
+uni32_length(const uint32_t src[])
 {
+	assert(src);
+
 	size_t total = 0;
-	int gap;
+
+	while (*src++)
+		total++;
+
+	return total;
+}
+
+size_t
+uni32_requires(const uint32_t src[])
+{
+	assert(src);
+
+	size_t total = 0, gap;
 
 	while (*src) {
-		if ((gap = uni_sizeof(*src)) == -1) {
-			errno = EILSEQ;
+		if ((gap = uni32_sizeof(*src++)) == (size_t)-1)
+			return -1;
+		if (gap >= SIZE_MAX - total) {
+			errno = ERANGE;
 			return -1;
 		}
 
 		total += gap;
-		src += gap;
 	}
 
 	return total;
 }
 
-char *
-uni_toutf8(const uint32_t *src)
-{
-	assert(src);
-
-	size_t total;
-	char *out, *ptr;
-	int nb;
-
-	if ((total = requires(src)) == -1)
-		return NULL;
-	if (!(out = malloc(total + 1)))
-		return NULL;
-
-	ptr = out;
-
-	while (*src) {
-		nb = uni_requires(*src);
-		uni_encode(*src++, ptr);
-		ptr += nb;
-	}
-
-	return out;
-}
-
-uint32_t *
-uni_toutf32(const char *src)
+size_t
+uni32_to8(const uint32_t src[], uint8_t dst[], size_t dstsz)
 {
 	assert(src);
+	assert(dst);
 
-	size_t length;
-	uint32_t *out, *ptr;
-	int nb;
+	size_t nwritten = 0, gap;
 
-	if ((length = uni_length(src)) == -1)
-		return NULL;
-	if (length == SIZE_MAX) {
-		errno = ERANGE;
-		return NULL;
-	}
-	if (!(out = malloc(sizeof (uint32_t) * length + 1)))
-		return NULL;
+	while (*src && dstsz) {
+		if ((gap = uni8_encode(dst, dstsz, *src++)) == (size_t)-1)
+			return -1;
 
-	ptr = out;
-
-	while (*src) {
-		/* No checks needed, uni_length already did it for us. */
-		int nb = uni_sizeof(*src);
-
-		*ptr++ = uni_decode(src);
-		src += nb;
+		dst += gap;
+		dstsz -= gap;
+		nwritten += gap;
 	}
 
-	*ptr = 0;
+	if (dstsz == 0) {
+		errno = ERANGE;
+		return -1;
+	}
 
-	return out;
+	*dst = 0;
+
+	return nwritten;
 }
--- a/unicode.h	Wed Mar 25 09:56:05 2020 +0100
+++ b/unicode.h	Wed Mar 25 14:33:03 2020 +0100
@@ -32,34 +32,31 @@
 /**
  * Encode the unicode code point into multibyte string.
  *
- * \pre point must be valid
- * \pre destination must have space for at least 5 bytes
- * \param point the unicode code point
- * \param res the output buffer
- * \see \ref uni_requires
+ * To make sure that the buffer is always large enough, you may pass a buffer
+ * of size 4 as it is the longest UTF-8 sequence for a single code point.
+ *
+ * \pre dst != NULL
+ * \param dst the UTF-8 buffer destination
+ * \param dstsz the size available in dst
+ * \param point the unicode character
+ * \return The number of bytes written (no NUL terminator is appended) or -1
+ *         on error and sets errno.
+ * \warning The destination is **not** NUL terminated.
  */
-void
-uni_encode(uint32_t point, char *res);
+size_t
+uni8_encode(uint8_t dst[], size_t dstsz, uint32_t point);
 
 /**
  * Decode the multibyte buffer into an unicode code point.
  *
- * \pre src must be a valid UTF-8 string
- * \param src the source string
- * \return the converted code point
- * \see \ref uni_sizeof
+ * \pre src != NULL
+ * \pre point != NULL
+ * \param src the UTF-8 source string
+ * \param point the unicode character destination
+ * \return The number of bytes parsed in src or -1 on error and sets errno.
  */
-uint32_t
-uni_decode(const char *src);
-
-/**
- * Get the number of bytes required for the unicode point.
- *
- * \param point the unicode point
- * \return the number of bytes [1-4] or -1 if invalid
- */
-int
-uni_requires(uint32_t point);
+size_t
+uni8_decode(const uint8_t src[], uint32_t *point);
 
 /**
  * Get the number of bytes that follow this UTF-8 character.
@@ -68,48 +65,99 @@
  * character.
  *
  * \param c the first multi byte character
- * \return the number of bytes [1-4] or -1 if invalid
+ * \return The number of bytes [1-4] or -1 if invalid and sets errno.
+ * \warning You may still need to verify that following characters are valid as
+ *          this function only returns the number of bytes that *should*
+ *          exist after this one.
  */
-int
-uni_sizeof(unsigned char c);
+size_t
+uni8_sizeof(uint8_t c);
+
+/**
+ * Get the real number of unicode characters in a string.
+ *
+ * \pre src != NULL
+ * \param src the UTF-8 string
+ * \return The number of unicode characters or -1 on error and sets errno.
+ */
+size_t
+uni8_length(const uint8_t src[]);
 
 /**
- * Get real number of character in a string.
+ * Convert a UTF-8 string to UTF-32 string.
  *
+ * This function will write at most dstsz code points in dst including the NUL
+ * terminator. The caller is responsible for providing an area large enough to
+ * store the required number of unicode characters plus the NUL terminator.
+ *
+ * Use \ref uni8_length to determine the number of characters required.
+ *
+ * \pre src != NULL
+ * \pre dst != NULL
  * \param src the UTF-8 string
- * \return the number of unicode codepoints or -1 on error and sets errno
- *         accordingly.
+ * \param dst the UTF-32 destination
+ * \param dstsz the number of code points available in dst
+ * \return The number of code points written (excluding the NUL terminator) or
+ *         -1 on error and sets errno.
+ * \see \ref uni8_length
  */
 size_t
-uni_length(const char *src);
+uni8_to32(const uint8_t src[], uint32_t dst[], size_t dstsz);
+
+/**
+ * Get the number of bytes required for the unicode point.
+ *
+ * \param point the unicode point
+ * \return The number of bytes [1-4] or -1 on error and sets errno.
+ */
+size_t
+uni32_sizeof(uint32_t point);
+
+/**
+ * Get the number of characters in src.
+ *
+ * \pre src != NULL
+ * \param src the NUL terminated UTF-32 string
+ * \return The number of unicode characters.
+ */
+size_t
+uni32_length(const uint32_t src[]);
+
+/**
+ * Determine the number of UTF-8 bytes, excluding the NUL terminator, that
+ * are needed to convert this UTF-32 string to UTF-8.
+ *
+ * \pre src != NULL
+ * \param src the UTF-32 source string
+ * \return The number of bytes required excluding the NUL terminator or -1 on
+ *         error and sets errno.
+ */
+size_t
+uni32_requires(const uint32_t src[]);
 
 /**
  * Convert a UTF-32 string to UTF-8 string.
  *
- * \pre src != NULL
- * \param src the UTF-32 string
- * \return a nul-terminated string or NULL on error and sets errno accordingly
- * \note The returned string must be free'ed by the caller
- */
-char *
-uni_toutf8(const uint32_t *src);
-
-/**
- * Convert a UTF-8 string to UTF-32 string.
+ * The output buffer will be filled with at most `dstsz` bytes including the
+ * NUL terminator. The function \ref uni32_requires can be used to determine
+ * the number of bytes required.
  *
  * \pre src != NULL
- * \param src the UTF-8 string
- * \return a nul-terminated string or NULL on error and sets errno accordingly
- * \note The returned string must be free'ed by the caller
+ * \pre dst != NULL
+ * \param src the UTF-32 string
+ * \param dst the string destination
+ * \param dstsz the number of bytes available in dst
+ * \return the number of bytes written or -1 on error and sets errno
+ *         accordingly.
  */
-uint32_t *
-uni_toutf32(const char *src);
+size_t
+uni32_to8(const uint32_t src[], uint8_t dst[], size_t dstsz);
 
 /**
  * Check if the unicode character is alpha category.
  *
  * \param c the character
- * \return true if alpha
+ * \return True if alpha.
  */
 bool
 uni_isalpha(uint32_t c);
@@ -118,7 +166,7 @@
  * Check if the unicode character is digit.
  *
  * \param c the character
- * \return true if digit
+ * \return True if digit.
  */
 bool
 uni_isdigit(uint32_t c);
@@ -127,7 +175,7 @@
  * Check if the unicode character is lower case.
  *
  * \param c the character
- * \return true if lower case
+ * \return True if lower case.
  */
 bool
 uni_islower(uint32_t c);
@@ -136,7 +184,7 @@
  * Check if the unicode character is space.
  *
  * \param c the character
- * \return true if space
+ * \return True if space.
  */
 bool
 uni_isspace(uint32_t c);
@@ -145,7 +193,7 @@
  * Check if the unicode character is title case.
  *
  * \param c the character
- * \return true if title case
+ * \return True if title case.
  */
 bool
 uni_istitle(uint32_t c);
@@ -154,7 +202,7 @@
  * Check if the unicode character is upper case.
  *
  * \param c the character
- * \return true if upper case
+ * \return True if upper case.
  */
 bool
 uni_isupper(uint32_t c);
@@ -163,7 +211,7 @@
  * Convert to upper case.
  *
  * \param c the character
- * \return the upper case character
+ * \return The upper case character.
  */
 uint32_t
 uni_toupper(uint32_t c);
@@ -172,7 +220,7 @@
  * Convert to lower case.
  *
  * \param c the character
- * \return the lower case character
+ * \return The lower case character.
  */
 uint32_t
 uni_tolower(uint32_t c);
@@ -181,7 +229,7 @@
  * Convert to title case.
  *
  * \param c the character
- * \return the title case character
+ * \return The title case character.
  */
 uint32_t
 uni_totitle(uint32_t c);

mercurial