Mercurial > libunicode
diff unicode.h @ 11:43a9d763656b
unicode: improve C API, removing dynamic allocations
author | David Demelier <markand@malikania.fr> |
---|---|
date | Wed, 25 Mar 2020 14:33:03 +0100 |
parents | ae1003c2a284 |
children | 153c09cc6dcb |
line wrap: on
line diff
--- a/unicode.h Wed Mar 25 09:56:05 2020 +0100 +++ b/unicode.h Wed Mar 25 14:33:03 2020 +0100 @@ -32,34 +32,31 @@ /** * Encode the unicode code point into multibyte string. * - * \pre point must be valid - * \pre destination must have space for at least 5 bytes - * \param point the unicode code point - * \param res the output buffer - * \see \ref uni_requires + * To make sure that buffer is always large enough, you may pass a buffer of + * size 4 as it's the largest UTF-8 string for now. + * + * \pre dst != NULL + * \param dst the UTF-8 buffer destination + * \param dstsz the size available in dst + * \param point the unicode character + * \return The number of bytes written (excluding the null terminator) or -1 on + * error and sets errno. + * \warning The destination is **not** NUL terminated. */ -void -uni_encode(uint32_t point, char *res); +size_t +uni8_encode(uint8_t dst[], size_t dstsz, uint32_t point); /** * Decode the multibyte buffer into an unicode code point. * - * \pre src must be a valid UTF-8 string - * \param src the source string - * \return the converted code point - * \see \ref uni_sizeof + * \pre src != NULL + * \pre point != NULL + * \param src the UTF-8 source string + * \param point the unicode character destination + * \return The number of bytes parsed in src or -1 on error and sets errno. */ -uint32_t -uni_decode(const char *src); - -/** - * Get the number of bytes required for the unicode point. - * - * \param point the unicode point - * \return the number of bytes [1-4] or -1 if invalid - */ -int -uni_requires(uint32_t point); +size_t +uni8_decode(const uint8_t src[], uint32_t *point); /** * Get the number of bytes that follow this UTF-8 character. @@ -68,48 +65,99 @@ * character. * * \param c the first multi byte character - * \return the number of bytes [1-4] or -1 if invalid + * \return The number of bytes [1-4] or -1 if invalid and sets errno. 
+ * \warning You may still need to verify that following characters are valid as + * this function only returns the number of bytes that *should* + * exist after this one. */ -int -uni_sizeof(unsigned char c); +size_t +uni8_sizeof(uint8_t c); + +/** + * Get the real number of unicode characters in a string. + * + * \pre src != NULL + * \param src the UTF-8 string + * \return The number of unicode characters or -1 on error and sets errno. + */ +size_t +uni8_length(const uint8_t src[]); /** - * Get real number of character in a string. + * Convert a UTF-8 string to UTF-32 string. * + * This function will write at most dstsz bytes in dst including the NUL + * terminator. The caller is responsible for providing an area large enough to store + * the required number of unicode characters plus the NUL terminator. + * + * Use \ref uni8_length to determine the number of characters required. + * + * \pre src != NULL + * \pre dst != NULL * \param src the UTF-8 string - * \return the number of unicode codepoints or -1 on error and sets errno - * accordingly. + * \param dst the UTF-32 destination + * \param dstsz the size of the destination + * \return The number of bytes written (excluding the null terminator) or -1 on + * error and sets errno. + * \see \ref uni8_length */ size_t -uni_length(const char *src); +uni8_to32(const uint8_t src[], uint32_t dst[], size_t dstsz); + +/** + * Get the number of bytes required for the unicode point. + * + * \param point the unicode point + * \return The number of bytes [1-4] or -1 on error and sets errno. + */ +size_t +uni32_sizeof(uint32_t point); + +/** + * Get the number of characters in src. + * + * \pre src != NULL + * \param src the NUL terminated UTF-32 string + * \return The number of unicode characters. + */ +size_t +uni32_length(const uint32_t src[]); + +/** + * Determine the number of UTF-8 bytes excluding the NUL terminator that + * are needed to convert this UTF-32 string to UTF-8. 
+ * + * \pre src != NULL + * \param src the UTF-32 source string + * \return The number of bytes required excluding the NUL terminator or -1 on + * error and sets errno. + */ +size_t +uni32_requires(const uint32_t src[]); /** * Convert a UTF-32 string to UTF-8 string. * - * \pre src != NULL - * \param src the UTF-32 string - * \return a nul-terminated string or NULL on error and sets errno accordingly - * \note The returned string must be free'ed by the caller - */ -char * -uni_toutf8(const uint32_t *src); - -/** - * Convert a UTF-8 string to UTF-32 string. + * The output buffer will be filled with at most `dstsz` bytes including the + * nul terminator. The function \ref uni32_requires can be used to determine + * the number of bytes required. * * \pre src != NULL - * \param src the UTF-8 string - * \return a nul-terminated string or NULL on error and sets errno accordingly - * \note The returned string must be free'ed by the caller + * \pre dst != NULL + * \param src the UTF-32 string + * \param dst the string destination + * \param dstsz the number of bytes available in dst + * \return the number of bytes written or -1 on error and sets errno + * accordingly. */ -uint32_t * -uni_toutf32(const char *src); +size_t +uni32_to8(const uint32_t src[], uint8_t dst[], size_t dstsz); /** * Check if the unicode character is alpha category. * * \param c the character - * \return true if alpha + * \return True if alpha. */ bool uni_isalpha(uint32_t c); @@ -118,7 +166,7 @@ * Check if the unicode character is digit. * * \param c the character - * \return true if digit + * \return True if digit. */ bool uni_isdigit(uint32_t c); @@ -127,7 +175,7 @@ * Check if the unicode character is lower case. * * \param c the character - * \return true if lower case + * \return True if lower case. */ bool uni_islower(uint32_t c); @@ -136,7 +184,7 @@ * Check if the unicode character is space. * * \param c the character - * \return true if space + * \return True if space. 
*/ bool uni_isspace(uint32_t c); @@ -145,7 +193,7 @@ * Check if the unicode character is title case. * * \param c the character - * \return true if title case + * \return True if title case. */ bool uni_istitle(uint32_t c); @@ -154,7 +202,7 @@ * Check if the unicode character is upper case. * * \param c the character - * \return true if upper case + * \return True if upper case. */ bool uni_isupper(uint32_t c); @@ -163,7 +211,7 @@ * Convert to upper case. * * \param c the character - * \return the upper case character + * \return The upper case character. */ uint32_t uni_toupper(uint32_t c); @@ -172,7 +220,7 @@ * Convert to lower case. * * \param c the character - * \return the lower case character + * \return The lower case character. */ uint32_t uni_tolower(uint32_t c); @@ -181,7 +229,7 @@ * Convert to title case. * * \param c the character - * \return the title case character + * \return The title case character. */ uint32_t uni_totitle(uint32_t c);