Mercurial > irccd
changeset 181:2b4864b2b5f2
Irccd: update unicode module
author | David Demelier <markand@malikania.fr> |
---|---|
date | Tue, 31 May 2016 22:23:19 +0200 |
parents | b0909b31eccb |
children | e0ca65f5ecd0 |
files | lib/irccd/unicode.cpp lib/irccd/unicode.hpp |
diffstat | 2 files changed, 177 insertions(+), 63 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/irccd/unicode.cpp Tue May 31 22:13:35 2016 +0200 +++ b/lib/irccd/unicode.cpp Tue May 31 22:23:19 2016 +0200 @@ -29,9 +29,11 @@ #define nelem(x) (sizeof (x) / sizeof ((x)[0])) -char32_t *rbsearch(char32_t c, char32_t *t, int n, int ne) noexcept +namespace { + +const char32_t *rbsearch(char32_t c, const char32_t *t, int n, int ne) noexcept { - char32_t *p; + const char32_t *p; int m; while (n > 1) { @@ -41,9 +43,8 @@ if (c >= p[0]) { t = p; n = n - m; - } else { + } else n = m; - } } if (n && c >= t[0]) @@ -52,7 +53,11 @@ return nullptr; } -static char32_t isspacer[] = { +} // !namespace + +namespace { + +const char32_t isspacer[] = { 0x0009, 0x000d, 0x0020, 0x0020, 0x0085, 0x0085, @@ -66,18 +71,23 @@ 0xfeff, 0xfeff, }; +} // !namespace + bool isspace(char32_t c) noexcept { - char32_t *p; + const char32_t *p; p = rbsearch(c, isspacer, nelem (isspacer)/2, 2); + if (p && c >= p[0] && c <= p[1]) return true; return false; } -static char32_t isdigitr[] = { +namespace { + +const char32_t isdigitr[] = { 0x0030, 0x0039, 0x0660, 0x0669, 0x06f0, 0x06f9, @@ -130,18 +140,23 @@ 0x1d7ce, 0x1d7ff, }; +} // !namespace + bool isdigit(char32_t c) noexcept { - char32_t *p; + const char32_t *p; p = rbsearch(c, isdigitr, nelem (isdigitr)/2, 2); + if (p && c >= p[0] && c <= p[1]) return true; return false; } -static char32_t isalphar[] = { +namespace { + +const char32_t isalphar[] = { 0x0041, 0x005a, 0x0061, 0x007a, 0x00c0, 0x00d6, @@ -572,7 +587,11 @@ 0x2f800, 0x2fa1d, }; -static char32_t isalphas[] = { +} // !namespace + +namespace { + +const char32_t isalphas[] = { 0x00aa, 0x00b5, 0x00ba, @@ -684,22 +703,28 @@ 0x1ee7e, }; +} // !namespace + bool isalpha(char32_t c) noexcept { - char32_t *p; + const char32_t *p; p = rbsearch(c, isalphar, nelem (isalphar)/2, 2); + if (p && c >= p[0] && c <= p[1]) return true; p = rbsearch(c, isalphas, nelem (isalphas), 1); + if (p && c == p[0]) return true; return false; } -static char32_t isupperr[] = { +namespace { + +const char32_t isupperr[] = { 0x0041, 0x005a, 0x00c0, 0x00d6, 0x00d8, 0x00de, @@ -791,7 +816,11 @@ 0x1d790, 0x1d7a8, }; -static char32_t isuppers[] = { +} // !namespace + +namespace { + +const char32_t isuppers[] = { 0x0100, 0x0102, 0x0104, @@ -1332,22 +1361,28 @@ 0x1d7ca, }; +} // !namespace + bool isupper(char32_t c) noexcept { - char32_t *p; + const char32_t *p; p = rbsearch(c, isupperr, nelem (isupperr)/2, 2); + if (p && c >= p[0] && c <= p[1]) return true; p = rbsearch(c, isuppers, nelem (isuppers), 1); + if (p && c == p[0]) return true; return false; } -static char32_t islowerr[] = { +namespace { + +const char32_t islowerr[] = { 0x0061, 0x007a, 0x00df, 0x00f6, 0x00f8, 0x00ff, @@ -1447,7 +1482,11 @@ 0x1d7c4, 0x1d7c9, }; -static char32_t islowers[] = { +} // !namespace + +namespace { + +const char32_t islowers[] = { 0x00b5, 0x0101, 0x0103, @@ -1980,22 +2019,28 @@ 0x1d7cb, }; +} // !namespace + bool islower(char32_t c) noexcept { - char32_t *p; + const char32_t *p; p = rbsearch(c, islowerr, nelem (islowerr)/2, 2); + if (p && c >= p[0] && c <= p[1]) return true; p = rbsearch(c, islowers, nelem (islowers), 1); + if (p && c == p[0]) return true; return false; } -static char32_t istitler[] = { +namespace { + +const char32_t istitler[] = { 0x0041, 0x005a, 0x00c0, 0x00d6, 0x00d8, 0x00de, @@ -2053,7 +2098,11 @@ 0x118a0, 0x118bf, }; -static char32_t istitles[] = { +} // !namespace + +namespace { + +const char32_t istitles[] = { 0x0100, 0x0102, 0x0104, @@ -2581,22 +2630,28 @@ 0xa7a8, }; +} // !namespace + bool istitle(char32_t c) noexcept { - char32_t *p; + const char32_t *p; p = rbsearch(c, istitler, nelem (istitler)/2, 2); + if (p && c >= p[0] && c <= p[1]) return true; p = rbsearch(c, istitles, nelem (istitles), 1); + if (p && c == p[0]) return true; return false; } -char32_t toupperr[] = { +namespace { + +const char32_t toupperr[] = { 0x0061, 0x007a, 1048544, 0x00e0, 0x00f6, 1048544, 0x00f8, 0x00fe, 1048544, @@ -2638,7 +2693,11 @@ 0x118c0, 0x118df, 1048544, }; -static char32_t touppers[] = { +} // !namespace + +namespace { + +const char32_t touppers[] = { 0x00b5, 1049319, 0x00ff, 1048697, 0x0101, 1048575, @@ -3246,22 +3305,28 @@ 0xa7a9, 1048575, }; +} // !namespace + char32_t toupper(char32_t c) noexcept { - char32_t *p; + const char32_t *p; p = rbsearch(c, toupperr, nelem (toupperr)/3, 3); + if (p && c >= p[0] && c <= p[1]) return c + p[2] - 1048576; p = rbsearch(c, touppers, nelem (touppers)/2, 2); + if (p && c == p[0]) return c + p[1] - 1048576; return c; } -char32_t tolowerr[] = { +namespace { + +const char32_t tolowerr[] = { 0x0041, 0x005a, 1048608, 0x00c0, 0x00d6, 1048608, 0x00d8, 0x00de, 1048608, @@ -3303,7 +3368,11 @@ 0x118a0, 0x118bf, 1048608, }; -static char32_t tolowers[] = { +} // !namespace + +namespace { + +const char32_t tolowers[] = { 0x0100, 1048577, 0x0102, 1048577, 0x0104, 1048577, @@ -3903,22 +3972,28 @@ 0xa7b1, 1006294, }; +} // !namespace + char32_t tolower(char32_t c) noexcept { - char32_t *p; + const char32_t *p; p = rbsearch(c, tolowerr, nelem (tolowerr)/3, 3); + if (p && c >= p[0] && c <= p[1]) return c + p[2] - 1048576; p = rbsearch(c, tolowers, nelem (tolowers)/2, 2); + if (p && c == p[0]) return c + p[1] - 1048576; return c; } -char32_t totitler[] = { +namespace { + +const char32_t totitler[] = { 0x0061, 0x007a, 1048544, 0x00e0, 0x00f6, 1048544, 0x00f8, 0x00fe, 1048544, @@ -3960,7 +4035,11 @@ 0x118c0, 0x118df, 1048544, }; -static char32_t totitles[] = { +} // !namespace + +namespace { + +const char32_t totitles[] = { 0x00b5, 1049319, 0x00ff, 1048697, 0x0101, 1048575, @@ -4568,15 +4647,19 @@ 0xa7a9, 1048575, }; +} // !namespace + char32_t totitle(char32_t c) noexcept { - char32_t *p; + const char32_t *p; p = rbsearch(c, totitler, nelem (totitler)/3, 3); + if (p && c >= p[0] && c <= p[1]) return c + p[2] - 1048576; p = rbsearch(c, totitles, nelem (totitles)/2, 2); + if (p && c == p[0]) return c + p[1] - 1048576; @@ -4587,7 +4670,7 @@ { switch (nbytesPoint(c)) { case 1: - res[0] = c; + res[0] = static_cast<char>(c); res[1] = '\0'; break; case 2: @@ -4642,6 +4725,8 @@ int nbytesUtf8(char c) noexcept { + if (static_cast<unsigned char>(c) <= 127) + return 1; if ((c & 0xE0) == 0xC0) return 2; if ((c & 0xF0) == 0xE0) @@ -4649,7 +4734,7 @@ if ((c & 0xF8) == 0xF0) return 4; - return 1; + return -1; } int nbytesPoint(char32_t c) noexcept @@ -4666,9 +4751,9 @@ return -1; } -int length(const std::string &str) +unsigned length(const std::string &str) { - int total = 0; + unsigned total = 0; forEach(str, [&] (char32_t) { ++ total; @@ -4685,9 +4770,8 @@ char tmp[5]; int size = nbytesPoint(array[i]); - if (size < 0) { + if (size < 0) throw std::invalid_argument("invalid sequence"); - } encode(array[i], tmp); res.insert(res.length(), tmp);
--- a/lib/irccd/unicode.hpp Tue May 31 22:13:35 2016 +0200 +++ b/lib/irccd/unicode.hpp Tue May 31 22:23:19 2016 +0200 @@ -22,19 +22,35 @@ /** * \file unicode.hpp * \brief UTF-8 to UTF-32 conversions + * \author David Demelier <markand@malikania.fr> + * \warning These files are auto-generated! */ #include <stdexcept> #include <string> -#include "sysconfig.hpp" - namespace irccd { +/** + * \brief Unicode namespace. + */ namespace unicode { -IRCCD_EXPORT void encode(char32_t point, char res[5]) noexcept; -IRCCD_EXPORT void decode(char32_t &c, const char *res) noexcept; +/** + * Encode the unicode code point into multibyte string. + * + * \param point the unicode code point + * \param res the output buffer + */ +void encode(char32_t point, char res[5]) noexcept; + +/** + * Decode the multibyte buffer into an unicode code point. + * + * \param c the code point destination + * \param res the multibyte string. + */ +void decode(char32_t &c, const char *res) noexcept; /** * Get the number of bytes for the first multi byte character from a @@ -44,17 +60,17 @@ * real character. * * \param c the first multi byte character - * \return the number of bytes [1-4] + * \return the number of bytes [1-4] or -1 if invalid */ -IRCCD_EXPORT int nbytesUtf8(char c) noexcept; +int nbytesUtf8(char c) noexcept; /** * Get the number of bytes for the unicode point. * * \param point the unicode point - * \return the number of bytes [1-4] or -1 on invalid + * \return the number of bytes [1-4] or -1 if invalid */ -IRCCD_EXPORT int nbytesPoint(char32_t point) noexcept; +int nbytesPoint(char32_t point) noexcept; /** * Get real number of character in a string. @@ -63,7 +79,7 @@ * \return the length * \throw std::invalid_argument on invalid sequence */ -IRCCD_EXPORT int length(const std::string &str); +unsigned length(const std::string &str); /** * Iterate over all real characters in the UTF-8 string. @@ -72,6 +88,7 @@ * void f(char ch) * * \param str the UTF-8 string + * \param function the function callback * \throw std::invalid_argument on invalid sequence */ template <typename Func> @@ -81,9 +98,8 @@ char32_t point = 0; int size = nbytesUtf8(str[i]); - if (size < 0) { + if (size < 0) throw std::invalid_argument("invalid sequence"); - } decode(point, str.data() + i); function(point); @@ -99,7 +115,7 @@ * \return the UTF-8 string * \throw std::invalid_argument on invalid sequence */ -IRCCD_EXPORT std::string toUtf8(const std::u32string &array); +std::string toUtf8(const std::u32string &array); /** * Convert a UTF-8 string to UTF-32 string. @@ -108,7 +124,7 @@ * \return the UTF-32 string * \throw std::invalid_argument on invalid sequence */ -IRCCD_EXPORT std::u32string toUtf32(const std::string &str); +std::u32string toUtf32(const std::string &str); /** * Check if the unicode character is space. @@ -116,7 +132,7 @@ * \param c the character * \return true if space */ -IRCCD_EXPORT bool isspace(char32_t c) noexcept; +bool isspace(char32_t c) noexcept; /** * Check if the unicode character is digit. @@ -124,7 +140,7 @@ * \param c the character * \return true if digit */ -IRCCD_EXPORT bool isdigit(char32_t c) noexcept; +bool isdigit(char32_t c) noexcept; /** * Check if the unicode character is alpha category. @@ -132,7 +148,7 @@ * \param c the character * \return true if alpha */ -IRCCD_EXPORT bool isalpha(char32_t c) noexcept; +bool isalpha(char32_t c) noexcept; /** * Check if the unicode character is upper case. @@ -140,7 +156,7 @@ * \param c the character * \return true if upper case */ -IRCCD_EXPORT bool isupper(char32_t c) noexcept; +bool isupper(char32_t c) noexcept; /** * Check if the unicode character is lower case. @@ -148,7 +164,7 @@ * \param c the character * \return true if lower case */ -IRCCD_EXPORT bool islower(char32_t c) noexcept; +bool islower(char32_t c) noexcept; /** * Check if the unicode character is title case. @@ -156,7 +172,7 @@ * \param c the character * \return true if title case */ -IRCCD_EXPORT bool istitle(char32_t c) noexcept; +bool istitle(char32_t c) noexcept; /** * Convert to upper case. @@ -164,7 +180,7 @@ * \param c the character * \return the upper case character */ -IRCCD_EXPORT char32_t toupper(char32_t c) noexcept; +char32_t toupper(char32_t c) noexcept; /** * Convert to lower case. @@ -172,7 +188,7 @@ * \param c the character * \return the lower case character */ -IRCCD_EXPORT char32_t tolower(char32_t c) noexcept; +char32_t tolower(char32_t c) noexcept; /** * Convert to title case. @@ -180,7 +196,7 @@ * \param c the character * \return the title case character */ -IRCCD_EXPORT char32_t totitle(char32_t c) noexcept; +char32_t totitle(char32_t c) noexcept; /** * Convert the UTF-32 string to upper case. @@ -190,9 +206,8 @@ */ inline std::u32string toupper(std::u32string str) { - for (size_t i = 0; i < str.size(); ++i) { + for (size_t i = 0; i < str.size(); ++i) str[i] = toupper(str[i]); - } return str; } @@ -206,7 +221,15 @@ */ inline std::string toupper(const std::string &str) { - return toUtf8(toupper(toUtf32(str))); + std::string result; + char buffer[5]; + + forEach(str, [&] (char32_t code) { + encode(toupper(code), buffer); + result += buffer; + }); + + return result; } /** @@ -217,9 +240,8 @@ */ inline std::u32string tolower(std::u32string str) { - for (size_t i = 0; i < str.size(); ++i) { + for (size_t i = 0; i < str.size(); ++i) str[i] = tolower(str[i]); - } return str; } @@ -233,7 +255,15 @@ */ inline std::string tolower(const std::string &str) { - return toUtf8(tolower(toUtf32(str))); + std::string result; + char buffer[5]; + + forEach(str, [&] (char32_t code) { + encode(tolower(code), buffer); + result += buffer; + }); + + return result; } } // !unicode