changeset 181:2b4864b2b5f2

Irccd: update unicode module
author David Demelier <markand@malikania.fr>
date Tue, 31 May 2016 22:23:19 +0200
parents b0909b31eccb
children e0ca65f5ecd0
files lib/irccd/unicode.cpp lib/irccd/unicode.hpp
diffstat 2 files changed, 177 insertions(+), 63 deletions(-) [+]
line wrap: on
line diff
--- a/lib/irccd/unicode.cpp	Tue May 31 22:13:35 2016 +0200
+++ b/lib/irccd/unicode.cpp	Tue May 31 22:23:19 2016 +0200
@@ -29,9 +29,11 @@
 
 #define nelem(x) (sizeof (x) / sizeof ((x)[0]))
 
-char32_t *rbsearch(char32_t c, char32_t *t, int n, int ne) noexcept
+namespace {
+
+const char32_t *rbsearch(char32_t c, const char32_t *t, int n, int ne) noexcept
 {
-	char32_t *p;
+	const char32_t *p;
 	int m;
 
 	while (n > 1) {
@@ -41,9 +43,8 @@
 		if (c >= p[0]) {
 			t = p;
 			n = n - m;
-		} else {
+		} else
 			n = m;
-		}
 	}
 
 	if (n && c >= t[0])
@@ -52,7 +53,11 @@
 	return nullptr;
 }
 
-static char32_t isspacer[] = {
+} // !namespace
+
+namespace {
+
+const char32_t isspacer[] = {
 	0x0009, 0x000d,
 	0x0020, 0x0020,
 	0x0085, 0x0085,
@@ -66,18 +71,23 @@
 	0xfeff, 0xfeff,
 };
 
+} // !namespace
+
 bool isspace(char32_t c) noexcept
 {
-	char32_t *p;
+	const char32_t *p;
 
 	p = rbsearch(c, isspacer, nelem (isspacer)/2, 2);
+
 	if (p && c >= p[0] && c <= p[1])
 		return true;
 
 	return false;
 }
 
-static char32_t isdigitr[] = {
+namespace {
+
+const char32_t isdigitr[] = {
 	0x0030, 0x0039,
 	0x0660, 0x0669,
 	0x06f0, 0x06f9,
@@ -130,18 +140,23 @@
 	0x1d7ce, 0x1d7ff,
 };
 
+} // !namespace
+
 bool isdigit(char32_t c) noexcept
 {
-	char32_t *p;
+	const char32_t *p;
 
 	p = rbsearch(c, isdigitr, nelem (isdigitr)/2, 2);
+
 	if (p && c >= p[0] && c <= p[1])
 		return true;
 
 	return false;
 }
 
-static char32_t isalphar[] = {
+namespace {
+
+const char32_t isalphar[] = {
 	0x0041, 0x005a,
 	0x0061, 0x007a,
 	0x00c0, 0x00d6,
@@ -572,7 +587,11 @@
 	0x2f800, 0x2fa1d,
 };
 
-static char32_t isalphas[] = {
+} // !namespace
+
+namespace {
+
+const char32_t isalphas[] = {
 	0x00aa,
 	0x00b5,
 	0x00ba,
@@ -684,22 +703,28 @@
 	0x1ee7e,
 };
 
+} // !namespace
+
 bool isalpha(char32_t c) noexcept
 {
-	char32_t *p;
+	const char32_t *p;
 
 	p = rbsearch(c, isalphar, nelem (isalphar)/2, 2);
+
 	if (p && c >= p[0] && c <= p[1])
 		return true;
 
 	p = rbsearch(c, isalphas, nelem (isalphas), 1);
+
 	if (p && c == p[0])
 		return true;
 
 	return false;
 }
 
-static char32_t isupperr[] = {
+namespace {
+
+const char32_t isupperr[] = {
 	0x0041, 0x005a,
 	0x00c0, 0x00d6,
 	0x00d8, 0x00de,
@@ -791,7 +816,11 @@
 	0x1d790, 0x1d7a8,
 };
 
-static char32_t isuppers[] = {
+} // !namespace
+
+namespace {
+
+const char32_t isuppers[] = {
 	0x0100,
 	0x0102,
 	0x0104,
@@ -1332,22 +1361,28 @@
 	0x1d7ca,
 };
 
+} // !namespace
+
 bool isupper(char32_t c) noexcept
 {
-	char32_t *p;
+	const char32_t *p;
 
 	p = rbsearch(c, isupperr, nelem (isupperr)/2, 2);
+
 	if (p && c >= p[0] && c <= p[1])
 		return true;
 
 	p = rbsearch(c, isuppers, nelem (isuppers), 1);
+
 	if (p && c == p[0])
 		return true;
 
 	return false;
 }
 
-static char32_t islowerr[] = {
+namespace {
+
+const char32_t islowerr[] = {
 	0x0061, 0x007a,
 	0x00df, 0x00f6,
 	0x00f8, 0x00ff,
@@ -1447,7 +1482,11 @@
 	0x1d7c4, 0x1d7c9,
 };
 
-static char32_t islowers[] = {
+} // !namespace
+
+namespace {
+
+const char32_t islowers[] = {
 	0x00b5,
 	0x0101,
 	0x0103,
@@ -1980,22 +2019,28 @@
 	0x1d7cb,
 };
 
+} // !namespace
+
 bool islower(char32_t c) noexcept
 {
-	char32_t *p;
+	const char32_t *p;
 
 	p = rbsearch(c, islowerr, nelem (islowerr)/2, 2);
+
 	if (p && c >= p[0] && c <= p[1])
 		return true;
 
 	p = rbsearch(c, islowers, nelem (islowers), 1);
+
 	if (p && c == p[0])
 		return true;
 
 	return false;
 }
 
-static char32_t istitler[] = {
+namespace {
+
+const char32_t istitler[] = {
 	0x0041, 0x005a,
 	0x00c0, 0x00d6,
 	0x00d8, 0x00de,
@@ -2053,7 +2098,11 @@
 	0x118a0, 0x118bf,
 };
 
-static char32_t istitles[] = {
+} // !namespace
+
+namespace {
+
+const char32_t istitles[] = {
 	0x0100,
 	0x0102,
 	0x0104,
@@ -2581,22 +2630,28 @@
 	0xa7a8,
 };
 
+} // !namespace
+
 bool istitle(char32_t c) noexcept
 {
-	char32_t *p;
+	const char32_t *p;
 
 	p = rbsearch(c, istitler, nelem (istitler)/2, 2);
+
 	if (p && c >= p[0] && c <= p[1])
 		return true;
 
 	p = rbsearch(c, istitles, nelem (istitles), 1);
+
 	if (p && c == p[0])
 		return true;
 
 	return false;
 }
 
-char32_t toupperr[] = {
+namespace {
+
+const char32_t toupperr[] = {
 	0x0061, 0x007a, 1048544,
 	0x00e0, 0x00f6, 1048544,
 	0x00f8, 0x00fe, 1048544,
@@ -2638,7 +2693,11 @@
 	0x118c0, 0x118df, 1048544,
 };
 
-static char32_t touppers[] = {
+} // !namespace
+
+namespace {
+
+const char32_t touppers[] = {
 	0x00b5, 1049319,
 	0x00ff, 1048697,
 	0x0101, 1048575,
@@ -3246,22 +3305,28 @@
 	0xa7a9, 1048575,
 };
 
+} // !namespace
+
 char32_t toupper(char32_t c) noexcept
 {
-	char32_t *p;
+	const char32_t *p;
 
 	p = rbsearch(c, toupperr, nelem (toupperr)/3, 3);
+
 	if (p && c >= p[0] && c <= p[1])
 		return c + p[2] - 1048576;
 
 	p = rbsearch(c, touppers, nelem (touppers)/2, 2);
+
 	if (p && c == p[0])
 		return c + p[1] - 1048576;
 
 	return c;
 }
 
-char32_t tolowerr[] = {
+namespace {
+
+const char32_t tolowerr[] = {
 	0x0041, 0x005a, 1048608,
 	0x00c0, 0x00d6, 1048608,
 	0x00d8, 0x00de, 1048608,
@@ -3303,7 +3368,11 @@
 	0x118a0, 0x118bf, 1048608,
 };
 
-static char32_t tolowers[] = {
+} // !namespace
+
+namespace {
+
+const char32_t tolowers[] = {
 	0x0100, 1048577,
 	0x0102, 1048577,
 	0x0104, 1048577,
@@ -3903,22 +3972,28 @@
 	0xa7b1, 1006294,
 };
 
+} // !namespace
+
 char32_t tolower(char32_t c) noexcept
 {
-	char32_t *p;
+	const char32_t *p;
 
 	p = rbsearch(c, tolowerr, nelem (tolowerr)/3, 3);
+
 	if (p && c >= p[0] && c <= p[1])
 		return c + p[2] - 1048576;
 
 	p = rbsearch(c, tolowers, nelem (tolowers)/2, 2);
+
 	if (p && c == p[0])
 		return c + p[1] - 1048576;
 
 	return c;
 }
 
-char32_t totitler[] = {
+namespace {
+
+const char32_t totitler[] = {
 	0x0061, 0x007a, 1048544,
 	0x00e0, 0x00f6, 1048544,
 	0x00f8, 0x00fe, 1048544,
@@ -3960,7 +4035,11 @@
 	0x118c0, 0x118df, 1048544,
 };
 
-static char32_t totitles[] = {
+} // !namespace
+
+namespace {
+
+const char32_t totitles[] = {
 	0x00b5, 1049319,
 	0x00ff, 1048697,
 	0x0101, 1048575,
@@ -4568,15 +4647,19 @@
 	0xa7a9, 1048575,
 };
 
+} // !namespace
+
 char32_t totitle(char32_t c) noexcept
 {
-	char32_t *p;
+	const char32_t *p;
 
 	p = rbsearch(c, totitler, nelem (totitler)/3, 3);
+
 	if (p && c >= p[0] && c <= p[1])
 		return c + p[2] - 1048576;
 
 	p = rbsearch(c, totitles, nelem (totitles)/2, 2);
+
 	if (p && c == p[0])
 		return c + p[1] - 1048576;
 
@@ -4587,7 +4670,7 @@
 {
 	switch (nbytesPoint(c)) {
 	case 1:
-		res[0] = c;
+		res[0] = static_cast<char>(c);
 		res[1] = '\0';
 		break;
 	case 2:
@@ -4642,6 +4725,8 @@
 
 int nbytesUtf8(char c) noexcept
 {
+	if (static_cast<unsigned char>(c) <= 127)
+		return 1;
 	if ((c & 0xE0) == 0xC0)
 		return 2;
 	if ((c & 0xF0) == 0xE0)
@@ -4649,7 +4734,7 @@
 	if ((c & 0xF8) == 0xF0)
 		return 4;
 
-	return 1;
+	return -1;
 }
 
 int nbytesPoint(char32_t c) noexcept
@@ -4666,9 +4751,9 @@
 	return -1;
 }
 
-int length(const std::string &str)
+unsigned length(const std::string &str)
 {
-	int total = 0;
+	unsigned total = 0;
 
 	forEach(str, [&] (char32_t) {
 		++ total;
@@ -4685,9 +4770,8 @@
 		char tmp[5];
 		int size = nbytesPoint(array[i]);
 
-		if (size < 0) {
+		if (size < 0)
 			throw std::invalid_argument("invalid sequence");
-		}
 
 		encode(array[i], tmp);
 		res.insert(res.length(), tmp);
--- a/lib/irccd/unicode.hpp	Tue May 31 22:13:35 2016 +0200
+++ b/lib/irccd/unicode.hpp	Tue May 31 22:23:19 2016 +0200
@@ -22,19 +22,35 @@
 /**
  * \file unicode.hpp
  * \brief UTF-8 to UTF-32 conversions
+ * \author David Demelier <markand@malikania.fr>
+ * \warning These files are auto-generated!
  */
 
 #include <stdexcept>
 #include <string>
 
-#include "sysconfig.hpp"
-
 namespace irccd {
 
+/**
+ * \brief Unicode namespace.
+ */
 namespace unicode {
 
-IRCCD_EXPORT void encode(char32_t point, char res[5]) noexcept;
-IRCCD_EXPORT void decode(char32_t &c, const char *res) noexcept;
+/**
+ * Encode the Unicode code point into a multibyte string.
+ *
+ * \param point the unicode code point
+ * \param res the output buffer
+ */
+void encode(char32_t point, char res[5]) noexcept;
+
+/**
+ * Decode the multibyte buffer into a Unicode code point.
+ *
+ * \param c the code point destination
+ * \param res the multibyte string.
+ */
+void decode(char32_t &c, const char *res) noexcept;
 
 /**
  * Get the number of bytes for the first multi byte character from a
@@ -44,17 +60,17 @@
  * real character.
  *
  * \param c the first multi byte character
- * \return the number of bytes [1-4]
+ * \return the number of bytes [1-4] or -1 if invalid
  */
-IRCCD_EXPORT int nbytesUtf8(char c) noexcept;
+int nbytesUtf8(char c) noexcept;
 
 /**
  * Get the number of bytes for the unicode point.
  *
  * \param point the unicode point
- * \return the number of bytes [1-4] or -1 on invalid
+ * \return the number of bytes [1-4] or -1 if invalid
  */
-IRCCD_EXPORT int nbytesPoint(char32_t point) noexcept;
+int nbytesPoint(char32_t point) noexcept;
 
 /**
  * Get real number of character in a string.
@@ -63,7 +79,7 @@
  * \return the length
  * \throw std::invalid_argument on invalid sequence
  */
-IRCCD_EXPORT int length(const std::string &str);
+unsigned length(const std::string &str);
 
 /**
  * Iterate over all real characters in the UTF-8 string.
@@ -72,6 +88,7 @@
  *	void f(char ch)
  *
  * \param str the UTF-8 string
+ * \param function the function callback
  * \throw std::invalid_argument on invalid sequence
  */
 template <typename Func>
@@ -81,9 +98,8 @@
 		char32_t point = 0;
 		int size = nbytesUtf8(str[i]);
 
-		if (size < 0) {
+		if (size < 0)
 			throw std::invalid_argument("invalid sequence");
-		}
 
 		decode(point, str.data() + i);
 		function(point);
@@ -99,7 +115,7 @@
  * \return the UTF-8 string
  * \throw std::invalid_argument on invalid sequence
  */
-IRCCD_EXPORT std::string toUtf8(const std::u32string &array);
+std::string toUtf8(const std::u32string &array);
 
 /**
  * Convert a UTF-8 string to UTF-32 string.
@@ -108,7 +124,7 @@
  * \return the UTF-32 string
  * \throw std::invalid_argument on invalid sequence
  */
-IRCCD_EXPORT std::u32string toUtf32(const std::string &str);
+std::u32string toUtf32(const std::string &str);
 
 /**
  * Check if the unicode character is space.
@@ -116,7 +132,7 @@
  * \param c the character
  * \return true if space
  */
-IRCCD_EXPORT bool isspace(char32_t c) noexcept;
+bool isspace(char32_t c) noexcept;
 
 /**
  * Check if the unicode character is digit.
@@ -124,7 +140,7 @@
  * \param c the character
  * \return true if digit
  */
-IRCCD_EXPORT bool isdigit(char32_t c) noexcept;
+bool isdigit(char32_t c) noexcept;
 
 /**
  * Check if the unicode character is alpha category.
@@ -132,7 +148,7 @@
  * \param c the character
  * \return true if alpha
  */
-IRCCD_EXPORT bool isalpha(char32_t c) noexcept;
+bool isalpha(char32_t c) noexcept;
 
 /**
  * Check if the unicode character is upper case.
@@ -140,7 +156,7 @@
  * \param c the character
  * \return true if upper case
  */
-IRCCD_EXPORT bool isupper(char32_t c) noexcept;
+bool isupper(char32_t c) noexcept;
 
 /**
  * Check if the unicode character is lower case.
@@ -148,7 +164,7 @@
  * \param c the character
  * \return true if lower case
  */
-IRCCD_EXPORT bool islower(char32_t c) noexcept;
+bool islower(char32_t c) noexcept;
 
 /**
  * Check if the unicode character is title case.
@@ -156,7 +172,7 @@
  * \param c the character
  * \return true if title case
  */
-IRCCD_EXPORT bool istitle(char32_t c) noexcept;
+bool istitle(char32_t c) noexcept;
 
 /**
  * Convert to upper case.
@@ -164,7 +180,7 @@
  * \param c the character
  * \return the upper case character
  */
-IRCCD_EXPORT char32_t toupper(char32_t c) noexcept;
+char32_t toupper(char32_t c) noexcept;
 
 /**
  * Convert to lower case.
@@ -172,7 +188,7 @@
  * \param c the character
  * \return the lower case character
  */
-IRCCD_EXPORT char32_t tolower(char32_t c) noexcept;
+char32_t tolower(char32_t c) noexcept;
 
 /**
  * Convert to title case.
@@ -180,7 +196,7 @@
  * \param c the character
  * \return the title case character
  */
-IRCCD_EXPORT char32_t totitle(char32_t c) noexcept;
+char32_t totitle(char32_t c) noexcept;
 
 /**
  * Convert the UTF-32 string to upper case.
@@ -190,9 +206,8 @@
  */
 inline std::u32string toupper(std::u32string str)
 {
-	for (size_t i = 0; i < str.size(); ++i) {
+	for (size_t i = 0; i < str.size(); ++i)
 		str[i] = toupper(str[i]);
-	}
 
 	return str;
 }
@@ -206,7 +221,15 @@
  */
 inline std::string toupper(const std::string &str)
 {
-	return toUtf8(toupper(toUtf32(str)));
+	std::string result;
+	char buffer[5];
+
+	forEach(str, [&] (char32_t code) {
+		encode(toupper(code), buffer);
+		result += buffer;
+	});
+
+	return result;
 }
 
 /**
@@ -217,9 +240,8 @@
  */
 inline std::u32string tolower(std::u32string str)
 {
-	for (size_t i = 0; i < str.size(); ++i) {
+	for (size_t i = 0; i < str.size(); ++i)
 		str[i] = tolower(str[i]);
-	}
 
 	return str;
 }
@@ -233,7 +255,15 @@
  */
 inline std::string tolower(const std::string &str)
 {
-	return toUtf8(tolower(toUtf32(str)));
+	std::string result;
+	char buffer[5];
+
+	forEach(str, [&] (char32_t code) {
+		encode(tolower(code), buffer);
+		result += buffer;
+	});
+
+	return result;
 }
 
 } // !unicode