changeset 408:f083259de5e6

Unicode: fix forEach and nbytesUtf8 on invalid input, improve tolower/toupper for UTF-8 while here
author David Demelier <markand@malikania.fr>
date Tue, 06 Oct 2015 14:48:36 +0200
parents 25ef13e25338
children 0d004aba3ff6
files C++/modules/Unicode/Unicode.cpp C++/modules/Unicode/Unicode.h C++/tests/Unicode/main.cpp tools/mkunicode/Unicode-after.cpp tools/mkunicode/Unicode.h
diffstat 5 files changed, 59 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
--- a/C++/modules/Unicode/Unicode.cpp	Tue Oct 06 09:05:53 2015 +0200
+++ b/C++/modules/Unicode/Unicode.cpp	Tue Oct 06 14:48:36 2015 +0200
@@ -4640,6 +4640,8 @@
 
 int nbytesUtf8(char c) noexcept
 {
+	if (static_cast<unsigned char>(c) <= 127)
+		return 1;
 	if ((c & 0xE0) == 0xC0)
 		return 2;
 	if ((c & 0xF0) == 0xE0)
@@ -4647,7 +4649,7 @@
 	if ((c & 0xF8) == 0xF0)
 		return 4;
 
-	return 1;
+	return -1;
 }
 
 int nbytesPoint(char32_t c) noexcept
--- a/C++/modules/Unicode/Unicode.h	Tue Oct 06 09:05:53 2015 +0200
+++ b/C++/modules/Unicode/Unicode.h	Tue Oct 06 14:48:36 2015 +0200
@@ -40,7 +40,7 @@
  * real character.
  *
  * @param c the first multi byte character
- * @return the number of bytes [1-4]
+ * @return the number of bytes [1-4] or -1 if invalid
  */
 int nbytesUtf8(char c) noexcept;
 
@@ -48,7 +48,7 @@
  * Get the number of bytes for the unicode point.
  *
  * @param point the unicode point
- * @return the number of bytes [1-4] or -1 on invalid
+ * @return the number of bytes [1-4] or -1 if invalid
  */
 int nbytesPoint(char32_t point) noexcept;
 
@@ -202,7 +202,15 @@
  */
 inline std::string toupper(const std::string &str)
 {
-	return toUtf8(toupper(toUtf32(str)));
+	std::string result;
+	char buffer[5];
+
+	forEach(str, [&] (char32_t code) {
+		encode(toupper(code), buffer);
+		result += buffer;
+	});
+
+	return result;
 }
 
 /**
@@ -229,7 +237,15 @@
  */
 inline std::string tolower(const std::string &str)
 {
-	return toUtf8(tolower(toUtf32(str)));
+	std::string result;
+	char buffer[5];
+
+	forEach(str, [&] (char32_t code) {
+		encode(tolower(code), buffer);
+		result += buffer;
+	});
+
+	return result;
 }
 
 } // !unicode
--- a/C++/tests/Unicode/main.cpp	Tue Oct 06 09:05:53 2015 +0200
+++ b/C++/tests/Unicode/main.cpp	Tue Oct 06 14:48:36 2015 +0200
@@ -255,7 +255,7 @@
 	try {
 		std::string s{"A" "\xFF""B"};
 		std::string r = unicode::tolower(s);
-
+printf("%s\n", r.c_str());
 		FAIL() << "expected a failure";
 	} catch (const std::exception &ex) {
 		SUCCEED();
@@ -341,6 +341,18 @@
 	ASSERT_EQ(4, current);
 }
 
+TEST(Misc, forEachInvalid)
+{
+	std::string s{"a" "\xFF" "b"};
+
+	try {
+		unicode::forEach(s, [&] (char32_t) { });
+
+		FAIL() << "exception expected";
+	} catch (...) {
+	}
+}
+
 int main(int argc, char **argv)
 {
 	InitGoogleTest(&argc, argv);
--- a/tools/mkunicode/Unicode-after.cpp	Tue Oct 06 09:05:53 2015 +0200
+++ b/tools/mkunicode/Unicode-after.cpp	Tue Oct 06 14:48:36 2015 +0200
@@ -57,6 +57,8 @@
 
 int nbytesUtf8(char c) noexcept
 {
+	if (static_cast<unsigned char>(c) <= 127)
+		return 1;
 	if ((c & 0xE0) == 0xC0)
 		return 2;
 	if ((c & 0xF0) == 0xE0)
@@ -64,7 +66,7 @@
 	if ((c & 0xF8) == 0xF0)
 		return 4;
 
-	return 1;
+	return -1;
 }
 
 int nbytesPoint(char32_t c) noexcept
--- a/tools/mkunicode/Unicode.h	Tue Oct 06 09:05:53 2015 +0200
+++ b/tools/mkunicode/Unicode.h	Tue Oct 06 14:48:36 2015 +0200
@@ -40,7 +40,7 @@
  * real character.
  *
  * @param c the first multi byte character
- * @return the number of bytes [1-4]
+ * @return the number of bytes [1-4] or -1 if invalid
  */
 int nbytesUtf8(char c) noexcept;
 
@@ -48,7 +48,7 @@
  * Get the number of bytes for the unicode point.
  *
  * @param point the unicode point
- * @return the number of bytes [1-4] or -1 on invalid
+ * @return the number of bytes [1-4] or -1 if invalid
  */
 int nbytesPoint(char32_t point) noexcept;
 
@@ -202,7 +202,15 @@
  */
 inline std::string toupper(const std::string &str)
 {
-	return toUtf8(toupper(toUtf32(str)));
+	std::string result;
+	char buffer[5];
+
+	forEach(str, [&] (char32_t code) {
+		encode(toupper(code), buffer);
+		result += buffer;
+	});
+
+	return result;
 }
 
 /**
@@ -229,7 +237,15 @@
  */
 inline std::string tolower(const std::string &str)
 {
-	return toUtf8(tolower(toUtf32(str)));
+	std::string result;
+	char buffer[5];
+
+	forEach(str, [&] (char32_t code) {
+		encode(tolower(code), buffer);
+		result += buffer;
+	});
+
+	return result;
 }
 
 } // !unicode