Mercurial > code
changeset 408:f083259de5e6
Unicode: fix forEach and nbytesUtf8 on invalid input, improve tolower/toupper for UTF-8 while here
author | David Demelier <markand@malikania.fr> |
---|---|
date | Tue, 06 Oct 2015 14:48:36 +0200 |
parents | 25ef13e25338 |
children | 0d004aba3ff6 |
files | C++/modules/Unicode/Unicode.cpp C++/modules/Unicode/Unicode.h C++/tests/Unicode/main.cpp tools/mkunicode/Unicode-after.cpp tools/mkunicode/Unicode.h |
diffstat | 5 files changed, 59 insertions(+), 11 deletions(-) [+] |
line wrap: on
line diff
--- a/C++/modules/Unicode/Unicode.cpp Tue Oct 06 09:05:53 2015 +0200 +++ b/C++/modules/Unicode/Unicode.cpp Tue Oct 06 14:48:36 2015 +0200 @@ -4640,6 +4640,8 @@ int nbytesUtf8(char c) noexcept { + if (static_cast<unsigned char>(c) <= 127) + return 1; if ((c & 0xE0) == 0xC0) return 2; if ((c & 0xF0) == 0xE0) @@ -4647,7 +4649,7 @@ if ((c & 0xF8) == 0xF0) return 4; - return 1; + return -1; } int nbytesPoint(char32_t c) noexcept
--- a/C++/modules/Unicode/Unicode.h Tue Oct 06 09:05:53 2015 +0200 +++ b/C++/modules/Unicode/Unicode.h Tue Oct 06 14:48:36 2015 +0200 @@ -40,7 +40,7 @@ * real character. * * @param c the first multi byte character - * @return the number of bytes [1-4] + * @return the number of bytes [1-4] or -1 if invalid */ int nbytesUtf8(char c) noexcept; @@ -48,7 +48,7 @@ * Get the number of bytes for the unicode point. * * @param point the unicode point - * @return the number of bytes [1-4] or -1 on invalid + * @return the number of bytes [1-4] or -1 if invalid */ int nbytesPoint(char32_t point) noexcept; @@ -202,7 +202,15 @@ */ inline std::string toupper(const std::string &str) { - return toUtf8(toupper(toUtf32(str))); + std::string result; + char buffer[5]; + + forEach(str, [&] (char32_t code) { + encode(toupper(code), buffer); + result += buffer; + }); + + return result; } /** @@ -229,7 +237,15 @@ */ inline std::string tolower(const std::string &str) { - return toUtf8(tolower(toUtf32(str))); + std::string result; + char buffer[5]; + + forEach(str, [&] (char32_t code) { + encode(tolower(code), buffer); + result += buffer; + }); + + return result; } } // !unicode
--- a/C++/tests/Unicode/main.cpp Tue Oct 06 09:05:53 2015 +0200 +++ b/C++/tests/Unicode/main.cpp Tue Oct 06 14:48:36 2015 +0200 @@ -255,7 +255,7 @@ try { std::string s{"A" "\xFF""B"}; std::string r = unicode::tolower(s); - +printf("%s\n", r.c_str()); FAIL() << "expected a failure"; } catch (const std::exception &ex) { SUCCEED(); @@ -341,6 +341,18 @@ ASSERT_EQ(4, current); } +TEST(Misc, forEachInvalid) +{ + std::string s{"a" "\xFF" "b"}; + + try { + unicode::forEach(s, [&] (char32_t) { }); + + FAIL() << "exception expected"; + } catch (...) { + } +} + int main(int argc, char **argv) { InitGoogleTest(&argc, argv);
--- a/tools/mkunicode/Unicode-after.cpp Tue Oct 06 09:05:53 2015 +0200 +++ b/tools/mkunicode/Unicode-after.cpp Tue Oct 06 14:48:36 2015 +0200 @@ -57,6 +57,8 @@ int nbytesUtf8(char c) noexcept { + if (static_cast<unsigned char>(c) <= 127) + return 1; if ((c & 0xE0) == 0xC0) return 2; if ((c & 0xF0) == 0xE0) @@ -64,7 +66,7 @@ if ((c & 0xF8) == 0xF0) return 4; - return 1; + return -1; } int nbytesPoint(char32_t c) noexcept
--- a/tools/mkunicode/Unicode.h Tue Oct 06 09:05:53 2015 +0200 +++ b/tools/mkunicode/Unicode.h Tue Oct 06 14:48:36 2015 +0200 @@ -40,7 +40,7 @@ * real character. * * @param c the first multi byte character - * @return the number of bytes [1-4] + * @return the number of bytes [1-4] or -1 if invalid */ int nbytesUtf8(char c) noexcept; @@ -48,7 +48,7 @@ * Get the number of bytes for the unicode point. * * @param point the unicode point - * @return the number of bytes [1-4] or -1 on invalid + * @return the number of bytes [1-4] or -1 if invalid */ int nbytesPoint(char32_t point) noexcept; @@ -202,7 +202,15 @@ */ inline std::string toupper(const std::string &str) { - return toUtf8(toupper(toUtf32(str))); + std::string result; + char buffer[5]; + + forEach(str, [&] (char32_t code) { + encode(toupper(code), buffer); + result += buffer; + }); + + return result; } /** @@ -229,7 +237,15 @@ */ inline std::string tolower(const std::string &str) { - return toUtf8(tolower(toUtf32(str))); + std::string result; + char buffer[5]; + + forEach(str, [&] (char32_t code) { + encode(tolower(code), buffer); + result += buffer; + }); + + return result; } } // !unicode