Mercurial > libunicode
diff gen/unicode-after.cpp @ 10:ae1003c2a284
misc: extreme simplification
author | David Demelier <markand@malikania.fr> |
---|---|
date | Wed, 25 Mar 2020 09:56:05 +0100 |
parents | generator/make-unicode/unicode-after.cpp@d9309daa0d7b |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gen/unicode-after.cpp Wed Mar 25 09:56:05 2020 +0100 @@ -0,0 +1,172 @@ +void encode(char32_t c, char res[5]) noexcept +{ + switch (nbytes_point(c)) { + case 1: + res[0] = static_cast<char>(c); + res[1] = '\0'; + break; + case 2: + res[0] = 0xC0 | ((c >> 6) & 0x1F); + res[1] = 0x80 | (c & 0x3F); + res[2] = '\0'; + break; + case 3: + res[0] = 0xE0 | ((c >> 12) & 0xF ); + res[1] = 0x80 | ((c >> 6) & 0x3F); + res[2] = 0x80 | (c & 0x3F); + res[3] = '\0'; + break; + case 4: + res[0] = 0xF0 | ((c >> 18) & 0x7 ); + res[1] = 0x80 | ((c >> 12) & 0x3F); + res[2] = 0x80 | ((c >> 6) & 0x3F); + res[3] = 0x80 | (c & 0x3F); + res[4] = '\0'; + break; + default: + break; + } +} + +void decode(char32_t& c, const char* res) noexcept +{ + c = 0; + + switch (nbytes_utf8(res[0])) { + case 1: + c = res[0]; + break; + case 2: + c = (res[0] & 0x1f) << 6; + c |= (res[1] & 0x3f); + break; + case 3: + c = (res[0] & 0x0f) << 12; + c |= (res[1] & 0x3f) << 6; + c |= (res[2] & 0x3f); + break; + case 4: + c = (res[0] & 0x07) << 16; + c |= (res[1] & 0x3f) << 12; + c |= (res[2] & 0x3f) << 6; + c |= (res[3] & 0x3f); + default: + break; + } +} + +auto nbytes_utf8(char c) noexcept -> int +{ + if (static_cast<unsigned char>(c) <= 127) + return 1; + if ((c & 0xE0) == 0xC0) + return 2; + if ((c & 0xF0) == 0xE0) + return 3; + if ((c & 0xF8) == 0xF0) + return 4; + + return -1; +} + +auto nbytes_point(char32_t c) noexcept -> int +{ + if (c <= 0x7F) + return 1; + if (c <= 0x7FF) + return 2; + if (c <= 0xFFFF) + return 3; + if (c <= 0x1FFFFF) + return 4; + + return -1; +} + +auto length(std::string_view str) -> unsigned +{ + unsigned total = 0; + + for_each(str, [&] (auto) { + ++ total; + }); + + return total; +} + +auto to_utf8(std::u32string_view array) -> std::string +{ + std::string res; + + for (size_t i = 0; i < array.size(); ++i) { + char tmp[5]; + int size = nbytes_point(array[i]); + + if (size < 0) + throw std::invalid_argument("invalid sequence"); + + encode(array[i], tmp); + res.insert(res.length(), tmp); + } + + return res; +} + +auto to_utf32(std::string_view str) -> std::u32string +{ + std::u32string res; + + for_each(str, [&] (char32_t code) { + res.push_back(code); + }); + + return res; +} + +auto toupper(std::u32string_view str) -> std::u32string +{ + std::u32string res(str); + + for (size_t i = 0; i < str.size(); ++i) + res[i] = toupper(str[i]); + + return res; +} + +auto toupper(std::string_view str) -> std::string +{ + std::string res; + char buffer[5]; + + for_each(str, [&] (auto code) { + encode(toupper(code), buffer); + res += buffer; + }); + + return res; +} + +auto tolower(std::u32string_view str) -> std::u32string +{ + std::u32string ret(str); + + for (size_t i = 0; i < str.size(); ++i) + ret[i] = tolower(str[i]); + + return ret; +} + +auto tolower(std::string_view str) -> std::string +{ + std::string res; + char buffer[5]; + + for_each(str, [&] (auto code) { + encode(tolower(code), buffer); + res += buffer; + }); + + return res; +} + +} // !unicode