Mercurial > code
diff tools/mkunicode/Unicode.h @ 352:7fe8d4094983
Utf8:
- Fix invalid decoding from UTF-8 to UTF-32
- Add all files
author | David Demelier <markand@malikania.fr> |
---|---|
date | Wed, 08 Apr 2015 12:33:45 +0200 |
parents | C++/modules/Utf8/Utf8.h@0b576ee64d45 |
children | b78d6d8f2872 |
line wrap: on
line diff
/*
 * Unicode.h -- UTF-8 to UTF-32 conversions and various operations
 *
 * Copyright (c) 2013, 2014, 2015 David Demelier <markand@malikania.fr>
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#ifndef _UNICODE_H_
#define _UNICODE_H_

/**
 * @file Unicode.h
 * @brief UTF-8 to UTF-32 conversions
 */

#include <cstddef>
#include <stdexcept>
#include <string>

/**
 * @class Unicode
 * @brief Conversion between UTF-8 and UTF-32
 *
 * All members are static; the class is only used as a scope for the
 * conversion, classification and case mapping functions.
 */
class Unicode {
private:
	/*
	 * Low level helpers, defined outside of this header.
	 *
	 * NOTE(review): encode() presumably writes the UTF-8 form of point into
	 * res (4 bytes at most plus a terminator, given the size of 5) and
	 * decode() presumably reads the sequence starting at res into c, reading
	 * as many bytes as the lead byte announces -- confirm against the
	 * implementation file.
	 */
	static void encode(char32_t point, char res[5]) noexcept;
	static void decode(char32_t &c, const char *res) noexcept;

public:
	/**
	 * Get the number of bytes of the multi byte character that starts with
	 * the given byte in a UTF-8 string.
	 *
	 * This can be used to iterate a valid UTF-8 string to jump to the next
	 * real character.
	 *
	 * @param c the first byte of the multi byte character
	 * @return the number of bytes [1-4]
	 * @note forEach() treats a non-positive result as an invalid lead byte
	 */
	static int nbytesUtf8(char c) noexcept;

	/**
	 * Get the number of bytes for the unicode point.
	 *
	 * @param point the unicode point
	 * @return the number of bytes [1-4] or -1 on invalid
	 */
	static int nbytesPoint(char32_t point) noexcept;

	/**
	 * Get real number of character in a string.
	 *
	 * @param str the string
	 * @return the length
	 * @throw std::invalid_argument on invalid sequence
	 */
	static int length(const std::string &str);

	/**
	 * Iterate over all real characters in the UTF-8 string.
	 *
	 * The function is called once per decoded code point and must be
	 * callable as:
	 *	void f(char32_t point)
	 *
	 * @param str the UTF-8 string
	 * @param function the function to call for each code point
	 * @throw std::invalid_argument on invalid or truncated sequence
	 */
	template <typename Func>
	static void forEach(const std::string &str, Func function)
	{
		for (std::size_t i = 0; i < str.size(); ) {
			const int size = nbytesUtf8(str[i]);

			/*
			 * Reject invalid lead bytes (a zero size would also never
			 * advance the index) and sequences announced longer than
			 * the bytes left, so that decode() never reads past the
			 * end of the string.
			 */
			if (size <= 0 || static_cast<std::size_t>(size) > str.size() - i) {
				throw std::invalid_argument("invalid sequence");
			}

			char32_t point = 0;

			decode(point, str.data() + i);
			function(point);

			i += static_cast<std::size_t>(size);
		}
	}

	/**
	 * Convert a UTF-32 string to UTF-8 string.
	 *
	 * @param array the UTF-32 string
	 * @return the UTF-8 string
	 * @throw std::invalid_argument on invalid sequence
	 */
	static std::string toUtf8(const std::u32string &array);

	/**
	 * Convert a UTF-8 string to UTF-32 string.
	 *
	 * @param str the UTF-8 string
	 * @return the UTF-32 string
	 * @throw std::invalid_argument on invalid sequence
	 */
	static std::u32string toUtf32(const std::string &str);

	/**
	 * Check if the unicode character is space.
	 *
	 * @param c the character
	 * @return true if space
	 */
	static bool isspace(char32_t c) noexcept;

	/**
	 * Check if the unicode character is digit.
	 *
	 * @param c the character
	 * @return true if digit
	 */
	static bool isdigit(char32_t c) noexcept;

	/**
	 * Check if the unicode character is alpha category.
	 *
	 * @param c the character
	 * @return true if alpha
	 */
	static bool isalpha(char32_t c) noexcept;

	/**
	 * Check if the unicode character is upper case.
	 *
	 * @param c the character
	 * @return true if upper case
	 */
	static bool isupper(char32_t c) noexcept;

	/**
	 * Check if the unicode character is lower case.
	 *
	 * @param c the character
	 * @return true if lower case
	 */
	static bool islower(char32_t c) noexcept;

	/**
	 * Check if the unicode character is title case.
	 *
	 * @param c the character
	 * @return true if title case
	 */
	static bool istitle(char32_t c) noexcept;

	/**
	 * Convert to upper case.
	 *
	 * @param c the character
	 * @return the upper case character
	 */
	static char32_t toupper(char32_t c) noexcept;

	/**
	 * Convert to lower case.
	 *
	 * @param c the character
	 * @return the lower case character
	 */
	static char32_t tolower(char32_t c) noexcept;

	/**
	 * Convert to title case.
	 *
	 * @param c the character
	 * @return the title case character
	 */
	static char32_t totitle(char32_t c) noexcept;

	/**
	 * Convert the UTF-8 string to upper case.
	 *
	 * The string is converted to UTF-32, mapped, then converted back.
	 *
	 * @param str the UTF-8 string
	 * @return the upper case string
	 * @throw std::invalid_argument on invalid sequence
	 * @warning very slow at the moment
	 */
	static std::string toupper(const std::string &str)
	{
		return toUtf8(toupper(toUtf32(str)));
	}

	/**
	 * Convert the UTF-32 string to upper case.
	 *
	 * @param str the UTF-32 string (taken by value and modified in place)
	 * @return the upper case string
	 */
	static std::u32string toupper(std::u32string str)
	{
		for (char32_t &c : str) {
			c = toupper(c);
		}

		return str;
	}

	/**
	 * Convert the UTF-8 string to lower case.
	 *
	 * The string is converted to UTF-32, mapped, then converted back.
	 *
	 * @param str the UTF-8 string
	 * @return the lower case string
	 * @throw std::invalid_argument on invalid sequence
	 * @warning very slow at the moment
	 */
	static std::string tolower(const std::string &str)
	{
		return toUtf8(tolower(toUtf32(str)));
	}

	/**
	 * Convert the UTF-32 string to lower case.
	 *
	 * @param str the UTF-32 string (taken by value and modified in place)
	 * @return the lower case string
	 */
	static std::u32string tolower(std::u32string str)
	{
		for (char32_t &c : str) {
			c = tolower(c);
		}

		return str;
	}
};

#endif // !_UNICODE_H_