diff tools/mkunicode/Unicode.h @ 352:7fe8d4094983

Utf8: - Fix invalid decoding from UTF-8 to UTF-32 - Add all files
author David Demelier <markand@malikania.fr>
date Wed, 08 Apr 2015 12:33:45 +0200
parents C++/modules/Utf8/Utf8.h@0b576ee64d45
children b78d6d8f2872
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mkunicode/Unicode.h	Wed Apr 08 12:33:45 2015 +0200
@@ -0,0 +1,241 @@
+/*
+ * Unicode.h -- UTF-8 to UTF-32 conversions and various operations
+ *
+ * Copyright (c) 2013, 2014, 2015 David Demelier <markand@malikania.fr>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _UNICODE_H_
+#define _UNICODE_H_
+
+/**
+ * @file Unicode.h
+ * @brief UTF-8 to UTF-32 conversions
+ */
+
+#include <stdexcept>
+#include <string>
+
+/**
+ * @class Unicode
+ * @brief Conversion between UTF-8 and UTF-32
+ */
+class Unicode {
+private:
+	static void encode(char32_t point, char res[5]) noexcept;
+	static void decode(char32_t &c, const char *res) noexcept;
+
+public:
+	/**
+	 * Get the number of bytes for the first multi byte character from a
+	 * utf-8 string.
+	 *
+	 * This can be used to iterate a valid UTF-8 string to jump to the next
+	 * real character.
+	 *
+	 * @param c the first multi byte character
+	 * @return the number of bytes [1-4]
+	 */
+	static int nbytesUtf8(char c) noexcept;
+
+	/**
+	 * Get the number of bytes for the unicode point.
+	 *
+	 * @param point the unicode point
+	 * @return the number of bytes [1-4] or -1 on invalid
+	 */
+	static int nbytesPoint(char32_t point) noexcept;
+
+	/**
+	 * Get real number of character in a string.
+	 *
+	 * @param str the string
+	 * @return the length
+	 * @throw std::invalid_argument on invalid sequence
+	 */
+	static int length(const std::string &str);
+
+	/**
+	 * Iterate over all real characters in the UTF-8 string.
+	 *
+	 * The function must have the following signature:
+	 *	void f(char ch)
+	 *
+	 * @param str the UTF-8 string
+	 * @throw std::invalid_argument on invalid sequence
+	 */
+	template <typename Func>
+	static void forEach(const std::string &str, Func function)
+	{
+		for (size_t i = 0; i < str.size(); ) {
+			char32_t point = 0;
+			int size = nbytesUtf8(str[i]);
+
+			if (size < 0) {
+				throw std::invalid_argument("invalid sequence");
+			}
+
+			decode(point, str.data() + i);
+			function(point);
+
+			i += size;
+		}
+	}
+
+	/**
+	 * Convert a UTF-32 string to UTF-8 string.
+	 *
+	 * @param array the UTF-32 string
+	 * @return the UTF-8 string
+	 * @throw std::invalid_argument on invalid sequence
+	 */
+	static std::string toUtf8(const std::u32string &array);
+
+	/**
+	 * Convert a UTF-8 string to UTF-32 string.
+	 *
+	 * @param str the UTF-8 string
+	 * @return the UTF-32 string
+	 * @throw std::invalid_argument on invalid sequence
+	 */
+	static std::u32string toUtf32(const std::string &str);
+
+	/**
+	 * Check if the unicode character is space.
+	 *
+	 * @param c the character
+	 * @return true if space
+	 */
+	static bool isspace(char32_t c) noexcept;
+
+	/**
+	 * Check if the unicode character is digit.
+	 *
+	 * @param c the character
+	 * @return true if digit
+	 */
+	static bool isdigit(char32_t c) noexcept;
+
+	/**
+	 * Check if the unicode character is alpha category.
+	 *
+	 * @param c the character
+	 * @return true if alpha
+	 */
+	static bool isalpha(char32_t c) noexcept;
+
+	/**
+	 * Check if the unicode character is upper case.
+	 *
+	 * @param c the character
+	 * @return true if upper case
+	 */
+	static bool isupper(char32_t c) noexcept;
+
+	/**
+	 * Check if the unicode character is lower case.
+	 *
+	 * @param c the character
+	 * @return true if lower case
+	 */
+	static bool islower(char32_t c) noexcept;
+
+	/**
+	 * Check if the unicode character is title case.
+	 *
+	 * @param c the character
+	 * @return true if title case
+	 */
+	static bool istitle(char32_t c) noexcept;
+
+	/**
+	 * Convert to upper case.
+	 *
+	 * @param c the character
+	 * @return the upper case character
+	 */
+	static char32_t toupper(char32_t c) noexcept;
+
+	/**
+	 * Convert to lower case.
+	 *
+	 * @param c the character
+	 * @return the lower case character
+	 */
+	static char32_t tolower(char32_t c) noexcept;
+
+	/**
+	 * Convert to title case.
+	 *
+	 * @param c the character
+	 * @return the title case character
+	 */
+	static char32_t totitle(char32_t c) noexcept;
+
+	/**
+	 * Convert the UTF-8 string to upper case.
+	 *
+	 * @param str the str
+	 * @return the upper case string
+	 * @warning very slow at the moment
+	 */
+	static inline std::string toupper(const std::string &str)
+	{
+		return toUtf8(toupper(toUtf32(str)));
+	}
+
+	/**
+	 * Convert the UTF-32 string to upper case.
+	 *
+	 * @param str the str
+	 * @return the upper case string
+	 */
+	static inline std::u32string toupper(std::u32string str)
+	{
+		for (size_t i = 0; i < str.size(); ++i) {
+			str[i] = toupper(str[i]);
+		}
+
+		return str;
+	}
+
+	/**
+	 * Convert the UTF-8 string to lower case.
+	 *
+	 * @param str the str
+	 * @return the lower case string
+	 * @warning very slow at the moment
+	 */
+	static inline std::string tolower(const std::string &str)
+	{
+		return toUtf8(tolower(toUtf32(str)));
+	}
+
+	/**
+	 * Convert the UTF-32 string to lower case.
+	 *
+	 * @param str the str
+	 * @return the lower case string
+	 */
+	static inline std::u32string tolower(std::u32string str)
+	{
+		for (size_t i = 0; i < str.size(); ++i) {
+			str[i] = tolower(str[i]);
+		}
+
+		return str;
+	}
+};
+
+#endif // !_UTF8_H_