comparison tools/mkunicode/Unicode.h @ 352:7fe8d4094983

Utf8: - Fix invalid decoding from UTF-8 to UTF-32 - Add all files
author David Demelier <markand@malikania.fr>
date Wed, 08 Apr 2015 12:33:45 +0200
parents C++/modules/Utf8/Utf8.h@0b576ee64d45
children b78d6d8f2872
comparison
equal deleted inserted replaced
351:47a206e724f2 352:7fe8d4094983
1 /*
2 * Unicode.h -- UTF-8 to UTF-32 conversions and various operations
3 *
4 * Copyright (c) 2013, 2014, 2015 David Demelier <markand@malikania.fr>
5 *
6 * Permission to use, copy, modify, and/or distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19 #ifndef _UNICODE_H_
20 #define _UNICODE_H_
21
22 /**
23 * @file Unicode.h
24 * @brief UTF-8 to UTF-32 conversions
25 */
26
27 #include <stdexcept>
28 #include <string>
29
30 /**
31 * @class Unicode
32 * @brief Conversion between UTF-8 and UTF-32
33 */
class Unicode {
private:
	/*
	 * NOTE(review): the implementations of encode() and decode() are not
	 * visible in this header. Presumably encode() writes the UTF-8 form of
	 * `point` into `res` and decode() reads one UTF-8 sequence starting at
	 * `res` into `c` -- confirm against the implementation file.
	 *
	 * decode() receives only a pointer and no length, so callers are
	 * responsible for making sure the whole sequence is readable.
	 */
	static void encode(char32_t point, char res[5]) noexcept;
	static void decode(char32_t &c, const char *res) noexcept;

public:
	/**
	 * Get the number of bytes for the first multi byte character from a
	 * UTF-8 string.
	 *
	 * This can be used to iterate a valid UTF-8 string to jump to the next
	 * real character.
	 *
	 * @note forEach() treats a non-positive return value as an invalid
	 * lead byte.
	 *
	 * @param c the first byte of the multi byte character
	 * @return the number of bytes [1-4]
	 */
	static int nbytesUtf8(char c) noexcept;

	/**
	 * Get the number of bytes for the unicode point.
	 *
	 * @param point the unicode point
	 * @return the number of bytes [1-4] or -1 on invalid
	 */
	static int nbytesPoint(char32_t point) noexcept;

	/**
	 * Get the number of real characters in a UTF-8 string (as opposed to
	 * its number of bytes).
	 *
	 * @param str the UTF-8 string
	 * @return the length
	 * @throw std::invalid_argument on invalid sequence
	 */
	static int length(const std::string &str);

	/**
	 * Iterate over all real characters in the UTF-8 string.
	 *
	 * The function must have the following signature:
	 *	void f(char32_t ch)
	 *
	 * It is called once per decoded character, in string order.
	 *
	 * @param str the UTF-8 string
	 * @param function the callable invoked with each decoded character
	 * @throw std::invalid_argument on invalid sequence, including a
	 * multi byte sequence that is cut short by the end of the string
	 */
	template <typename Func>
	static void forEach(const std::string &str, Func function)
	{
		for (size_t i = 0; i < str.size(); ) {
			const int size = nbytesUtf8(str[i]);

			/*
			 * Reject invalid lead bytes (a zero size would also loop
			 * forever) and sequences that announce more bytes than
			 * remain: decode() only gets a pointer, so it would read
			 * past the end of the string otherwise.
			 */
			if (size <= 0 || static_cast<size_t>(size) > str.size() - i) {
				throw std::invalid_argument("invalid sequence");
			}

			char32_t point = 0;

			decode(point, str.data() + i);
			function(point);

			i += static_cast<size_t>(size);
		}
	}

	/**
	 * Convert a UTF-32 string to UTF-8 string.
	 *
	 * @param array the UTF-32 string
	 * @return the UTF-8 string
	 * @throw std::invalid_argument on invalid sequence
	 */
	static std::string toUtf8(const std::u32string &array);

	/**
	 * Convert a UTF-8 string to UTF-32 string.
	 *
	 * @param str the UTF-8 string
	 * @return the UTF-32 string
	 * @throw std::invalid_argument on invalid sequence
	 */
	static std::u32string toUtf32(const std::string &str);

	/**
	 * Check if the unicode character is space.
	 *
	 * @param c the character
	 * @return true if space
	 */
	static bool isspace(char32_t c) noexcept;

	/**
	 * Check if the unicode character is digit.
	 *
	 * @param c the character
	 * @return true if digit
	 */
	static bool isdigit(char32_t c) noexcept;

	/**
	 * Check if the unicode character is alpha category.
	 *
	 * @param c the character
	 * @return true if alpha
	 */
	static bool isalpha(char32_t c) noexcept;

	/**
	 * Check if the unicode character is upper case.
	 *
	 * @param c the character
	 * @return true if upper case
	 */
	static bool isupper(char32_t c) noexcept;

	/**
	 * Check if the unicode character is lower case.
	 *
	 * @param c the character
	 * @return true if lower case
	 */
	static bool islower(char32_t c) noexcept;

	/**
	 * Check if the unicode character is title case.
	 *
	 * @param c the character
	 * @return true if title case
	 */
	static bool istitle(char32_t c) noexcept;

	/**
	 * Convert to upper case.
	 *
	 * @param c the character
	 * @return the upper case character
	 */
	static char32_t toupper(char32_t c) noexcept;

	/**
	 * Convert to lower case.
	 *
	 * @param c the character
	 * @return the lower case character
	 */
	static char32_t tolower(char32_t c) noexcept;

	/**
	 * Convert to title case.
	 *
	 * @param c the character
	 * @return the title case character
	 */
	static char32_t totitle(char32_t c) noexcept;

	/**
	 * Convert the UTF-8 string to upper case.
	 *
	 * The string is converted to UTF-32, upper cased, then converted back
	 * to UTF-8.
	 *
	 * @param str the UTF-8 string
	 * @return the upper case string
	 * @throw std::invalid_argument on invalid sequence (from the
	 * underlying conversions)
	 * @warning very slow at the moment
	 */
	static std::string toupper(const std::string &str)
	{
		return toUtf8(toupper(toUtf32(str)));
	}

	/**
	 * Convert the UTF-32 string to upper case.
	 *
	 * The argument is taken by value, modified in place and returned.
	 *
	 * @param str the UTF-32 string
	 * @return the upper case string
	 */
	static std::u32string toupper(std::u32string str)
	{
		for (char32_t &c : str) {
			c = toupper(c);
		}

		return str;
	}

	/**
	 * Convert the UTF-8 string to lower case.
	 *
	 * The string is converted to UTF-32, lower cased, then converted back
	 * to UTF-8.
	 *
	 * @param str the UTF-8 string
	 * @return the lower case string
	 * @throw std::invalid_argument on invalid sequence (from the
	 * underlying conversions)
	 * @warning very slow at the moment
	 */
	static std::string tolower(const std::string &str)
	{
		return toUtf8(tolower(toUtf32(str)));
	}

	/**
	 * Convert the UTF-32 string to lower case.
	 *
	 * The argument is taken by value, modified in place and returned.
	 *
	 * @param str the UTF-32 string
	 * @return the lower case string
	 */
	static std::u32string tolower(std::u32string str)
	{
		for (char32_t &c : str) {
			c = tolower(c);
		}

		return str;
	}
};
240
241 #endif // !_UNICODE_H_