Mercurial > code
comparison tools/mkunicode/Unicode.h @ 352:7fe8d4094983
Utf8:
- Fix invalid decoding from UTF-8 to UTF-32
- Add all files
author | David Demelier <markand@malikania.fr> |
---|---|
date | Wed, 08 Apr 2015 12:33:45 +0200 |
parents | C++/modules/Utf8/Utf8.h@0b576ee64d45 |
children | b78d6d8f2872 |
comparison
equal
deleted
inserted
replaced
351:47a206e724f2 | 352:7fe8d4094983 |
---|---|
1 /* | |
2 * Unicode.h -- UTF-8 to UTF-32 conversions and various operations | |
3 * | |
4 * Copyright (c) 2013, 2014, 2015 David Demelier <markand@malikania.fr> | |
5 * | |
6 * Permission to use, copy, modify, and/or distribute this software for any | |
7 * purpose with or without fee is hereby granted, provided that the above | |
8 * copyright notice and this permission notice appear in all copies. | |
9 * | |
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
17 */ | |
18 | |
19 #ifndef _UNICODE_H_ | |
20 #define _UNICODE_H_ | |
21 | |
22 /** | |
23 * @file Unicode.h | |
24 * @brief UTF-8 to UTF-32 conversions | |
25 */ | |
26 | |
27 #include <stdexcept> | |
28 #include <string> | |
29 | |
/**
 * @class Unicode
 * @brief Conversion between UTF-8 and UTF-32
 *
 * Every member is static; the class only groups the conversion helpers, the
 * character classification predicates and the case mapping functions.
 */
class Unicode {
private:
	/**
	 * Encode one code point as UTF-8 into a caller-provided buffer.
	 *
	 * @param point the unicode point
	 * @param res the destination buffer (5 chars; presumably up to 4 bytes
	 *            plus a terminator -- confirm against the implementation)
	 */
	static void encode(char32_t point, char res[5]) noexcept;

	/**
	 * Decode the UTF-8 sequence starting at res into a code point.
	 *
	 * Only a pointer is received, so this function cannot bound its own
	 * reads: the caller must guarantee that the complete sequence is
	 * available.
	 *
	 * @param c the resulting code point
	 * @param res the first byte of the sequence
	 */
	static void decode(char32_t &c, const char *res) noexcept;

public:
	/**
	 * Get the number of bytes of the multi byte character introduced by
	 * the given lead byte.
	 *
	 * This can be used to iterate a valid UTF-8 string to jump to the next
	 * real character.
	 *
	 * @param c the first byte of the multi byte character
	 * @return the number of bytes [1-4]; forEach treats a negative value
	 *         as an invalid lead byte
	 */
	static int nbytesUtf8(char c) noexcept;

	/**
	 * Get the number of bytes for the unicode point.
	 *
	 * @param point the unicode point
	 * @return the number of bytes [1-4] or -1 on invalid
	 */
	static int nbytesPoint(char32_t point) noexcept;

	/**
	 * Get the real number of characters in a UTF-8 string.
	 *
	 * @param str the string
	 * @return the length
	 * @throw std::invalid_argument on invalid sequence
	 */
	static int length(const std::string &str);

	/**
	 * Iterate over all real characters in the UTF-8 string.
	 *
	 * The function is called once per decoded code point and must have
	 * the following signature:
	 *	void f(char32_t ch)
	 *
	 * @param str the UTF-8 string
	 * @param function the function to call for each code point
	 * @throw std::invalid_argument on invalid or truncated sequence
	 */
	template <typename Func>
	static void forEach(const std::string &str, Func function)
	{
		typedef std::string::size_type Size;

		const Size total = str.size();

		for (Size i = 0; i < total; ) {
			const int size = nbytesUtf8(str[i]);

			// Zero is rejected as well: it would never advance the index.
			if (size <= 0) {
				throw std::invalid_argument("invalid sequence");
			}

			// The lead byte announces more bytes than the string still
			// holds; decoding would read past the end of the buffer.
			if (static_cast<Size>(size) > total - i) {
				throw std::invalid_argument("invalid sequence");
			}

			char32_t point = 0;

			decode(point, str.data() + i);
			function(point);

			i += static_cast<Size>(size);
		}
	}

	/**
	 * Convert a UTF-32 string to UTF-8 string.
	 *
	 * @param array the UTF-32 string
	 * @return the UTF-8 string
	 * @throw std::invalid_argument on invalid sequence
	 */
	static std::string toUtf8(const std::u32string &array);

	/**
	 * Convert a UTF-8 string to UTF-32 string.
	 *
	 * @param str the UTF-8 string
	 * @return the UTF-32 string
	 * @throw std::invalid_argument on invalid sequence
	 */
	static std::u32string toUtf32(const std::string &str);

	/**
	 * Check if the unicode character is space.
	 *
	 * @param c the character
	 * @return true if space
	 */
	static bool isspace(char32_t c) noexcept;

	/**
	 * Check if the unicode character is digit.
	 *
	 * @param c the character
	 * @return true if digit
	 */
	static bool isdigit(char32_t c) noexcept;

	/**
	 * Check if the unicode character is alpha category.
	 *
	 * @param c the character
	 * @return true if alpha
	 */
	static bool isalpha(char32_t c) noexcept;

	/**
	 * Check if the unicode character is upper case.
	 *
	 * @param c the character
	 * @return true if upper case
	 */
	static bool isupper(char32_t c) noexcept;

	/**
	 * Check if the unicode character is lower case.
	 *
	 * @param c the character
	 * @return true if lower case
	 */
	static bool islower(char32_t c) noexcept;

	/**
	 * Check if the unicode character is title case.
	 *
	 * @param c the character
	 * @return true if title case
	 */
	static bool istitle(char32_t c) noexcept;

	/**
	 * Convert to upper case.
	 *
	 * @param c the character
	 * @return the upper case character
	 */
	static char32_t toupper(char32_t c) noexcept;

	/**
	 * Convert to lower case.
	 *
	 * @param c the character
	 * @return the lower case character
	 */
	static char32_t tolower(char32_t c) noexcept;

	/**
	 * Convert to title case.
	 *
	 * @param c the character
	 * @return the title case character
	 */
	static char32_t totitle(char32_t c) noexcept;

	/**
	 * Convert the UTF-8 string to upper case.
	 *
	 * The string is converted to UTF-32, mapped, then converted back.
	 *
	 * @param str the UTF-8 string
	 * @return the upper case string
	 * @throw std::invalid_argument on invalid sequence (from toUtf32 / toUtf8)
	 * @warning very slow at the moment
	 */
	static std::string toupper(const std::string &str)
	{
		return toUtf8(toupper(toUtf32(str)));
	}

	/**
	 * Convert the UTF-32 string to upper case.
	 *
	 * @param str the UTF-32 string (taken by value and modified in place)
	 * @return the upper case string
	 */
	static std::u32string toupper(std::u32string str)
	{
		for (char32_t &c : str) {
			c = toupper(c);
		}

		return str;
	}

	/**
	 * Convert the UTF-8 string to lower case.
	 *
	 * The string is converted to UTF-32, mapped, then converted back.
	 *
	 * @param str the UTF-8 string
	 * @return the lower case string
	 * @throw std::invalid_argument on invalid sequence (from toUtf32 / toUtf8)
	 * @warning very slow at the moment
	 */
	static std::string tolower(const std::string &str)
	{
		return toUtf8(tolower(toUtf32(str)));
	}

	/**
	 * Convert the UTF-32 string to lower case.
	 *
	 * @param str the UTF-32 string (taken by value and modified in place)
	 * @return the lower case string
	 */
	static std::u32string tolower(std::u32string str)
	{
		for (char32_t &c : str) {
			c = tolower(c);
		}

		return str;
	}
};
240 | |
241 #endif // !_UNICODE_H_ |