0
|
1 /* |
|
2 * unicode.hpp -- UTF-8 to UTF-32 conversions and various operations |
|
3 * |
1
|
4 * Copyright (c) 2013-2017 David Demelier <markand@malikania.fr> |
0
|
5 * |
|
6 * Permission to use, copy, modify, and/or distribute this software for any |
|
7 * purpose with or without fee is hereby granted, provided that the above |
|
8 * copyright notice and this permission notice appear in all copies. |
|
9 * |
|
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
|
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
|
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
|
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
|
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
|
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
|
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
|
17 */ |
|
18 |
|
19 #ifndef UNICODE_HPP |
|
20 #define UNICODE_HPP |
|
21 |
|
22 /** |
|
23 * \file unicode.hpp |
|
24 * \brief UTF-8 to UTF-32 conversions |
|
25 * \author David Demelier <markand@malikania.fr> |
|
26 * \warning These files are auto-generated! |
|
27 */ |
|
28 |
|
29 /** |
|
30 * \page unicode Basic unicode management. |
|
31 * |
|
32 * ## Export macros |
|
33 * |
|
34 * You must define `UNICODE_DLL` globally and `UNICODE_BUILDING_DLL` when compiling the library if you want a DLL, alternatively you can provide |
|
35 * your own `UNICODE_EXPORT` macro instead. |
|
36 */ |
|
37 |
|
38 /** |
|
39 * \cond UNICODE_HIDDEN_SYMBOLS |
|
40 */ |
|
41 |
|
#if !defined(UNICODE_EXPORT)
#   if defined(UNICODE_DLL) && defined(_WIN32) && defined(UNICODE_BUILDING_DLL)
        /* DLL requested on Windows and the library itself is being compiled. */
#       define UNICODE_EXPORT __declspec(dllexport)
#   elif defined(UNICODE_DLL) && defined(_WIN32)
        /* DLL requested on Windows, library not being compiled: import side. */
#       define UNICODE_EXPORT __declspec(dllimport)
#   else
        /* No DLL requested, or not on Windows: the macro expands to nothing. */
#       define UNICODE_EXPORT
#   endif
#endif
|
57 |
|
58 /** |
|
59 * \endcond |
|
60 */ |
|
61 |
|
62 #include <stdexcept> |
|
63 #include <string> |
|
64 |
|
65 /** |
|
66 * \brief Unicode namespace. |
|
67 */ |
|
68 namespace unicode { |
|
69 |
|
70 /** |
|
71 * Encode the unicode code point into multibyte string. |
|
72 * |
|
73 * \param point the unicode code point |
|
74 * \param res the output buffer |
|
75 */ |
|
76 UNICODE_EXPORT void encode(char32_t point, char res[5]) noexcept; |
|
77 |
|
78 /** |
|
79 * Decode the multibyte buffer into an unicode code point. |
|
80 * |
|
81 * \param c the code point destination |
|
82 * \param res the multibyte string. |
|
83 */ |
|
84 UNICODE_EXPORT void decode(char32_t &c, const char *res) noexcept; |
|
85 |
|
86 /** |
|
87 * Get the number of bytes for the first multi byte character from a |
|
88 * utf-8 string. |
|
89 * |
|
90 * This can be used to iterate a valid UTF-8 string to jump to the next |
|
91 * real character. |
|
92 * |
|
93 * \param c the first multi byte character |
|
94 * \return the number of bytes [1-4] or -1 if invalid |
|
95 */ |
|
96 UNICODE_EXPORT int nbytesUtf8(char c) noexcept; |
|
97 |
|
98 /** |
|
99 * Get the number of bytes for the unicode point. |
|
100 * |
|
101 * \param point the unicode point |
|
102 * \return the number of bytes [1-4] or -1 if invalid |
|
103 */ |
|
104 UNICODE_EXPORT int nbytesPoint(char32_t point) noexcept; |
|
105 |
|
106 /** |
|
107 * Get real number of character in a string. |
|
108 * |
|
109 * \param str the string |
|
110 * \return the length |
|
111 * \throw std::invalid_argument on invalid sequence |
|
112 */ |
|
113 UNICODE_EXPORT unsigned length(const std::string &str); |
|
114 |
|
115 /** |
|
116 * Iterate over all real characters in the UTF-8 string. |
|
117 * |
|
118 * The function must have the following signature: |
|
119 * void f(char ch) |
|
120 * |
|
121 * \param str the UTF-8 string |
|
122 * \param function the function callback |
|
123 * \throw std::invalid_argument on invalid sequence |
|
124 */ |
|
125 template <typename Func> |
|
126 void forEach(const std::string &str, Func function) |
|
127 { |
|
128 for (size_t i = 0; i < str.size(); ) { |
|
129 char32_t point = 0; |
|
130 int size = nbytesUtf8(str[i]); |
|
131 |
|
132 if (size < 0) |
|
133 throw std::invalid_argument("invalid sequence"); |
|
134 |
|
135 decode(point, str.data() + i); |
|
136 function(point); |
|
137 |
|
138 i += size; |
|
139 } |
|
140 } |
|
141 |
|
142 /** |
|
143 * Convert a UTF-32 string to UTF-8 string. |
|
144 * |
|
145 * \param array the UTF-32 string |
|
146 * \return the UTF-8 string |
|
147 * \throw std::invalid_argument on invalid sequence |
|
148 */ |
|
149 UNICODE_EXPORT std::string toUtf8(const std::u32string &array); |
|
150 |
|
151 /** |
|
152 * Convert a UTF-8 string to UTF-32 string. |
|
153 * |
|
154 * \param str the UTF-8 string |
|
155 * \return the UTF-32 string |
|
156 * \throw std::invalid_argument on invalid sequence |
|
157 */ |
|
158 UNICODE_EXPORT std::u32string toUtf32(const std::string &str); |
|
159 |
|
160 /** |
|
161 * Check if the unicode character is space. |
|
162 * |
|
163 * \param c the character |
|
164 * \return true if space |
|
165 */ |
|
166 UNICODE_EXPORT bool isspace(char32_t c) noexcept; |
|
167 |
|
168 /** |
|
169 * Check if the unicode character is digit. |
|
170 * |
|
171 * \param c the character |
|
172 * \return true if digit |
|
173 */ |
|
174 UNICODE_EXPORT bool isdigit(char32_t c) noexcept; |
|
175 |
|
176 /** |
|
177 * Check if the unicode character is alpha category. |
|
178 * |
|
179 * \param c the character |
|
180 * \return true if alpha |
|
181 */ |
|
182 UNICODE_EXPORT bool isalpha(char32_t c) noexcept; |
|
183 |
|
184 /** |
|
185 * Check if the unicode character is upper case. |
|
186 * |
|
187 * \param c the character |
|
188 * \return true if upper case |
|
189 */ |
|
190 UNICODE_EXPORT bool isupper(char32_t c) noexcept; |
|
191 |
|
192 /** |
|
193 * Check if the unicode character is lower case. |
|
194 * |
|
195 * \param c the character |
|
196 * \return true if lower case |
|
197 */ |
|
198 UNICODE_EXPORT bool islower(char32_t c) noexcept; |
|
199 |
|
200 /** |
|
201 * Check if the unicode character is title case. |
|
202 * |
|
203 * \param c the character |
|
204 * \return true if title case |
|
205 */ |
|
206 UNICODE_EXPORT bool istitle(char32_t c) noexcept; |
|
207 |
|
208 /** |
|
209 * Convert to upper case. |
|
210 * |
|
211 * \param c the character |
|
212 * \return the upper case character |
|
213 */ |
|
214 UNICODE_EXPORT char32_t toupper(char32_t c) noexcept; |
|
215 |
|
216 /** |
|
217 * Convert to lower case. |
|
218 * |
|
219 * \param c the character |
|
220 * \return the lower case character |
|
221 */ |
|
222 UNICODE_EXPORT char32_t tolower(char32_t c) noexcept; |
|
223 |
|
224 /** |
|
225 * Convert to title case. |
|
226 * |
|
227 * \param c the character |
|
228 * \return the title case character |
|
229 */ |
|
230 UNICODE_EXPORT char32_t totitle(char32_t c) noexcept; |
|
231 |
|
/**
 * Upper-case every character of a UTF-32 string.
 *
 * The string is received by value, each of its characters is replaced by the
 * result of toupper(char32_t), and the modified copy is returned.
 *
 * \param str the string to convert
 * \return the upper case string
 */
inline std::u32string toupper(std::u32string str)
{
    for (char32_t &ch : str)
        ch = toupper(ch);

    return str;
}
|
245 |
|
246 /** |
|
247 * Convert the UTF-8 string to upper case. |
|
248 * |
|
249 * \param str the str |
|
250 * \return the upper case string |
|
251 * \warning very slow at the moment |
|
252 */ |
|
253 inline std::string toupper(const std::string &str) |
|
254 { |
|
255 std::string result; |
|
256 char buffer[5]; |
|
257 |
|
258 forEach(str, [&] (char32_t code) { |
|
259 encode(toupper(code), buffer); |
|
260 result += buffer; |
|
261 }); |
|
262 |
|
263 return result; |
|
264 } |
|
265 |
|
/**
 * Lower-case every character of a UTF-32 string.
 *
 * The string is received by value, each of its characters is replaced by the
 * result of tolower(char32_t), and the modified copy is returned.
 *
 * \param str the string to convert
 * \return the lower case string
 */
inline std::u32string tolower(std::u32string str)
{
    for (char32_t &ch : str)
        ch = tolower(ch);

    return str;
}
|
279 |
|
280 /** |
|
281 * Convert the UTF-8 string to lower case. |
|
282 * |
|
283 * \param str the str |
|
284 * \return the lower case string |
|
285 * \warning very slow at the moment |
|
286 */ |
|
287 inline std::string tolower(const std::string &str) |
|
288 { |
|
289 std::string result; |
|
290 char buffer[5]; |
|
291 |
|
292 forEach(str, [&] (char32_t code) { |
|
293 encode(tolower(code), buffer); |
|
294 result += buffer; |
|
295 }); |
|
296 |
|
297 return result; |
|
298 } |
|
299 |
|
300 } // !unicode |
|
301 |
|
302 #endif // !UNICODE_HPP |