0
|
1 /* |
|
2 * unicode.hpp -- UTF-8 to UTF-32 conversions and various operations |
|
3 * |
1
|
4 * Copyright (c) 2013-2017 David Demelier <markand@malikania.fr> |
0
|
5 * |
|
6 * Permission to use, copy, modify, and/or distribute this software for any |
|
7 * purpose with or without fee is hereby granted, provided that the above |
|
8 * copyright notice and this permission notice appear in all copies. |
|
9 * |
|
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
|
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
|
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
|
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
|
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
|
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
|
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
|
17 */ |
|
18 |
|
19 #ifndef UNICODE_HPP |
|
20 #define UNICODE_HPP |
|
21 |
|
22 /** |
|
23 * \file unicode.hpp |
|
24 * \brief UTF-8 to UTF-32 conversions |
|
25 * \author David Demelier <markand@malikania.fr> |
|
26 * \warning These files are auto-generated! |
|
27 */ |
|
28 |
|
29 /** |
|
30 * \page unicode Basic unicode management. |
|
31 * |
|
32 * ## Export macros |
|
33 * |
2
|
34 * You must define `UNICODE_DLL` globally and `UNICODE_BUILDING_DLL` when |
|
35 * compiling the library if you want a DLL, alternatively you can provide |
0
|
36 * your own `UNICODE_EXPORT` macro instead. |
|
37 */ |
|
38 |
|
39 /** |
|
40 * \cond UNICODE_HIDDEN_SYMBOLS |
|
41 */ |
|
42 |
|
/*
 * Select the symbol export decoration unless the user already supplied one.
 *
 * Only a DLL build on Windows needs a decoration: dllexport while building
 * the library itself, dllimport when consuming it. Every other configuration
 * gets an empty macro.
 */
#ifndef UNICODE_EXPORT
#   if defined(UNICODE_DLL) && defined(_WIN32)
#       if defined(UNICODE_BUILDING_DLL)
#           define UNICODE_EXPORT __declspec(dllexport)
#       else
#           define UNICODE_EXPORT __declspec(dllimport)
#       endif
#   else
#       define UNICODE_EXPORT
#   endif
#endif
|
58 |
|
59 /** |
|
60 * \endcond |
|
61 */ |
|
62 |
|
63 #include <stdexcept> |
|
64 #include <string> |
|
65 |
|
66 /** |
|
67 * \brief Unicode namespace. |
|
68 */ |
|
69 namespace unicode { |
|
70 |
|
71 /** |
|
72 * Encode the unicode code point into a multibyte string. |
|
73 * |
|
74 * \param point the unicode code point |
|
75 * \param res the output buffer |
|
76 */ |
|
77 UNICODE_EXPORT void encode(char32_t point, char res[5]) noexcept; |
|
78 |
|
79 /** |
|
80 * Decode the multibyte buffer into a unicode code point. |
|
81 * |
|
82 * \param c the code point destination |
|
83 * \param res the multibyte string. |
|
84 */ |
2
|
85 UNICODE_EXPORT void decode(char32_t& c, const char* res) noexcept; |
0
|
86 |
|
87 /** |
|
88 * Get the number of bytes for the first multi byte character from a |
|
89 * utf-8 string. |
|
90 * |
|
91 * This can be used to iterate a valid UTF-8 string to jump to the next |
|
92 * real character. |
|
93 * |
|
94 * \param c the first byte of the multi byte character |
|
95 * \return the number of bytes [1-4] or -1 if invalid |
|
96 */ |
2
|
97 UNICODE_EXPORT int nbytes_utf8(char c) noexcept; |
0
|
98 |
|
99 /** |
|
100 * Get the number of bytes for the unicode point. |
|
101 * |
|
102 * \param point the unicode point |
|
103 * \return the number of bytes [1-4] or -1 if invalid |
|
104 */ |
2
|
105 UNICODE_EXPORT int nbytes_point(char32_t point) noexcept; |
0
|
106 |
|
107 /** |
|
108 * Get the real number of characters in a UTF-8 string. |
|
109 * |
|
110 * \param str the string |
|
111 * \return the length |
|
112 * \throw std::invalid_argument on invalid sequence |
|
113 */ |
2
|
114 UNICODE_EXPORT unsigned length(const std::string& str); |
0
|
115 |
|
116 /** |
|
117 * Iterate over all real characters in the UTF-8 string. |
|
118 * |
|
119 * The function must have the following signature: |
|
120 * void f(char ch) |
|
121 * |
|
122 * \param str the UTF-8 string |
|
123 * \param function the function callback |
|
124 * \throw std::invalid_argument on invalid sequence |
|
125 */ |
|
126 template <typename Func> |
2
|
127 void for_each(const std::string& str, Func function) |
0
|
128 { |
|
129 for (size_t i = 0; i < str.size(); ) { |
|
130 char32_t point = 0; |
2
|
131 int size = nbytes_utf8(str[i]); |
0
|
132 |
2
|
133 if (size < 0) { |
0
|
134 throw std::invalid_argument("invalid sequence"); |
2
|
135 } |
0
|
136 |
|
137 decode(point, str.data() + i); |
|
138 function(point); |
|
139 |
|
140 i += size; |
|
141 } |
|
142 } |
|
143 |
|
144 /** |
|
145 * Convert a UTF-32 string to UTF-8 string. |
|
146 * |
|
147 * \param array the UTF-32 string |
|
148 * \return the UTF-8 string |
|
149 * \throw std::invalid_argument on invalid sequence |
|
150 */ |
2
|
151 UNICODE_EXPORT std::string to_utf8(const std::u32string& array); |
0
|
152 |
|
153 /** |
|
154 * Convert a UTF-8 string to UTF-32 string. |
|
155 * |
|
156 * \param str the UTF-8 string |
|
157 * \return the UTF-32 string |
|
158 * \throw std::invalid_argument on invalid sequence |
|
159 */ |
2
|
160 UNICODE_EXPORT std::u32string to_utf32(const std::string& str); |
0
|
161 |
|
162 /** |
|
163 * Check if the unicode character is space. |
|
164 * |
|
165 * \param c the character |
|
166 * \return true if space |
|
167 */ |
|
168 UNICODE_EXPORT bool isspace(char32_t c) noexcept; |
|
169 |
|
170 /** |
|
171 * Check if the unicode character is digit. |
|
172 * |
|
173 * \param c the character |
|
174 * \return true if digit |
|
175 */ |
|
176 UNICODE_EXPORT bool isdigit(char32_t c) noexcept; |
|
177 |
|
178 /** |
|
179 * Check if the unicode character is alpha category. |
|
180 * |
|
181 * \param c the character |
|
182 * \return true if alpha |
|
183 */ |
|
184 UNICODE_EXPORT bool isalpha(char32_t c) noexcept; |
|
185 |
|
186 /** |
|
187 * Check if the unicode character is upper case. |
|
188 * |
|
189 * \param c the character |
|
190 * \return true if upper case |
|
191 */ |
|
192 UNICODE_EXPORT bool isupper(char32_t c) noexcept; |
|
193 |
|
194 /** |
|
195 * Check if the unicode character is lower case. |
|
196 * |
|
197 * \param c the character |
|
198 * \return true if lower case |
|
199 */ |
|
200 UNICODE_EXPORT bool islower(char32_t c) noexcept; |
|
201 |
|
202 /** |
|
203 * Check if the unicode character is title case. |
|
204 * |
|
205 * \param c the character |
|
206 * \return true if title case |
|
207 */ |
|
208 UNICODE_EXPORT bool istitle(char32_t c) noexcept; |
|
209 |
|
210 /** |
|
211 * Convert to upper case. |
|
212 * |
|
213 * \param c the character |
|
214 * \return the upper case character |
|
215 */ |
|
216 UNICODE_EXPORT char32_t toupper(char32_t c) noexcept; |
|
217 |
|
218 /** |
|
219 * Convert to lower case. |
|
220 * |
|
221 * \param c the character |
|
222 * \return the lower case character |
|
223 */ |
|
224 UNICODE_EXPORT char32_t tolower(char32_t c) noexcept; |
|
225 |
|
226 /** |
|
227 * Convert to title case. |
|
228 * |
|
229 * \param c the character |
|
230 * \return the title case character |
|
231 */ |
|
232 UNICODE_EXPORT char32_t totitle(char32_t c) noexcept; |
|
233 |
|
/**
 * Convert the UTF-32 string to upper case.
 *
 * The string is taken by value, every code point of the copy is replaced by
 * the result of the single character toupper and the copy is returned.
 *
 * \param str the UTF-32 string
 * \return the upper case string
 */
inline std::u32string toupper(std::u32string str)
{
    for (char32_t& point : str) {
        point = toupper(point);
    }

    return str;
}
|
248 |
|
249 /** |
|
250 * Convert the UTF-8 string to upper case. |
|
251 * |
|
252 * \param str the str |
|
253 * \return the upper case string |
|
254 * \warning very slow at the moment |
|
255 */ |
2
|
256 inline std::string toupper(const std::string& str) |
0
|
257 { |
|
258 std::string result; |
|
259 char buffer[5]; |
|
260 |
2
|
261 for_each(str, [&] (char32_t code) { |
0
|
262 encode(toupper(code), buffer); |
|
263 result += buffer; |
|
264 }); |
|
265 |
|
266 return result; |
|
267 } |
|
268 |
|
/**
 * Convert the UTF-32 string to lower case.
 *
 * The string is taken by value, every code point of the copy is replaced by
 * the result of the single character tolower and the copy is returned.
 *
 * \param str the UTF-32 string
 * \return the lower case string
 */
inline std::u32string tolower(std::u32string str)
{
    for (char32_t& point : str) {
        point = tolower(point);
    }

    return str;
}
|
283 |
|
284 /** |
|
285 * Convert the UTF-8 string to lower case. |
|
286 * |
|
287 * \param str the str |
|
288 * \return the lower case string |
|
289 * \warning very slow at the moment |
|
290 */ |
2
|
291 inline std::string tolower(const std::string& str) |
0
|
292 { |
|
293 std::string result; |
|
294 char buffer[5]; |
|
295 |
2
|
296 for_each(str, [&] (char32_t code) { |
0
|
297 encode(tolower(code), buffer); |
|
298 result += buffer; |
|
299 }); |
|
300 |
|
301 return result; |
|
302 } |
|
303 |
|
304 } // !unicode |
|
305 |
|
306 #endif // !UNICODE_HPP |