0
|
1 /* |
|
2 * unicode.hpp -- UTF-8 to UTF-32 conversions and various operations |
|
3 * |
1
|
4 * Copyright (c) 2013-2017 David Demelier <markand@malikania.fr> |
0
|
5 * |
|
6 * Permission to use, copy, modify, and/or distribute this software for any |
|
7 * purpose with or without fee is hereby granted, provided that the above |
|
8 * copyright notice and this permission notice appear in all copies. |
|
9 * |
|
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
|
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
|
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
|
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
|
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
|
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
|
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
|
17 */ |
|
18 |
|
19 #ifndef UNICODE_HPP |
|
20 #define UNICODE_HPP |
|
21 |
|
22 /** |
|
23 * \file unicode.hpp |
|
24 * \brief UTF-8 to UTF-32 conversions |
|
25 * \author David Demelier <markand@malikania.fr> |
|
26 * \warning These files are auto-generated! |
|
27 */ |
|
28 |
|
29 /** |
|
30 * \page unicode Basic unicode management. |
|
31 * |
|
32 * ## Export macros |
|
33 * |
2
|
34 * You must define `UNICODE_DLL` globally and `UNICODE_BUILDING_DLL` when |
|
35 * compiling the library if you want a DLL; alternatively, you can provide |
0
|
36 * your own `UNICODE_EXPORT` macro instead. |
|
37 */ |
|
38 |
|
39 /** |
|
40 * \cond UNICODE_HIDDEN_SYMBOLS |
|
41 */ |
|
42 |
|
43 #if !defined(UNICODE_EXPORT) |
|
44 # if defined(UNICODE_DLL) |
|
45 # if defined(_WIN32) |
|
46 # if defined(UNICODE_BUILDING_DLL) |
|
47 # define UNICODE_EXPORT __declspec(dllexport) |
|
48 # else |
|
49 # define UNICODE_EXPORT __declspec(dllimport) |
|
50 # endif |
|
51 # else |
|
52 # define UNICODE_EXPORT |
|
53 # endif |
|
54 # else |
|
55 # define UNICODE_EXPORT |
|
56 # endif |
|
57 #endif |
|
58 |
|
59 /** |
|
60 * \endcond |
|
61 */ |
|
62 |
|
63 #include <stdexcept> |
|
64 #include <string> |
|
65 |
|
66 /** |
|
67 * \brief Unicode namespace. |
|
68 */ |
|
69 namespace unicode { |
|
70 |
|
71 /** |
|
72 * Encode the unicode code point into multibyte string. |
|
73 * |
|
74 * \param point the unicode code point |
|
75 * \param res the output buffer |
|
76 */ |
|
77 UNICODE_EXPORT void encode(char32_t point, char res[5]) noexcept; |
|
78 |
|
79 /** |
|
80 * Decode the multibyte buffer into a Unicode code point. |
|
81 * |
|
82 * \param c the code point destination |
|
83 * \param res the multibyte string. |
|
84 */ |
2
|
85 UNICODE_EXPORT void decode(char32_t& c, const char* res) noexcept; |
0
|
86 |
|
87 /** |
|
88 * Get the number of bytes for the first multi byte character from a |
|
89 * utf-8 string. |
|
90 * |
|
91 * This can be used to iterate a valid UTF-8 string to jump to the next |
|
92 * real character. |
|
93 * |
|
94 * \param c the first multi byte character |
|
95 * \return the number of bytes [1-4] or -1 if invalid |
|
96 */ |
2
|
97 UNICODE_EXPORT int nbytes_utf8(char c) noexcept; |
0
|
98 |
|
99 /** |
|
100 * Get the number of bytes for the unicode point. |
|
101 * |
|
102 * \param point the unicode point |
|
103 * \return the number of bytes [1-4] or -1 if invalid |
|
104 */ |
2
|
105 UNICODE_EXPORT int nbytes_point(char32_t point) noexcept; |
0
|
106 |
|
107 /** |
|
108 * Get the real number of characters in a string. |
|
109 * |
|
110 * \param str the string |
|
111 * \return the length |
|
112 * \throw std::invalid_argument on invalid sequence |
|
113 */ |
2
|
114 UNICODE_EXPORT unsigned length(const std::string& str); |
0
|
115 |
|
116 /** |
|
117 * Iterate over all real characters in the UTF-8 string. |
|
118 * |
|
119 * The function must have the following signature: |
|
120 * void f(char ch) |
|
121 * |
|
122 * \param str the UTF-8 string |
|
123 * \param function the function callback |
|
124 * \throw std::invalid_argument on invalid sequence |
|
125 */ |
|
126 template <typename Func> |
2
|
127 void for_each(const std::string& str, Func function) |
0
|
128 { |
|
129 for (size_t i = 0; i < str.size(); ) { |
|
130 char32_t point = 0; |
2
|
131 int size = nbytes_utf8(str[i]); |
0
|
132 |
3
|
133 if (size < 0) |
0
|
134 throw std::invalid_argument("invalid sequence"); |
|
135 |
|
136 decode(point, str.data() + i); |
|
137 function(point); |
|
138 |
|
139 i += size; |
|
140 } |
|
141 } |
|
142 |
|
143 /** |
|
144 * Convert a UTF-32 string to UTF-8 string. |
|
145 * |
|
146 * \param array the UTF-32 string |
|
147 * \return the UTF-8 string |
|
148 * \throw std::invalid_argument on invalid sequence |
|
149 */ |
2
|
150 UNICODE_EXPORT std::string to_utf8(const std::u32string& array); |
0
|
151 |
|
152 /** |
|
153 * Convert a UTF-8 string to UTF-32 string. |
|
154 * |
|
155 * \param str the UTF-8 string |
|
156 * \return the UTF-32 string |
|
157 * \throw std::invalid_argument on invalid sequence |
|
158 */ |
2
|
159 UNICODE_EXPORT std::u32string to_utf32(const std::string& str); |
0
|
160 |
|
161 /** |
|
162 * Check if the unicode character is space. |
|
163 * |
|
164 * \param c the character |
|
165 * \return true if space |
|
166 */ |
|
167 UNICODE_EXPORT bool isspace(char32_t c) noexcept; |
|
168 |
|
169 /** |
|
170 * Check if the unicode character is digit. |
|
171 * |
|
172 * \param c the character |
|
173 * \return true if digit |
|
174 */ |
|
175 UNICODE_EXPORT bool isdigit(char32_t c) noexcept; |
|
176 |
|
177 /** |
|
178 * Check if the unicode character is alpha category. |
|
179 * |
|
180 * \param c the character |
|
181 * \return true if alpha |
|
182 */ |
|
183 UNICODE_EXPORT bool isalpha(char32_t c) noexcept; |
|
184 |
|
185 /** |
|
186 * Check if the unicode character is upper case. |
|
187 * |
|
188 * \param c the character |
|
189 * \return true if upper case |
|
190 */ |
|
191 UNICODE_EXPORT bool isupper(char32_t c) noexcept; |
|
192 |
|
193 /** |
|
194 * Check if the unicode character is lower case. |
|
195 * |
|
196 * \param c the character |
|
197 * \return true if lower case |
|
198 */ |
|
199 UNICODE_EXPORT bool islower(char32_t c) noexcept; |
|
200 |
|
201 /** |
|
202 * Check if the unicode character is title case. |
|
203 * |
|
204 * \param c the character |
|
205 * \return true if title case |
|
206 */ |
|
207 UNICODE_EXPORT bool istitle(char32_t c) noexcept; |
|
208 |
|
209 /** |
|
210 * Convert to upper case. |
|
211 * |
|
212 * \param c the character |
|
213 * \return the upper case character |
|
214 */ |
|
215 UNICODE_EXPORT char32_t toupper(char32_t c) noexcept; |
|
216 |
|
217 /** |
|
218 * Convert to lower case. |
|
219 * |
|
220 * \param c the character |
|
221 * \return the lower case character |
|
222 */ |
|
223 UNICODE_EXPORT char32_t tolower(char32_t c) noexcept; |
|
224 |
|
225 /** |
|
226 * Convert to title case. |
|
227 * |
|
228 * \param c the character |
|
229 * \return the title case character |
|
230 */ |
|
231 UNICODE_EXPORT char32_t totitle(char32_t c) noexcept; |
|
232 |
|
/**
 * Convert the UTF-32 string to upper case.
 *
 * Each code point of the (copied) argument is replaced by the result of the
 * single character toupper overload.
 *
 * \param str the UTF-32 string
 * \return the upper case string
 */
inline std::u32string toupper(std::u32string str)
{
    for (char32_t& ch : str)
        ch = toupper(ch);

    return str;
}
|
246 |
|
247 /** |
|
248 * Convert the UTF-8 string to upper case. |
|
249 * |
|
250 * \param str the str |
|
251 * \return the upper case string |
|
252 * \warning very slow at the moment |
|
253 */ |
2
|
254 inline std::string toupper(const std::string& str) |
0
|
255 { |
|
256 std::string result; |
|
257 char buffer[5]; |
|
258 |
2
|
259 for_each(str, [&] (char32_t code) { |
0
|
260 encode(toupper(code), buffer); |
|
261 result += buffer; |
|
262 }); |
|
263 |
|
264 return result; |
|
265 } |
|
266 |
|
/**
 * Convert the UTF-32 string to lower case.
 *
 * Each code point of the (copied) argument is replaced by the result of the
 * single character tolower overload.
 *
 * \param str the UTF-32 string
 * \return the lower case string
 */
inline std::u32string tolower(std::u32string str)
{
    for (char32_t& ch : str)
        ch = tolower(ch);

    return str;
}
|
280 |
|
281 /** |
|
282 * Convert the UTF-8 string to lower case. |
|
283 * |
|
284 * \param str the str |
|
285 * \return the lower case string |
|
286 * \warning very slow at the moment |
|
287 */ |
2
|
288 inline std::string tolower(const std::string& str) |
0
|
289 { |
|
290 std::string result; |
|
291 char buffer[5]; |
|
292 |
2
|
293 for_each(str, [&] (char32_t code) { |
0
|
294 encode(tolower(code), buffer); |
|
295 result += buffer; |
|
296 }); |
|
297 |
|
298 return result; |
|
299 } |
|
300 |
|
301 } // !unicode |
|
302 |
|
303 #endif // !UNICODE_HPP |