comparison tools/mkunicode/Unicode-after.cpp @ 352:7fe8d4094983

Utf8: - Fix invalid decoding from UTF-8 to UTF-32 - Add all files
author David Demelier <markand@malikania.fr>
date Wed, 08 Apr 2015 12:33:45 +0200
parents
children b78d6d8f2872
comparison
equal deleted inserted replaced
351:47a206e724f2 352:7fe8d4094983
1 void Unicode::encode(char32_t c, char res[5]) noexcept
2 {
3 switch (nbytesPoint(c)) {
4 case 1:
5 res[0] = c;
6 res[1] = '\0';
7 break;
8 case 2:
9 res[0] = 0xC0 | ((c >> 6) & 0x1F);
10 res[1] = 0x80 | (c & 0x3F);
11 res[2] = '\0';
12 break;
13 case 3:
14 res[0] = 0xE0 | ((c >> 12) & 0xF );
15 res[1] = 0x80 | ((c >> 6) & 0x3F);
16 res[2] = 0x80 | (c & 0x3F);
17 res[3] = '\0';
18 break;
19 case 4:
20 res[0] = 0xF0 | ((c >> 18) & 0x7 );
21 res[1] = 0x80 | ((c >> 12) & 0x3F);
22 res[2] = 0x80 | ((c >> 6) & 0x3F);
23 res[3] = 0x80 | (c & 0x3F);
24 res[4] = '\0';
25 break;
26 default:
27 break;
28 }
29 }
30
31 void Unicode::decode(char32_t &c, const char *res) noexcept
32 {
33 c = 0;
34
35 switch (nbytesUtf8(res[0])) {
36 case 1:
37 c = res[0];
38 break;
39 case 2:
40 c = (res[0] & 0x1f) << 6;
41 c |= (res[1] & 0x3f);
42 break;
43 case 3:
44 c = (res[0] & 0x0f) << 12;
45 c |= (res[1] & 0x3f) << 6;
46 c |= (res[2] & 0x3f);
47 break;
48 case 4:
49 c = (res[0] & 0x07) << 16;
50 c |= (res[1] & 0x3f) << 12;
51 c |= (res[2] & 0x3f) << 6;
52 c |= (res[3] & 0x3f);
53 default:
54 break;
55 }
56 }
57
58 int Unicode::nbytesUtf8(char c) noexcept
59 {
60 if ((c & 0xE0) == 0xC0)
61 return 2;
62 if ((c & 0xF0) == 0xE0)
63 return 3;
64 if ((c & 0xF8) == 0xF0)
65 return 4;
66
67 return 1;
68 }
69
70 int Unicode::nbytesPoint(char32_t c) noexcept
71 {
72 if (c <= 0x7F)
73 return 1;
74 if (c <= 0x7FF)
75 return 2;
76 if (c <= 0xFFFF)
77 return 3;
78 if (c <= 0x1FFFFF)
79 return 4;
80
81 return -1;
82 }
83
84 int Unicode::length(const std::string &str)
85 {
86 int total = 0;
87
88 forEach(str, [&] (char32_t) {
89 ++ total;
90 });
91
92 return total;
93 }
94
95 std::string Unicode::toUtf8(const std::u32string &array)
96 {
97 std::string res;
98
99 for (size_t i = 0; i < array.size(); ++i) {
100 char tmp[5];
101 int size = nbytesPoint(array[i]);
102
103 if (size < 0) {
104 throw std::invalid_argument("invalid sequence");
105 }
106
107 encode(array[i], tmp);
108 res.insert(res.length(), tmp);
109 }
110
111 return res;
112 }
113
114 std::u32string Unicode::toUtf32(const std::string &str)
115 {
116 std::u32string res;
117
118 forEach(str, [&] (char32_t code) {
119 res.push_back(code);
120 });
121
122 return res;
123 }