352
|
1 void Unicode::encode(char32_t c, char res[5]) noexcept |
|
2 { |
|
3 switch (nbytesPoint(c)) { |
|
4 case 1: |
|
5 res[0] = c; |
|
6 res[1] = '\0'; |
|
7 break; |
|
8 case 2: |
|
9 res[0] = 0xC0 | ((c >> 6) & 0x1F); |
|
10 res[1] = 0x80 | (c & 0x3F); |
|
11 res[2] = '\0'; |
|
12 break; |
|
13 case 3: |
|
14 res[0] = 0xE0 | ((c >> 12) & 0xF ); |
|
15 res[1] = 0x80 | ((c >> 6) & 0x3F); |
|
16 res[2] = 0x80 | (c & 0x3F); |
|
17 res[3] = '\0'; |
|
18 break; |
|
19 case 4: |
|
20 res[0] = 0xF0 | ((c >> 18) & 0x7 ); |
|
21 res[1] = 0x80 | ((c >> 12) & 0x3F); |
|
22 res[2] = 0x80 | ((c >> 6) & 0x3F); |
|
23 res[3] = 0x80 | (c & 0x3F); |
|
24 res[4] = '\0'; |
|
25 break; |
|
26 default: |
|
27 break; |
|
28 } |
|
29 } |
|
30 |
|
31 void Unicode::decode(char32_t &c, const char *res) noexcept |
|
32 { |
|
33 c = 0; |
|
34 |
|
35 switch (nbytesUtf8(res[0])) { |
|
36 case 1: |
|
37 c = res[0]; |
|
38 break; |
|
39 case 2: |
|
40 c = (res[0] & 0x1f) << 6; |
|
41 c |= (res[1] & 0x3f); |
|
42 break; |
|
43 case 3: |
|
44 c = (res[0] & 0x0f) << 12; |
|
45 c |= (res[1] & 0x3f) << 6; |
|
46 c |= (res[2] & 0x3f); |
|
47 break; |
|
48 case 4: |
|
49 c = (res[0] & 0x07) << 16; |
|
50 c |= (res[1] & 0x3f) << 12; |
|
51 c |= (res[2] & 0x3f) << 6; |
|
52 c |= (res[3] & 0x3f); |
|
53 default: |
|
54 break; |
|
55 } |
|
56 } |
|
57 |
|
58 int Unicode::nbytesUtf8(char c) noexcept |
|
59 { |
|
60 if ((c & 0xE0) == 0xC0) |
|
61 return 2; |
|
62 if ((c & 0xF0) == 0xE0) |
|
63 return 3; |
|
64 if ((c & 0xF8) == 0xF0) |
|
65 return 4; |
|
66 |
|
67 return 1; |
|
68 } |
|
69 |
|
70 int Unicode::nbytesPoint(char32_t c) noexcept |
|
71 { |
|
72 if (c <= 0x7F) |
|
73 return 1; |
|
74 if (c <= 0x7FF) |
|
75 return 2; |
|
76 if (c <= 0xFFFF) |
|
77 return 3; |
|
78 if (c <= 0x1FFFFF) |
|
79 return 4; |
|
80 |
|
81 return -1; |
|
82 } |
|
83 |
|
84 int Unicode::length(const std::string &str) |
|
85 { |
|
86 int total = 0; |
|
87 |
|
88 forEach(str, [&] (char32_t) { |
|
89 ++ total; |
|
90 }); |
|
91 |
|
92 return total; |
|
93 } |
|
94 |
|
95 std::string Unicode::toUtf8(const std::u32string &array) |
|
96 { |
|
97 std::string res; |
|
98 |
|
99 for (size_t i = 0; i < array.size(); ++i) { |
|
100 char tmp[5]; |
|
101 int size = nbytesPoint(array[i]); |
|
102 |
|
103 if (size < 0) { |
|
104 throw std::invalid_argument("invalid sequence"); |
|
105 } |
|
106 |
|
107 encode(array[i], tmp); |
|
108 res.insert(res.length(), tmp); |
|
109 } |
|
110 |
|
111 return res; |
|
112 } |
|
113 |
|
114 std::u32string Unicode::toUtf32(const std::string &str) |
|
115 { |
|
116 std::u32string res; |
|
117 |
|
118 forEach(str, [&] (char32_t code) { |
|
119 res.push_back(code); |
|
120 }); |
|
121 |
|
122 return res; |
|
123 } |