Adobe Source Libraries 2.0.0
A collection of C++ libraries.
Loading...
Searching...
No Matches
unicode.hpp
Go to the documentation of this file.
1/*
2 Copyright 2012 Adobe Systems Incorporated
3 Distributed under the Boost Software License - Version 1.0 (see the accompanying file LICENSE
4 or a copy at https://stlab.github.io/adobe_source_libraries/licenses.html)
5*/
6
7/**************************************************************************************************/
8
9#ifndef ADOBE_UNICODE_HPP
10#define ADOBE_UNICODE_HPP
11
12/**************************************************************************************************/
13
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <type_traits>

#include <adobe/cassert.hpp>
18
19/**************************************************************************************************/
20
21namespace adobe {
22
23/**************************************************************************************************/
24
25namespace detail {
26
27/**************************************************************************************************/
28
29template <std::size_t N>
30struct unicode_size_type_ {};
31
32template <typename I> // I models InputIterator
33struct unicode_size_type {
34 typedef unicode_size_type_<sizeof(typename std::iterator_traits<I>::value_type)> type;
35};
36
37/**************************************************************************************************/
38
39const unsigned char to_utf32_pivot_1_k(128);
40const unsigned char to_utf32_pivot_2_k(192);
41const unsigned char to_utf32_pivot_3_k(224);
42const unsigned char to_utf32_pivot_4_k(240);
43const unsigned char to_utf32_pivot_5_k(248);
44
45const std::uint32_t to_utf8_pivot_1_k(1UL << 7);
46const std::uint32_t to_utf8_pivot_2_k(1UL << 11);
47const std::uint32_t to_utf8_pivot_3_k(1UL << 16);
48const std::uint32_t to_utf8_pivot_4_k(1UL << 21);
49
50/**************************************************************************************************/
51
52template <std::size_t NumBytes>
53struct utf8_header_t {};
54
55template <>
56struct utf8_header_t<0> {
57 static const char value = '\x80';
58}; // nonheader
59// template <> struct utf8_header_t<1> { static const char value = '\x00'; }; // illegal
60template <>
61struct utf8_header_t<2> {
62 static const char value = '\xC0';
63};
64template <>
65struct utf8_header_t<3> {
66 static const char value = '\xE0';
67};
68template <>
69struct utf8_header_t<4> {
70 static const char value = '\xF0';
71};
72// template <> struct utf8_header_t<5> { static const char value = '\xF8'; }; // illegal
73// template <> struct utf8_header_t<6> { static const char value = '\xFC'; }; // illegal
74
75/**************************************************************************************************/
76
77template <char Mask, typename BinaryInteger>
78inline char add_mask(BinaryInteger code) {
79 return static_cast<char>(static_cast<char>(code) | Mask);
80}
81
82template <std::size_t NumBytes, bool Header, typename BinaryInteger>
83inline char utf8_add_mask(BinaryInteger code) {
84 return add_mask<utf8_header_t<Header ? NumBytes : 0>::value>(code);
85}
86
87
88// MM concept gcc-4.1.1 workaround
89inline char utf8_add_mask_0_false(char code) { return utf8_add_mask<0, false>(code); }
90
91/**************************************************************************************************/
92
93template <char Mask, typename BinaryInteger>
94inline char strip_mask(BinaryInteger code) {
95 return static_cast<char>(code & ~Mask);
96}
97
98template <std::size_t NumBytes, bool Header, typename BinaryInteger>
99inline char utf8_strip_mask(BinaryInteger code) {
100 return strip_mask<utf8_header_t<Header ? NumBytes : 0>::value>(code);
101}
102
103/**************************************************************************************************/
104
105template <std::size_t Position>
106inline std::uint32_t promote_fragment(char fragment) {
107 return std::uint32_t(fragment << ((Position - 1) * 6));
108}
109
110template <>
111inline std::uint32_t promote_fragment<1>(char fragment) {
112 return std::uint32_t(fragment);
113}
114
115template <>
116inline std::uint32_t promote_fragment<0>(char); // unimplemented
117
118/**************************************************************************************************/
119
120template <std::size_t Position>
121inline char demote_fragment(std::uint32_t fragment) {
122 return char((fragment >> ((Position - 1) * 6)) & 0x0000003F);
123}
124
125template <>
126inline char demote_fragment<1>(std::uint32_t fragment) {
127 return char(fragment & 0x0000003F);
128}
129
130template <>
131inline char demote_fragment<0>(std::uint32_t); // unimplemented
132
133// MM concept gcc-4.1.1 workaround
134inline char demote_fragment_1(std::uint32_t fragment) { return demote_fragment<1>(fragment); }
135
136
137/**************************************************************************************************/
138
139template <typename T, std::size_t ByteCount, bool Header = true>
140struct demotion_engine_t {
141 template <typename OutputIterator>
142 inline OutputIterator operator()(std::uint32_t code, OutputIterator out) {
143 *out = static_cast<T>(utf8_add_mask<ByteCount, Header>(demote_fragment<ByteCount>(code)));
144
145 ++out;
146
147 return demotion_engine_t<T, ByteCount - 1, false>()(code, out);
148 }
149};
150
151
152template <typename T>
153struct demotion_engine_t<T, 1, false> {
154 template <typename OutputIterator>
155 inline OutputIterator operator()(std::uint32_t code, OutputIterator out) {
156 *out = static_cast<T>(utf8_add_mask_0_false(demote_fragment_1(code)));
157
158 return ++out;
159 }
160};
161
162/**************************************************************************************************/
163
164template <std::size_t ByteCount, bool Header = true>
165struct promotion_engine_t {
166 template <typename InputIterator>
167 inline std::uint32_t operator()(InputIterator& first, InputIterator last) {
168 /*
169 CodeWarrior 9.4 doesn't like this code composited into one line;
170 GCC doesn't seem to have a problem.
171 */
172
173 char n = static_cast<char>(*first);
174 char stripped(utf8_strip_mask<ByteCount, Header>(n));
175 std::uint32_t shifted(promote_fragment<ByteCount>(stripped));
176
177 ++first;
178
179 if (first == last) {
180 ADOBE_ASSERT(false && "unicode: UTF-8 to UTF-32 conversion ran out of input");
181 return 0;
182 }
183
184 return shifted | promotion_engine_t<ByteCount - 1, false>()(first, last);
185 }
186};
187
188template <>
189struct promotion_engine_t<1, false> {
190 template <typename InputIterator>
191 inline std::uint32_t operator()(InputIterator& first, InputIterator) {
192 std::uint32_t result(promote_fragment<1>(utf8_strip_mask<0, false>(*first)));
193
194 ++first;
195
196 return result;
197 }
198};
199
200/**************************************************************************************************/
201
202template <typename InputIterator, typename T>
203InputIterator to_utf32(InputIterator first, InputIterator last, T& result, unicode_size_type_<2>) {
204 std::uint16_t code = static_cast<std::uint16_t>(*first);
205 ++first;
206
207 if (code < 0xD800) {
208 result = static_cast<T>(code);
209 } else if (code < 0xDC00) {
210 if (first == last) {
211 ADOBE_ASSERT(false && "unicode: UTF-16 lead surrogate found without trail surrogate");
212 return first;
213 }
214
215 std::uint16_t trail = static_cast<std::uint16_t>(*first);
216 ++first;
217
218 ADOBE_ASSERT((0xDC00 <= trail && trail <= 0xDFFF) &&
219 "unicode: UTF-16 lead surrogate found without trail surrogate");
220
221 result = static_cast<T>(((code - 0xD800) << 10) + (trail - 0xDC00) + 0x10000);
222 } else {
223 ADOBE_ASSERT(!(code < 0xE000) &&
224 "unicode: UTF-16 trail surrogate found without lead surrogate");
225 result = static_cast<T>(code);
226 }
227
228 return first;
229}
230
231/**************************************************************************************************/
232
233template <typename InputIterator, typename T>
234InputIterator to_utf32(InputIterator first, InputIterator last, T& result, unicode_size_type_<1>) {
235 unsigned char n(static_cast<unsigned char>(*first));
236
237 if (n < to_utf32_pivot_1_k) {
238 result = static_cast<T>(n);
239 ++first;
240 } else if (n < to_utf32_pivot_3_k) {
241 ADOBE_ASSERT(!(n < to_utf32_pivot_2_k) &&
242 "unicode: ill-defined UTF-8 (first byte is 10xxxxxx)");
243 result = static_cast<T>(promotion_engine_t<2>()(first, last));
244 } else if (n < to_utf32_pivot_4_k) {
245 result = static_cast<T>(promotion_engine_t<3>()(first, last));
246 } else if (n < to_utf32_pivot_5_k) {
247 result = static_cast<T>(promotion_engine_t<4>()(first, last));
248 } else {
249 ADOBE_ASSERT(false && "unicode: ill-defined UTF-8 (first byte is 11111xxx)");
250 }
251
252 ADOBE_ASSERT(!(result > 0x0010FFFF) && "unicode: ill-defined UTF-8 (code point out of range)");
253
254 ADOBE_ASSERT(!(0x0000D800 <= result && result <= 0x0000DFFF) &&
255 "unicode: ill-defined UTF-8 (surrogate code point)");
256
257 return first;
258}
259
260/**************************************************************************************************/
261
262template <typename InputIterator, typename T>
263InputIterator to_utf32(InputIterator first, InputIterator, T& result, unicode_size_type_<4>) {
264 result = static_cast<T>(*first);
265
266 return ++first;
267}
268
269/**************************************************************************************************/
270/*
271 utf32 -> utf8
272 - 1 source value
273 - n output values
274*/
275
276template <typename T, typename O> // O models OutputIterator
277O utf32_to_utf8(std::uint32_t code, O output) {
278 if (code < to_utf8_pivot_1_k) // UTF-8 is 1 byte long
279 {
280 *output = static_cast<T>(code);
281 ++output;
282 } else if (code < to_utf8_pivot_2_k) // UTF-8 is 2 bytes long
283 output = demotion_engine_t<T, 2>()(code, output);
284 else if (code < to_utf8_pivot_3_k) // UTF-8 is 3 bytes long
285 output = demotion_engine_t<T, 3>()(code, output);
286 else if (code < to_utf8_pivot_4_k) // UTF-8 is 4 bytes long
287 output = demotion_engine_t<T, 4>()(code, output);
288 else
289 ADOBE_ASSERT(false && "unicode: invalid code point (out of range)");
290
291 return output;
292}
293
294/**************************************************************************************************/
295/*
296 utf32 -> utf16
297 - 1 source value
298 - n output values
299*/
300
301template <typename T, // output type for O
302 typename N, // models Integer; sizeof(T) must equal 4; code must be valid utf32
303 typename O>
304// models OutputIterator
305O utf32_to_utf16(N code, O output) {
306 if (code < 0x10000) {
307 *output = static_cast<T>(code);
308 } else {
309 *output = static_cast<T>(((code - 0x10000) >> 10) + 0xD800);
310
311 ++output;
312
313 *output = static_cast<T>(((code - 0x10000) & 0x03FF) + 0xDC00);
314 }
315
316 return ++output;
317}
318
319/**************************************************************************************************/
320/*
321 utf8 -> utf8
322 - n source values
323 - m output values
324*/
325
326template <typename T, typename I, // I models InputIterator
327 typename O>
328// O models OutputIterator
329O to_utf8(I first, I last, O output, unicode_size_type_<1>) {
330 return std::copy(first, last, output);
331}
332
333/**************************************************************************************************/
334/*
335 utf16 -> utf8
336 - n source values
337 - m output values
338*/
339
340template <typename T, typename I, // I models InputIterator
341 typename O>
342// O models OutputIterator
343O to_utf8(I first, I last, O output, unicode_size_type_<2>) {
344 while (first != last) {
345 std::uint32_t tmp;
346
347 first = to_utf32(first, last, tmp, unicode_size_type_<2>());
348
349 output = utf32_to_utf8<T>(tmp, output);
350 }
351
352 return output;
353}
354
355/**************************************************************************************************/
356/*
357 utf32 -> utf8
358 - n source values
359 - m output values
360*/
361
362template <typename T, typename I, // I models InputIterator
363 typename O>
364// O models OutputIterator
365O to_utf8(I first, I last, O output, unicode_size_type_<4>) {
366 while (first != last) {
367 output = utf32_to_utf8<T>(static_cast<std::uint32_t>(*first), output);
368 ++first;
369 }
370
371 return output;
372}
373
374/**************************************************************************************************/
375/*
376 utf8 -> utf16
377 - n source values
378 - m output values
379*/
380template <typename T, typename I, // I models InputIterator
381 typename O>
382// O models OutputIterator
383O to_utf16(I first, I last, O output, unicode_size_type_<1>) {
384 while (first != last) {
385 std::uint32_t result;
386
387 first = to_utf32(first, last, result, unicode_size_type_<1>());
388
389 output = utf32_to_utf16<T>(result, output);
390 }
391
392 return output;
393}
394
395/**************************************************************************************************/
396/*
397 utf16 -> utf16
398 - n source values
399 - n output values
400*/
401template <typename T, typename I, // I models InputIterator
402 typename O>
403// O models OutputIterator
404O to_utf16(I first, I last, O output, unicode_size_type_<2>) {
405 return std::copy(first, last, output);
406}
407
408/**************************************************************************************************/
409/*
410 utf32 -> utf16
411 - n source values
412 - m output values
413*/
414template <typename T, typename I, // I models InputIterator
415 typename O>
416// O models OutputIterator
417O to_utf16(I first, I last, O output, unicode_size_type_<4>) {
418 while (first != last) {
419 output = utf32_to_utf16<T>(*first, output);
420 ++first;
421 }
422
423 return output;
424}
425
426/**************************************************************************************************/
427
428template <typename T, typename I, typename O>
429O to_utf_(I f, I l, O o, unicode_size_type_<1>) {
430 return to_utf8<T>(f, l, o, typename unicode_size_type<I>::type());
431}
432
433template <typename T, typename I, typename O>
434O to_utf_(I f, I l, O o, unicode_size_type_<2>) {
435 return to_utf16<T>(f, l, o, typename unicode_size_type<I>::type());
436}
437
438template <typename T, typename I, typename O>
439O to_utf_(I f, I l, O o, unicode_size_type_<4>) {
440 T result;
441
442 while (f != l) {
443 f = to_utf32(f, l, result, typename unicode_size_type<I>::type());
444
445 *o++ = result;
446 }
447
448 return o;
449}
450
451/**************************************************************************************************/
452
453template <int A, int B>
454struct expand_utf_t;
455
456template <>
457struct expand_utf_t<8, 8> {
458 static const int value = 1;
459};
460template <>
461struct expand_utf_t<8, 16> {
462 static const int value = 1;
463};
464template <>
465struct expand_utf_t<8, 32> {
466 static const int value = 1;
467};
468
469template <>
470struct expand_utf_t<16, 8> {
471 static const int value = 3;
472};
473template <>
474struct expand_utf_t<16, 16> {
475 static const int value = 1;
476};
477template <>
478struct expand_utf_t<16, 32> {
479 static const int value = 1;
480};
481
482template <>
483struct expand_utf_t<32, 8> {
484 static const int value = 4;
485};
486template <>
487struct expand_utf_t<32, 16> {
488 static const int value = 2;
489};
490template <>
491struct expand_utf_t<32, 32> {
492 static const int value = 1;
493};
494
495/**************************************************************************************************/
496
497} // namespace detail
498
499/**************************************************************************************************/
500
518
519template <typename T, typename U>
521 : std::integral_constant<std::size_t, detail::expand_utf_t<sizeof(T), sizeof(U)>::value> {};
522
523/**************************************************************************************************/
524
547
548template <typename T, typename I, typename O>
549O copy_utf(I f, I l, O o) {
550 return detail::to_utf_<T>(f, l, o, detail::unicode_size_type_<sizeof(T)>());
551}
552
560
561template <typename T, typename R, typename O>
562O copy_utf(const R& r, O o) {
563 return copy_utf<T>(std::begin(r), std::end(r), o);
564}
565
566/**************************************************************************************************/
567
568} // namespace adobe
569
570/**************************************************************************************************/
571
572#endif
573
574/**************************************************************************************************/
#define ADOBE_ASSERT(p)
Definition cassert.hpp:32
O copy_utf(I f, I l, O o)
Definition unicode.hpp:549