9#ifndef ADOBE_UNICODE_HPP
10#define ADOBE_UNICODE_HPP
// Tag type carrying a byte count N as a template argument. It has no members;
// the conversion routines in this file take it as a trailing parameter so that
// the right overload is picked for a given code-unit size.
29template <std::
size_t N>
30struct unicode_size_type_ {};
// Maps an iterator type I to the tag unicode_size_type_<sizeof(value_type)>,
// where value_type is taken from std::iterator_traits<I>. The result is
// exposed as the nested typedef `type`.
33struct unicode_size_type {
34 typedef unicode_size_type_<
sizeof(
typename std::iterator_traits<I>::value_type)> type;
// Thresholds for the first byte of a UTF-8 sequence (0x80, 0xC0, 0xE0, 0xF0,
// 0xF8). The 1-byte to_utf32 overload compares the lead byte against these to
// choose how many bytes to decode.
39const unsigned char to_utf32_pivot_1_k(128);
40const unsigned char to_utf32_pivot_2_k(192);
41const unsigned char to_utf32_pivot_3_k(224);
42const unsigned char to_utf32_pivot_4_k(240);
43const unsigned char to_utf32_pivot_5_k(248);
// Code point thresholds (2^7, 2^11, 2^16, 2^21). utf32_to_utf8 compares the
// code point against these to choose how many bytes to emit.
45const std::uint32_t to_utf8_pivot_1_k(1UL << 7);
46const std::uint32_t to_utf8_pivot_2_k(1UL << 11);
47const std::uint32_t to_utf8_pivot_3_k(1UL << 16);
48const std::uint32_t to_utf8_pivot_4_k(1UL << 21);
// Compile-time table of UTF-8 byte masks indexed by NumBytes. The primary
// template is empty; each specialization provides a `value` that
// utf8_add_mask ORs onto a byte and utf8_strip_mask clears from a byte.
52template <std::
size_t NumBytes>
53struct utf8_header_t {};
// Index 0 holds the mask used when the caller's Header flag is false:
// 0x80, i.e. the 10xxxxxx continuation-byte marker.
56struct utf8_header_t<0> {
57 static const char value =
'\x80';
// Lead byte mask for a 2-byte sequence (110xxxxx).
61struct utf8_header_t<2> {
62 static const char value =
'\xC0';
// Lead byte mask for a 3-byte sequence (1110xxxx).
65struct utf8_header_t<3> {
66 static const char value =
'\xE0';
// Lead byte mask for a 4-byte sequence (11110xxx).
69struct utf8_header_t<4> {
70 static const char value =
'\xF0';
// Narrows `code` to char and ORs the compile-time constant Mask into it.
77template <
char Mask,
typename BinaryInteger>
78inline char add_mask(BinaryInteger code) {
79 return static_cast<char>(
static_cast<char>(code) | Mask);
// ORs a UTF-8 marker onto `code`: utf8_header_t<NumBytes>::value when Header
// is true, otherwise utf8_header_t<0>::value.
82template <std::
size_t NumBytes,
bool Header,
typename BinaryInteger>
83inline char utf8_add_mask(BinaryInteger code) {
84 return add_mask<utf8_header_t<Header ? NumBytes : 0>::value>(code);
// Non-template wrapper around utf8_add_mask<0, false>, which applies the
// utf8_header_t<0> mask to `code`.
89inline char utf8_add_mask_0_false(
char code) {
return utf8_add_mask<0, false>(code); }
// Clears from `code` every bit that is set in the compile-time constant Mask
// (code & ~Mask) and narrows the result to char.
93template <
char Mask,
typename BinaryInteger>
94inline char strip_mask(BinaryInteger code) {
95 return static_cast<char>(code & ~Mask);
// Clears a UTF-8 marker from `code`: the bits of utf8_header_t<NumBytes>::value
// when Header is true, otherwise the bits of utf8_header_t<0>::value.
98template <std::
size_t NumBytes,
bool Header,
typename BinaryInteger>
99inline char utf8_strip_mask(BinaryInteger code) {
100 return strip_mask<utf8_header_t<Header ? NumBytes : 0>::value>(code);
// Moves a stripped UTF-8 byte into its place in a 32-bit value. Position
// counts from the end of the sequence, so the fragment is shifted left by
// (Position - 1) * 6 bits.
105template <std::
size_t Position>
106inline std::uint32_t promote_fragment(
char fragment) {
107 return std::uint32_t(fragment << ((Position - 1) * 6));
// Position 1 is the last byte of the sequence; no shift is applied.
111inline std::uint32_t promote_fragment<1>(
char fragment) {
112 return std::uint32_t(fragment);
// Position 0 is declared only; no body is given here.
116inline std::uint32_t promote_fragment<0>(
char);
// Extracts the 6-bit group for one UTF-8 byte out of a 32-bit value. Position
// counts from the end of the sequence, so the value is shifted right by
// (Position - 1) * 6 bits and the low 6 bits are kept.
120template <std::
size_t Position>
121inline char demote_fragment(std::uint32_t fragment) {
122 return char((fragment >> ((Position - 1) * 6)) & 0x0000003F);
// Position 1 is the last byte of the sequence; only the 6-bit mask is applied.
126inline char demote_fragment<1>(std::uint32_t fragment) {
127 return char(fragment & 0x0000003F);
// Position 0 is declared only; no body is given here.
131inline char demote_fragment<0>(std::uint32_t);
// Non-template wrapper around demote_fragment<1>.
134inline char demote_fragment_1(std::uint32_t fragment) {
return demote_fragment<1>(fragment); }
// Recursive functor that writes the UTF-8 bytes of `code` through `out`.
//   T         - type each produced byte is cast to before being written
//   ByteCount - position (counted from the end) of the byte written by this step
//   Header    - true for the lead byte, false for the bytes that follow
139template <
typename T, std::
size_t ByteCount,
bool Header = true>
140struct demotion_engine_t {
141 template <
typename OutputIterator>
142 inline OutputIterator operator()(std::uint32_t code, OutputIterator out) {
// Take the 6-bit group for this position, OR on the lead or continuation
// marker, and write it.
143 *out =
static_cast<T
>(utf8_add_mask<ByteCount, Header>(demote_fragment<ByteCount>(code)));
// Hand the remaining bytes to the next engine, which no longer writes a lead byte.
147 return demotion_engine_t<T, ByteCount - 1,
false>()(code, out);
// End of the recursion: writes the final byte from the low 6 bits of `code`
// with the utf8_header_t<0> marker.
153struct demotion_engine_t<T, 1, false> {
154 template <
typename OutputIterator>
155 inline OutputIterator operator()(std::uint32_t code, OutputIterator out) {
156 *out =
static_cast<T
>(utf8_add_mask_0_false(demote_fragment_1(code)));
// Recursive functor that assembles a 32-bit value from UTF-8 bytes read
// through `first`.
//   ByteCount - position (counted from the end) of the byte read by this step
//   Header    - true for the lead byte, false for the bytes that follow
// `first` is taken by reference; `last` is forwarded to the next step.
164template <std::
size_t ByteCount,
bool Header = true>
165struct promotion_engine_t {
166 template <
typename InputIterator>
167 inline std::uint32_t operator()(InputIterator& first, InputIterator last) {
// Read the current byte, clear its marker bits, and shift it into place.
173 char n =
static_cast<char>(*first);
174 char stripped(utf8_strip_mask<ByteCount, Header>(n));
175 std::uint32_t shifted(promote_fragment<ByteCount>(stripped));
180 ADOBE_ASSERT(
false &&
"unicode: UTF-8 to UTF-32 conversion ran out of input");
// OR this byte's contribution with the result of the remaining bytes.
184 return shifted | promotion_engine_t<ByteCount - 1,
false>()(first, last);
// End of the recursion: clears the utf8_header_t<0> marker from the final
// byte and uses it unshifted.
189struct promotion_engine_t<1, false> {
190 template <
typename InputIterator>
191 inline std::uint32_t operator()(InputIterator& first, InputIterator) {
192 std::uint32_t result(promote_fragment<1>(utf8_strip_mask<0, false>(*first)));
// to_utf32 overload for 2-byte code units (selected by unicode_size_type_<2>).
// Reads a 16-bit unit from `first` and stores the decoded value in `result`,
// cast to T. Malformed surrogate sequences are reported through ADOBE_ASSERT.
202template <
typename InputIterator,
typename T>
203InputIterator to_utf32(InputIterator first, InputIterator last, T& result, unicode_size_type_<2>) {
204 std::uint16_t code =
static_cast<std::uint16_t
>(*first);
// Branch that stores the unit unchanged.
208 result =
static_cast<T
>(code);
209 }
else if (code < 0xDC00) {
211 ADOBE_ASSERT(
false &&
"unicode: UTF-16 lead surrogate found without trail surrogate");
// Read the second unit of the pair.
215 std::uint16_t trail =
static_cast<std::uint16_t
>(*first);
219 "unicode: UTF-16 lead surrogate found without trail surrogate");
// Combine both units: 10 bits from `code`, 10 bits from `trail`, plus 0x10000.
221 result =
static_cast<T
>(((code - 0xD800) << 10) + (trail - 0xDC00) + 0x10000);
224 "unicode: UTF-16 trail surrogate found without lead surrogate");
225 result =
static_cast<T
>(code);
// to_utf32 overload for 1-byte code units (selected by unicode_size_type_<1>).
// Inspects the first byte to decide the sequence length, decodes that many
// bytes with promotion_engine_t, and stores the value in `result`, cast to T.
// Ill-formed input is reported through ADOBE_ASSERT.
233template <
typename InputIterator,
typename T>
234InputIterator to_utf32(InputIterator first, InputIterator last, T& result, unicode_size_type_<1>) {
235 unsigned char n(
static_cast<unsigned char>(*first));
// Below 0x80: the byte is the value itself.
237 if (n < to_utf32_pivot_1_k) {
238 result =
static_cast<T
>(n);
240 }
else if (n < to_utf32_pivot_3_k) {
242 "unicode: ill-defined UTF-8 (first byte is 10xxxxxx)");
// Below 0xE0: decode as a 2-byte sequence.
243 result =
static_cast<T
>(promotion_engine_t<2>()(first, last));
244 }
else if (n < to_utf32_pivot_4_k) {
// Below 0xF0: decode as a 3-byte sequence.
245 result =
static_cast<T
>(promotion_engine_t<3>()(first, last));
246 }
else if (n < to_utf32_pivot_5_k) {
// Below 0xF8: decode as a 4-byte sequence.
247 result =
static_cast<T
>(promotion_engine_t<4>()(first, last));
249 ADOBE_ASSERT(
false &&
"unicode: ill-defined UTF-8 (first byte is 11111xxx)");
// The decoded value must not exceed 0x10FFFF and must not fall in
// 0xD800..0xDFFF.
252 ADOBE_ASSERT(!(result > 0x0010FFFF) &&
"unicode: ill-defined UTF-8 (code point out of range)");
254 ADOBE_ASSERT(!(0x0000D800 <= result && result <= 0x0000DFFF) &&
255 "unicode: ill-defined UTF-8 (surrogate code point)");
// to_utf32 overload for 4-byte code units (selected by unicode_size_type_<4>).
// No decoding is needed: the unit at `first` is stored in `result`, cast to T.
// The end iterator parameter is unnamed and unused.
262template <
typename InputIterator,
typename T>
263InputIterator to_utf32(InputIterator first, InputIterator, T& result, unicode_size_type_<4>) {
264 result =
static_cast<T
>(*first);
// Encodes one code point as UTF-8 and writes the bytes, cast to T, through
// `output`. The byte count is chosen by comparing `code` against the
// to_utf8_pivot_*_k thresholds; a value at or above the last one is reported
// through ADOBE_ASSERT.
276template <
typename T,
typename O>
277O utf32_to_utf8(std::uint32_t code, O output) {
// Below 2^7: written as a single byte with no marker bits.
278 if (code < to_utf8_pivot_1_k)
280 *output =
static_cast<T
>(code);
282 }
else if (code < to_utf8_pivot_2_k)
283 output = demotion_engine_t<T, 2>()(code, output);
284 else if (code < to_utf8_pivot_3_k)
285 output = demotion_engine_t<T, 3>()(code, output);
286 else if (code < to_utf8_pivot_4_k)
287 output = demotion_engine_t<T, 4>()(code, output);
289 ADOBE_ASSERT(
false &&
"unicode: invalid code point (out of range)");
// Encodes one code point as UTF-16 and writes the unit(s), cast to T, through
// `output`.
305O utf32_to_utf16(N code, O output) {
// Below 0x10000: the value is written as a single unit.
306 if (code < 0x10000) {
307 *output =
static_cast<T
>(code);
// Otherwise subtract 0x10000 and split into two units: the upper bits
// offset by 0xD800, then the low 10 bits offset by 0xDC00.
309 *output =
static_cast<T
>(((code - 0x10000) >> 10) + 0xD800);
313 *output =
static_cast<T
>(((code - 0x10000) & 0x03FF) + 0xDC00);
// to_utf8 overload for a source of 1-byte code units: the range is copied to
// `output` unchanged with std::copy.
326template <
typename T,
typename I,
329O to_utf8(I first, I last, O output, unicode_size_type_<1>) {
330 return std::copy(first, last, output);
// to_utf8 overload for a source of 2-byte code units: each iteration decodes
// one value with to_utf32 into `tmp`, then re-encodes it with utf32_to_utf8<T>.
340template <
typename T,
typename I,
343O to_utf8(I first, I last, O output, unicode_size_type_<2>) {
344 while (first != last) {
347 first = to_utf32(first, last, tmp, unicode_size_type_<2>());
349 output = utf32_to_utf8<T>(tmp, output);
// to_utf8 overload for a source of 4-byte code units: each unit is cast to
// std::uint32_t and encoded directly with utf32_to_utf8<T>.
362template <
typename T,
typename I,
365O to_utf8(I first, I last, O output, unicode_size_type_<4>) {
366 while (first != last) {
367 output = utf32_to_utf8<T>(
static_cast<std::uint32_t
>(*first), output);
// to_utf16 overload for a source of 1-byte code units: each iteration decodes
// one value with to_utf32 into `result`, then re-encodes it with
// utf32_to_utf16<T>.
380template <
typename T,
typename I,
383O to_utf16(I first, I last, O output, unicode_size_type_<1>) {
384 while (first != last) {
385 std::uint32_t result;
387 first = to_utf32(first, last, result, unicode_size_type_<1>());
389 output = utf32_to_utf16<T>(result, output);
// to_utf16 overload for a source of 2-byte code units: the range is copied to
// `output` unchanged with std::copy.
401template <
typename T,
typename I,
404O to_utf16(I first, I last, O output, unicode_size_type_<2>) {
405 return std::copy(first, last, output);
// to_utf16 overload for a source of 4-byte code units: each unit is passed
// directly to utf32_to_utf16<T>.
414template <
typename T,
typename I,
417O to_utf16(I first, I last, O output, unicode_size_type_<4>) {
418 while (first != last) {
419 output = utf32_to_utf16<T>(*first, output);
// to_utf_ overload for a 1-byte destination: forwards to to_utf8<T>, passing
// the size tag derived from the input iterator's value type.
428template <
typename T,
typename I,
typename O>
429O to_utf_(I f, I l, O o, unicode_size_type_<1>) {
430 return to_utf8<T>(f, l, o,
typename unicode_size_type<I>::type());
// to_utf_ overload for a 2-byte destination: forwards to to_utf16<T>, passing
// the size tag derived from the input iterator's value type.
433template <
typename T,
typename I,
typename O>
434O to_utf_(I f, I l, O o, unicode_size_type_<2>) {
435 return to_utf16<T>(f, l, o,
typename unicode_size_type<I>::type());
// to_utf_ overload for a 4-byte destination: decodes from the input with
// to_utf32 into `result`, using the size tag derived from the input
// iterator's value type.
438template <
typename T,
typename I,
typename O>
439O to_utf_(I f, I l, O o, unicode_size_type_<4>) {
443 f = to_utf32(f, l, result,
typename unicode_size_type<I>::type());
// Compile-time table indexed by a pair of integers, with specializations for
// every combination of 8, 16 and 32.
// NOTE(review): presumably A is the source encoding width in bits, B the
// destination width, and `value` the largest number of destination code units
// one source code unit can produce — confirm against the callers.
453template <
int A,
int B>
457struct expand_utf_t<8, 8> {
458 static const int value = 1;
461struct expand_utf_t<8, 16> {
462 static const int value = 1;
465struct expand_utf_t<8, 32> {
466 static const int value = 1;
470struct expand_utf_t<16, 8> {
471 static const int value = 3;
474struct expand_utf_t<16, 16> {
475 static const int value = 1;
478struct expand_utf_t<16, 32> {
479 static const int value = 1;
483struct expand_utf_t<32, 8> {
484 static const int value = 4;
487struct expand_utf_t<32, 16> {
488 static const int value = 2;
491struct expand_utf_t<32, 32> {
492 static const int value = 1;
// Exposes detail::expand_utf_t<...>::value for the type pair (T, U) as a
// std::integral_constant of type std::size_t.
// NOTE(review): the lookup passes sizeof(T) and sizeof(U), which are byte
// counts, while the expand_utf_t specializations visible in this file are
// keyed on 8, 16 and 32. Confirm that the intended specialization is selected.
519template <
typename T,
typename U>
521 : std::integral_constant<std::size_t, detail::expand_utf_t<sizeof(T), sizeof(U)>::value> {};
// Public entry point that picks the conversion by destination type: it builds
// the size tag from sizeof(T) and forwards the range [f, l) and output `o` to
// the matching detail::to_utf_<T> overload.
548template <
typename T,
typename I,
typename O>
550 return detail::to_utf_<T>(f, l, o, detail::unicode_size_type_<
sizeof(T)>());
561template <
typename T,
typename R,
typename O>
O copy_utf(I f, I l, O o)