9 #ifndef ADOBE_UNICODE_HPP
10 #define ADOBE_UNICODE_HPP
16 #include <boost/cstdint.hpp>
18 #include <adobe/cassert.hpp>
// Empty tag type parameterized on a code-unit width in bytes. It carries no
// data; it exists so overloads can be selected on N at compile time.
template <std::size_t N>
struct unicode_size_type_ {};

// Maps an iterator type I to the tag whose N is sizeof(I's value_type), as
// reported by std::iterator_traits. The result is exposed as the nested `type`.
template <typename I>
struct unicode_size_type {
    typedef unicode_size_type_<sizeof(typename std::iterator_traits<I>::value_type)> type;
};
// Thresholds compared against the first byte of a UTF-8 sequence. Written in
// hex so the leading bit pattern of each boundary is visible.
const unsigned char to_utf32_pivot_1_k = 0x80; // 1000 0000
const unsigned char to_utf32_pivot_2_k = 0xC0; // 1100 0000
const unsigned char to_utf32_pivot_3_k = 0xE0; // 1110 0000
const unsigned char to_utf32_pivot_4_k = 0xF0; // 1111 0000
const unsigned char to_utf32_pivot_5_k = 0xF8; // 1111 1000
46 const boost::uint32_t to_utf8_pivot_1_k(1UL << 7);
47 const boost::uint32_t to_utf8_pivot_2_k(1UL << 11);
48 const boost::uint32_t to_utf8_pivot_3_k(1UL << 16);
49 const boost::uint32_t to_utf8_pivot_4_k(1UL << 21);
// Compile-time table of UTF-8 marker bits, indexed by NumBytes.
// The primary template is intentionally empty: only the explicit
// specializations below provide a `value`, so any other index fails to
// compile when `value` is requested.
template <std::size_t NumBytes>
struct utf8_header_t {};

// Index 0: marker 10xxxxxx.
template <>
struct utf8_header_t<0> {
    static const char value = '\x80';
};

// Index 2: marker 110xxxxx.
template <>
struct utf8_header_t<2> {
    static const char value = '\xC0';
};

// Index 3: marker 1110xxxx.
template <>
struct utf8_header_t<3> {
    static const char value = '\xE0';
};

// Index 4: marker 11110xxx.
template <>
struct utf8_header_t<4> {
    static const char value = '\xF0';
};
// Returns `code`, narrowed to char, with every bit of Mask set (bitwise OR).
//   Mask - compile-time bit pattern to OR in.
//   code - any integer-like value; only its low char-sized part is kept.
template <char Mask, typename BinaryInteger>
inline char add_mask(BinaryInteger code) {
    // Narrow first so the OR operates on a single byte's worth of data.
    return static_cast<char>(static_cast<char>(code) | Mask);
}
83 template <std::
size_t NumBytes,
bool Header,
typename BinaryInteger>
84 inline char utf8_add_mask(BinaryInteger code) {
85 return add_mask < utf8_header_t < Header ? NumBytes : 0 > ::value > (code);
90 inline char utf8_add_mask_0_false(
char code) {
return utf8_add_mask<0, false>(code); }
// Returns `code` with every bit of Mask cleared (code AND NOT Mask), narrowed
// to char.
//   Mask - compile-time bit pattern to remove.
//   code - any integer-like value.
template <char Mask, typename BinaryInteger>
inline char strip_mask(BinaryInteger code) {
    return static_cast<char>(code & ~Mask);
}
99 template <std::
size_t NumBytes,
bool Header,
typename BinaryInteger>
100 inline char utf8_strip_mask(BinaryInteger code) {
101 return strip_mask < utf8_header_t < Header ? NumBytes : 0 > ::value > (code);
106 template <std::
size_t Position>
107 inline boost::uint32_t promote_fragment(
char fragment) {
108 return boost::uint32_t(fragment << ((Position - 1) * 6));
112 inline boost::uint32_t promote_fragment<1>(
char fragment) {
113 return boost::uint32_t(fragment);
117 inline boost::uint32_t promote_fragment<0>(char);
121 template <std::
size_t Position>
122 inline char demote_fragment(boost::uint32_t fragment) {
123 return char((fragment >> ((Position - 1) * 6)) & 0x0000003F);
127 inline char demote_fragment<1>(boost::uint32_t fragment) {
128 return char(fragment & 0x0000003F);
132 inline char demote_fragment<0>(boost::uint32_t);
135 inline char demote_fragment_1(boost::uint32_t fragment) {
return demote_fragment<1>(fragment); }
140 template <
typename T, std::
size_t ByteCount,
bool Header = true>
141 struct demotion_engine_t {
142 template <
typename OutputIterator>
143 inline OutputIterator operator()(boost::uint32_t code, OutputIterator out) {
144 *out =
static_cast<T
>(utf8_add_mask<ByteCount, Header>(demote_fragment<ByteCount>(code)));
148 return demotion_engine_t<T, ByteCount - 1,
false>()(code, out);
153 template <
typename T>
154 struct demotion_engine_t<T, 1, false> {
155 template <
typename OutputIterator>
156 inline OutputIterator operator()(boost::uint32_t code, OutputIterator out) {
157 *out =
static_cast<T
>(utf8_add_mask_0_false(demote_fragment_1(code)));
165 template <std::
size_t ByteCount,
bool Header = true>
166 struct promotion_engine_t {
167 template <
typename InputIterator>
168 inline boost::uint32_t operator()(InputIterator& first, InputIterator last) {
174 char n =
static_cast<char>(*first);
175 char stripped(utf8_strip_mask<ByteCount, Header>(n));
176 boost::uint32_t shifted(promote_fragment<ByteCount>(stripped));
181 ADOBE_ASSERT(
false &&
"unicode: UTF-8 to UTF-32 conversion ran out of input");
185 return shifted | promotion_engine_t<ByteCount - 1,
false>()(first, last);
190 struct promotion_engine_t<1, false> {
191 template <
typename InputIterator>
192 inline boost::uint32_t operator()(InputIterator& first, InputIterator) {
193 boost::uint32_t result(promote_fragment<1>(utf8_strip_mask<0, false>(*first)));
203 template <
typename InputIterator,
typename T>
204 InputIterator to_utf32(InputIterator first, InputIterator last, T& result, unicode_size_type_<2>) {
205 boost::uint16_t code =
static_cast<boost::uint16_t
>(*first);
209 result =
static_cast<T
>(code);
210 }
else if (code < 0xDC00) {
212 ADOBE_ASSERT(
false &&
"unicode: UTF-16 lead surrogate found without trail surrogate");
216 boost::uint16_t trail =
static_cast<boost::uint16_t
>(*first);
219 ADOBE_ASSERT((0xDC00 <= trail && trail <= 0xDFFF) &&
220 "unicode: UTF-16 lead surrogate found without trail surrogate");
222 result =
static_cast<T
>(((code - 0xD800) << 10) + (trail - 0xDC00) + 0x10000);
224 ADOBE_ASSERT(!(code < 0xE000) &&
225 "unicode: UTF-16 trail surrogate found without lead surrogate");
226 result =
static_cast<T
>(code);
234 template <
typename InputIterator,
typename T>
235 InputIterator to_utf32(InputIterator first, InputIterator last, T& result, unicode_size_type_<1>) {
236 unsigned char n(static_cast<unsigned char>(*first));
238 if (n < to_utf32_pivot_1_k) {
239 result =
static_cast<T
>(n);
241 }
else if (n < to_utf32_pivot_3_k) {
242 ADOBE_ASSERT(!(n < to_utf32_pivot_2_k) &&
243 "unicode: ill-defined UTF-8 (first byte is 10xxxxxx)");
244 result =
static_cast<T
>(promotion_engine_t<2>()(first, last));
245 }
else if (n < to_utf32_pivot_4_k) {
246 result =
static_cast<T
>(promotion_engine_t<3>()(first, last));
247 }
else if (n < to_utf32_pivot_5_k) {
248 result =
static_cast<T
>(promotion_engine_t<4>()(first, last));
250 ADOBE_ASSERT(
false &&
"unicode: ill-defined UTF-8 (first byte is 11111xxx)");
253 ADOBE_ASSERT(!(result > 0x0010FFFF) &&
"unicode: ill-defined UTF-8 (code point out of range)");
255 ADOBE_ASSERT(!(0x0000D800 <= result && result <= 0x0000DFFF) &&
256 "unicode: ill-defined UTF-8 (surrogate code point)");
263 template <
typename InputIterator,
typename T>
264 InputIterator to_utf32(InputIterator first, InputIterator, T& result, unicode_size_type_<4>) {
265 result =
static_cast<T
>(*first);
277 template <
typename T,
typename O>
278 O utf32_to_utf8(boost::uint32_t code, O output) {
279 if (code < to_utf8_pivot_1_k)
281 *output =
static_cast<T
>(code);
283 }
else if (code < to_utf8_pivot_2_k)
284 output = demotion_engine_t<T, 2>()(code, output);
285 else if (code < to_utf8_pivot_3_k)
286 output = demotion_engine_t<T, 3>()(code, output);
287 else if (code < to_utf8_pivot_4_k)
288 output = demotion_engine_t<T, 4>()(code, output);
290 ADOBE_ASSERT(
false &&
"unicode: invalid code point (out of range)");
// Writes `code` to `output` as 16-bit units, each cast to T.
//   T      - explicit; type each unit is cast to.
//   N      - deduced; integer type of `code`.
//   output - output iterator receiving the units.
// Returns the iterator just past the last unit written.
//
// Values below 0x10000 are written as one unit. Other values are split into
// two units: ((code - 0x10000) >> 10) + 0xD800 followed by
// ((code - 0x10000) & 0x03FF) + 0xDC00.
template <typename T, typename N, typename O>
O utf32_to_utf16(N code, O output) {
    if (code < 0x10000) {
        *output = static_cast<T>(code);
    } else {
        *output = static_cast<T>(((code - 0x10000) >> 10) + 0xD800);
        ++output;

        *output = static_cast<T>(((code - 0x10000) & 0x03FF) + 0xDC00);
    }

    // Step past the final unit written by either branch.
    return ++output;
}
327 template <
typename T,
typename I,
330 O to_utf8(I first, I last, O output, unicode_size_type_<1>) {
331 return std::copy(first, last, output);
341 template <
typename T,
typename I,
344 O to_utf8(I first, I last, O output, unicode_size_type_<2>) {
345 while (first != last) {
348 first = to_utf32(first, last, tmp, unicode_size_type_<2>());
350 output = utf32_to_utf8<T>(tmp, output);
363 template <
typename T,
typename I,
366 O to_utf8(I first, I last, O output, unicode_size_type_<4>) {
367 while (first != last) {
368 output = utf32_to_utf8<T>(
static_cast<boost::uint32_t
>(*first), output);
381 template <
typename T,
typename I,
384 O to_utf16(I first, I last, O output, unicode_size_type_<1>) {
385 while (first != last) {
386 boost::uint32_t result;
388 first = to_utf32(first, last, result, unicode_size_type_<1>());
390 output = utf32_to_utf16<T>(result, output);
402 template <
typename T,
typename I,
405 O to_utf16(I first, I last, O output, unicode_size_type_<2>) {
406 return std::copy(first, last, output);
415 template <
typename T,
typename I,
418 O to_utf16(I first, I last, O output, unicode_size_type_<4>) {
419 while (first != last) {
420 output = utf32_to_utf16<T>(*first, output);
429 template <
typename T,
typename I,
typename O>
430 O to_utf_(I f, I l, O o, unicode_size_type_<1>) {
431 return to_utf8<T>(f, l, o,
typename unicode_size_type<I>::type());
434 template <
typename T,
typename I,
typename O>
435 O to_utf_(I f, I l, O o, unicode_size_type_<2>) {
436 return to_utf16<T>(f, l, o,
typename unicode_size_type<I>::type());
439 template <
typename T,
typename I,
typename O>
440 O to_utf_(I f, I l, O o, unicode_size_type_<4>) {
444 f = to_utf32(f, l, result,
typename unicode_size_type<I>::type());
487 template <
typename T,
typename I,
typename O>
489 return detail::to_utf_<T>(f, l, o, detail::unicode_size_type_<sizeof(T)>());
O copy_utf(I f, I l, O o)