Adobe Source Libraries  1.43
unicode.hpp
Go to the documentation of this file.
1 /*
2  Copyright 2012 Adobe Systems Incorporated
3  Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
4  or a copy at http://stlab.adobe.com/licenses.html)
5 */
6 
7 /**************************************************************************************************/
8 
9 #ifndef ADOBE_UNICODE_HPP
10 #define ADOBE_UNICODE_HPP
11 
12 /**************************************************************************************************/
13 
#include <algorithm>
#include <cstddef>
#include <iterator>

#include <boost/cstdint.hpp>

#include <adobe/cassert.hpp>
19 
20 /**************************************************************************************************/
21 
22 namespace adobe {
23 
24 /**************************************************************************************************/
25 
26 namespace detail {
27 
28 /**************************************************************************************************/
29 
// Empty tag type keyed on a code-unit width in bytes. Instances are passed as a
// trailing argument purely to select between overloads.
template <std::size_t CodeUnitBytes>
struct unicode_size_type_ {};

// Computes the width tag for the values an iterator yields:
// unicode_size_type<I>::type is unicode_size_type_<sizeof(value_type of I)>.
template <typename I> // I models InputIterator
struct unicode_size_type {
private:
    typedef typename std::iterator_traits<I>::value_type code_unit_t;

public:
    typedef unicode_size_type_<sizeof(code_unit_t)> type;
};
37 
38 /**************************************************************************************************/
39 
// Thresholds compared against the first byte of a UTF-8 sequence when decoding:
// below pivot 1 the byte stands alone; below pivot 2 it is a 10xxxxxx byte
// (illegal as a first byte); below pivots 3, 4 and 5 it opens a 2-, 3- and
// 4-byte sequence respectively; pivot 5 and above is rejected.
const unsigned char to_utf32_pivot_1_k = 0x80;
const unsigned char to_utf32_pivot_2_k = 0xC0;
const unsigned char to_utf32_pivot_3_k = 0xE0;
const unsigned char to_utf32_pivot_4_k = 0xF0;
const unsigned char to_utf32_pivot_5_k = 0xF8;
45 
46 const boost::uint32_t to_utf8_pivot_1_k(1UL << 7);
47 const boost::uint32_t to_utf8_pivot_2_k(1UL << 11);
48 const boost::uint32_t to_utf8_pivot_3_k(1UL << 16);
49 const boost::uint32_t to_utf8_pivot_4_k(1UL << 21);
50 
51 /**************************************************************************************************/
52 
// Carrier for a compile-time marker byte; the utf8_header_t specializations
// below inherit `value` from it.
template <char Bits>
struct utf8_header_bits_ {
    static const char value = Bits;
};

// Maps a UTF-8 sequence length to the marker bits of its first byte. Index 0
// holds the marker of every non-leading byte. Lengths without a specialization
// have no `value`, so using them fails to compile.
template <std::size_t NumBytes>
struct utf8_header_t {};

template <>
struct utf8_header_t<0> : utf8_header_bits_<'\x80'> {}; // nonheader
// template <> struct utf8_header_t<1> { static const char value = '\x00'; }; // illegal
template <>
struct utf8_header_t<2> : utf8_header_bits_<'\xC0'> {};
template <>
struct utf8_header_t<3> : utf8_header_bits_<'\xE0'> {};
template <>
struct utf8_header_t<4> : utf8_header_bits_<'\xF0'> {};
// template <> struct utf8_header_t<5> { static const char value = '\xF8'; }; // illegal
// template <> struct utf8_header_t<6> { static const char value = '\xFC'; }; // illegal
75 
76 /**************************************************************************************************/
77 
// Narrows `code` to char and ORs the compile-time bit pattern `Mask` into it.
template <char Mask, typename BinaryInteger>
inline char add_mask(BinaryInteger code) {
    const char narrowed = static_cast<char>(code);
    return static_cast<char>(narrowed | Mask);
}
82 
83 template <std::size_t NumBytes, bool Header, typename BinaryInteger>
84 inline char utf8_add_mask(BinaryInteger code) {
85  return add_mask < utf8_header_t < Header ? NumBytes : 0 > ::value > (code);
86 }
87 
88 
89 // MM concept gcc-4.1.1 workaround
90 inline char utf8_add_mask_0_false(char code) { return utf8_add_mask<0, false>(code); }
91 
92 /**************************************************************************************************/
93 
// Clears every bit of `code` that is set in the compile-time pattern `Mask`
// and narrows what is left to char.
template <char Mask, typename BinaryInteger>
inline char strip_mask(BinaryInteger code) {
    const int keep_bits = ~Mask; // Mask is promoted to int before the complement
    return static_cast<char>(code & keep_bits);
}
98 
99 template <std::size_t NumBytes, bool Header, typename BinaryInteger>
100 inline char utf8_strip_mask(BinaryInteger code) {
101  return strip_mask < utf8_header_t < Header ? NumBytes : 0 > ::value > (code);
102 }
103 
104 /**************************************************************************************************/
105 
106 template <std::size_t Position>
107 inline boost::uint32_t promote_fragment(char fragment) {
108  return boost::uint32_t(fragment << ((Position - 1) * 6));
109 }
110 
111 template <>
112 inline boost::uint32_t promote_fragment<1>(char fragment) {
113  return boost::uint32_t(fragment);
114 }
115 
116 template <>
117 inline boost::uint32_t promote_fragment<0>(char); // unimplemented
118 
119 /**************************************************************************************************/
120 
121 template <std::size_t Position>
122 inline char demote_fragment(boost::uint32_t fragment) {
123  return char((fragment >> ((Position - 1) * 6)) & 0x0000003F);
124 }
125 
126 template <>
127 inline char demote_fragment<1>(boost::uint32_t fragment) {
128  return char(fragment & 0x0000003F);
129 }
130 
131 template <>
132 inline char demote_fragment<0>(boost::uint32_t); // unimplemented
133 
134 // MM concept gcc-4.1.1 workaround
135 inline char demote_fragment_1(boost::uint32_t fragment) { return demote_fragment<1>(fragment); }
136 
137 
138 /**************************************************************************************************/
139 
140 template <typename T, std::size_t ByteCount, bool Header = true>
141 struct demotion_engine_t {
142  template <typename OutputIterator>
143  inline OutputIterator operator()(boost::uint32_t code, OutputIterator out) {
144  *out = static_cast<T>(utf8_add_mask<ByteCount, Header>(demote_fragment<ByteCount>(code)));
145 
146  ++out;
147 
148  return demotion_engine_t<T, ByteCount - 1, false>()(code, out);
149  }
150 };
151 
152 
153 template <typename T>
154 struct demotion_engine_t<T, 1, false> {
155  template <typename OutputIterator>
156  inline OutputIterator operator()(boost::uint32_t code, OutputIterator out) {
157  *out = static_cast<T>(utf8_add_mask_0_false(demote_fragment_1(code)));
158 
159  return ++out;
160  }
161 };
162 
163 /**************************************************************************************************/
164 
165 template <std::size_t ByteCount, bool Header = true>
166 struct promotion_engine_t {
167  template <typename InputIterator>
168  inline boost::uint32_t operator()(InputIterator& first, InputIterator last) {
169  /*
170  CodeWarrior 9.4 doesn't like this code composited into one line;
171  GCC doesn't seem to have a problem.
172  */
173 
174  char n = static_cast<char>(*first);
175  char stripped(utf8_strip_mask<ByteCount, Header>(n));
176  boost::uint32_t shifted(promote_fragment<ByteCount>(stripped));
177 
178  ++first;
179 
180  if (first == last) {
181  ADOBE_ASSERT(false && "unicode: UTF-8 to UTF-32 conversion ran out of input");
182  return 0;
183  }
184 
185  return shifted | promotion_engine_t<ByteCount - 1, false>()(first, last);
186  }
187 };
188 
189 template <>
190 struct promotion_engine_t<1, false> {
191  template <typename InputIterator>
192  inline boost::uint32_t operator()(InputIterator& first, InputIterator) {
193  boost::uint32_t result(promote_fragment<1>(utf8_strip_mask<0, false>(*first)));
194 
195  ++first;
196 
197  return result;
198  }
199 };
200 
201 /**************************************************************************************************/
202 
203 template <typename InputIterator, typename T>
204 InputIterator to_utf32(InputIterator first, InputIterator last, T& result, unicode_size_type_<2>) {
205  boost::uint16_t code = static_cast<boost::uint16_t>(*first);
206  ++first;
207 
208  if (code < 0xD800) {
209  result = static_cast<T>(code);
210  } else if (code < 0xDC00) {
211  if (first == last) {
212  ADOBE_ASSERT(false && "unicode: UTF-16 lead surrogate found without trail surrogate");
213  return first;
214  }
215 
216  boost::uint16_t trail = static_cast<boost::uint16_t>(*first);
217  ++first;
218 
219  ADOBE_ASSERT((0xDC00 <= trail && trail <= 0xDFFF) &&
220  "unicode: UTF-16 lead surrogate found without trail surrogate");
221 
222  result = static_cast<T>(((code - 0xD800) << 10) + (trail - 0xDC00) + 0x10000);
223  } else {
224  ADOBE_ASSERT(!(code < 0xE000) &&
225  "unicode: UTF-16 trail surrogate found without lead surrogate");
226  result = static_cast<T>(code);
227  }
228 
229  return first;
230 }
231 
232 /**************************************************************************************************/
233 
234 template <typename InputIterator, typename T>
235 InputIterator to_utf32(InputIterator first, InputIterator last, T& result, unicode_size_type_<1>) {
236  unsigned char n(static_cast<unsigned char>(*first));
237 
238  if (n < to_utf32_pivot_1_k) {
239  result = static_cast<T>(n);
240  ++first;
241  } else if (n < to_utf32_pivot_3_k) {
242  ADOBE_ASSERT(!(n < to_utf32_pivot_2_k) &&
243  "unicode: ill-defined UTF-8 (first byte is 10xxxxxx)");
244  result = static_cast<T>(promotion_engine_t<2>()(first, last));
245  } else if (n < to_utf32_pivot_4_k) {
246  result = static_cast<T>(promotion_engine_t<3>()(first, last));
247  } else if (n < to_utf32_pivot_5_k) {
248  result = static_cast<T>(promotion_engine_t<4>()(first, last));
249  } else {
250  ADOBE_ASSERT(false && "unicode: ill-defined UTF-8 (first byte is 11111xxx)");
251  }
252 
253  ADOBE_ASSERT(!(result > 0x0010FFFF) && "unicode: ill-defined UTF-8 (code point out of range)");
254 
255  ADOBE_ASSERT(!(0x0000D800 <= result && result <= 0x0000DFFF) &&
256  "unicode: ill-defined UTF-8 (surrogate code point)");
257 
258  return first;
259 }
260 
261 /**************************************************************************************************/
262 
263 template <typename InputIterator, typename T>
264 InputIterator to_utf32(InputIterator first, InputIterator, T& result, unicode_size_type_<4>) {
265  result = static_cast<T>(*first);
266 
267  return ++first;
268 }
269 
270 /**************************************************************************************************/
271 /*
272  utf32 -> utf8
273  - 1 source value
274  - n output values
275 */
276 
277 template <typename T, typename O> // O models OutputIterator
278 O utf32_to_utf8(boost::uint32_t code, O output) {
279  if (code < to_utf8_pivot_1_k) // UTF-8 is 1 byte long
280  {
281  *output = static_cast<T>(code);
282  ++output;
283  } else if (code < to_utf8_pivot_2_k) // UTF-8 is 2 bytes long
284  output = demotion_engine_t<T, 2>()(code, output);
285  else if (code < to_utf8_pivot_3_k) // UTF-8 is 3 bytes long
286  output = demotion_engine_t<T, 3>()(code, output);
287  else if (code < to_utf8_pivot_4_k) // UTF-8 is 4 bytes long
288  output = demotion_engine_t<T, 4>()(code, output);
289  else
290  ADOBE_ASSERT(false && "unicode: invalid code point (out of range)");
291 
292  return output;
293 }
294 
295 /**************************************************************************************************/
296 /*
297  utf32 -> utf16
298  - 1 source value
299  - n output values
300 */
301 
// Writes one code point to `output` as UTF-16 and returns the advanced output
// iterator. Values below 0x10000 are written as a single unit; anything else is
// split into a lead/trail surrogate pair.
template <typename T, // output type for O
          typename N, // models Integer; sizeof(T) must equal 4; code must be valid utf32
          typename O>
// models OutputIterator
O utf32_to_utf16(N code, O output) {
    if (!(code < 0x10000)) {
        // Upper ten bits of the offset go into the lead surrogate ...
        *output = static_cast<T>(((code - 0x10000) >> 10) + 0xD800);
        ++output;

        // ... and the lower ten bits into the trail surrogate.
        *output = static_cast<T>(((code - 0x10000) & 0x03FF) + 0xDC00);
        ++output;

        return output;
    }

    *output = static_cast<T>(code);
    ++output;

    return output;
}
319 
320 /**************************************************************************************************/
321 /*
322  utf8 -> utf8
323  - n source values
324  - n output values
325 */
326 
327 template <typename T, typename I, // I models InputIterator
328  typename O>
329 // O models OutputIterator
330 O to_utf8(I first, I last, O output, unicode_size_type_<1>) {
331  return std::copy(first, last, output);
332 }
333 
334 /**************************************************************************************************/
335 /*
336  utf16 -> utf8
337  - n source values
338  - m output values
339 */
340 
341 template <typename T, typename I, // I models InputIterator
342  typename O>
343 // O models OutputIterator
344 O to_utf8(I first, I last, O output, unicode_size_type_<2>) {
345  while (first != last) {
346  boost::uint32_t tmp;
347 
348  first = to_utf32(first, last, tmp, unicode_size_type_<2>());
349 
350  output = utf32_to_utf8<T>(tmp, output);
351  }
352 
353  return output;
354 }
355 
356 /**************************************************************************************************/
357 /*
358  utf32 -> utf8
359  - n source values
360  - m output values
361 */
362 
363 template <typename T, typename I, // I models InputIterator
364  typename O>
365 // O models OutputIterator
366 O to_utf8(I first, I last, O output, unicode_size_type_<4>) {
367  while (first != last) {
368  output = utf32_to_utf8<T>(static_cast<boost::uint32_t>(*first), output);
369  ++first;
370  }
371 
372  return output;
373 }
374 
375 /**************************************************************************************************/
376 /*
377  utf8 -> utf16
378  - n source values
379  - m output values
380 */
381 template <typename T, typename I, // I models InputIterator
382  typename O>
383 // O models OutputIterator
384 O to_utf16(I first, I last, O output, unicode_size_type_<1>) {
385  while (first != last) {
386  boost::uint32_t result;
387 
388  first = to_utf32(first, last, result, unicode_size_type_<1>());
389 
390  output = utf32_to_utf16<T>(result, output);
391  }
392 
393  return output;
394 }
395 
396 /**************************************************************************************************/
397 /*
398  utf16 -> utf16
399  - n source values
400  - n output values
401 */
402 template <typename T, typename I, // I models InputIterator
403  typename O>
404 // O models OutputIterator
405 O to_utf16(I first, I last, O output, unicode_size_type_<2>) {
406  return std::copy(first, last, output);
407 }
408 
409 /**************************************************************************************************/
410 /*
411  utf32 -> utf16
412  - n source values
413  - m output values
414 */
415 template <typename T, typename I, // I models InputIterator
416  typename O>
417 // O models OutputIterator
418 O to_utf16(I first, I last, O output, unicode_size_type_<4>) {
419  while (first != last) {
420  output = utf32_to_utf16<T>(*first, output);
421  ++first;
422  }
423 
424  return output;
425 }
426 
427 /**************************************************************************************************/
428 
429 template <typename T, typename I, typename O>
430 O to_utf_(I f, I l, O o, unicode_size_type_<1>) {
431  return to_utf8<T>(f, l, o, typename unicode_size_type<I>::type());
432 }
433 
434 template <typename T, typename I, typename O>
435 O to_utf_(I f, I l, O o, unicode_size_type_<2>) {
436  return to_utf16<T>(f, l, o, typename unicode_size_type<I>::type());
437 }
438 
439 template <typename T, typename I, typename O>
440 O to_utf_(I f, I l, O o, unicode_size_type_<4>) {
441  T result;
442 
443  while (f != l) {
444  f = to_utf32(f, l, result, typename unicode_size_type<I>::type());
445 
446  *o++ = result;
447  }
448 
449  return o;
450 }
451 
452 /**************************************************************************************************/
453 
454 } // namespace detail
455 
456 /**************************************************************************************************/
457 
487 template <typename T, typename I, typename O>
488 O copy_utf(I f, I l, O o) {
489  return detail::to_utf_<T>(f, l, o, detail::unicode_size_type_<sizeof(T)>());
490 }
491 
492 /**************************************************************************************************/
493 
494 } // namespace adobe
495 
496 /**************************************************************************************************/
497 
498 #endif
499 
500 /**************************************************************************************************/
O copy_utf(I f, I l, O o)
Definition: unicode.hpp:488