from-utf16.i (5067B)
1 /* -*- C++ -*- vim: set syntax=cpp: 2 * PURPOSE: 3 * 4 * Provide the implementation of character and string converter functions 5 * FROM utf16 to utf8, utf16, utf32, char, and wchar_t. 6 * 7 * STEPS: 8 * 9 * (1) Include the implementation of the character converters from utf16 10 * to utf8, utf16, utf32, char, and wchar_t. 11 * 12 * Use: "character-converter/from-utf16.i" 13 * --> implementation for utf16 14 * 15 * "../generator/character-converter-char-wchar_t.gi" 16 * --> route 'char' and 'wchar_t' conversion to 17 * one of the converters defined before. 18 * 19 * (2) Generate the implementation of the string converters in terms 20 * of those character converters. 21 * 22 * Use: "../generator/implementation-string-converters.gi" 23 * 24 * which uses 25 * 26 * "../generator/string-converter.gi" 27 * 28 * to implement each string converter from the given 29 * character converters. 30 * 31 * All functions are placed in the analyzer's namespace. 32 * 33 * 2010 (C) Frank-Rene Schaefer; 34 * ABSOLUTELY NO WARRANTY */ 35 #if ! defined(__QUEX_INCLUDE_GUARD__CONVERTER_HELPER__FROM_UTF16_I) \ 36 || defined(__QUEX_INCLUDE_GUARD__CONVERTER_HELPER__TMP_DISABLED) 37 #if ! defined(__QUEX_INCLUDE_GUARD__CONVERTER_HELPER__TMP_DISABLED) 38 # define __QUEX_INCLUDE_GUARD__CONVERTER_HELPER__FROM_UTF16_I 39 #endif 40 41 #include "from-utf16" 42 43 #define __QUEX_FROM utf16 44 #define __QUEX_FROM_TYPE uint16_t 45 46 /* (1) Implement the character converters utf8, utf16, utf32. 47 * (Note, that character converters are generated into namespace 'quex'.)*/ 48 QUEX_NAMESPACE_MAIN_OPEN 49 50 QUEX_INLINE void 51 QUEX_CONVERTER_CHAR_DEF(utf16, utf8)(const uint16_t** input_pp, uint8_t** output_pp) 52 { 53 uint32_t x0 = (uint16_t)0; 54 uint32_t x1 = (uint16_t)0; 55 uint32_t unicode = (uint32_t)0; 56 57 if ( **input_pp <= (uint16_t)0x7f ) { 58 *((*output_pp)++) = (uint8_t)*(*input_pp); 59 ++(*input_pp); 60 61 } else if ( **input_pp <= (uint16_t)0x7ff ) { 62 *((*output_pp)++) = (uint8_t)(0xC0 | (*(*input_pp) >> 6)); 63 *((*output_pp)++) = (uint8_t)(0x80 | (*(*input_pp) & (uint16_t)0x3F)); 64 ++(*input_pp); 65 66 } else if ( **input_pp < (uint16_t)0xD800 ) { 67 *((*output_pp)++) = (uint8_t)(0xE0 | *(*input_pp) >> 12); 68 *((*output_pp)++) = (uint8_t)(0x80 | (*(*input_pp) & (uint16_t)0xFFF) >> 6); 69 *((*output_pp)++) = (uint8_t)(0x80 | (*(*input_pp) & (uint16_t)0x3F)); 70 ++(*input_pp); 71 72 } else if ( **input_pp < (uint16_t)0xE000 ) { 73 /* Characters > 0xFFFF need to be coded in two bytes by means of surrogates. */ 74 x0 = (uint32_t)(*(*input_pp)++ - (uint32_t)0xD800); 75 x1 = (uint32_t)(*(*input_pp)++ - (uint32_t)0xDC00); 76 unicode = (x0 << 10) + x1 + 0x10000; 77 78 /* Assume that only character appear, that are defined in unicode. */ 79 __quex_assert(unicode <= (uint16_t)0x1FFFFF); 80 81 *((*output_pp)++) = (uint8_t)(0xF0 | unicode >> 18); 82 *((*output_pp)++) = (uint8_t)(0x80 | (unicode & (uint32_t)0x3FFFF) >> 12); 83 *((*output_pp)++) = (uint8_t)(0x80 | (unicode & (uint32_t)0xFFF) >> 6); 84 *((*output_pp)++) = (uint8_t)(0x80 | (unicode & (uint32_t)0x3F)); 85 86 } else { 87 /* Always true: **input_pp <= 0xFFFF */ 88 *((*output_pp)++) = (uint8_t)(0xE0 | *(*input_pp) >> 12); 89 *((*output_pp)++) = (uint8_t)(0x80 | (*(*input_pp) & (uint16_t)0xFFF) >> 6); 90 *((*output_pp)++) = (uint8_t)(0x80 | (*(*input_pp) & (uint16_t)0x3F)); 91 ++(*input_pp); 92 } 93 } 94 95 QUEX_INLINE void 96 QUEX_CONVERTER_CHAR_DEF(utf16, utf16)(const uint16_t** input_pp, 97 uint16_t** output_pp) 98 { 99 if( **input_pp < (uint16_t)0xD800 || **input_pp >= (uint16_t)0xE000 ) { 100 *((*output_pp)++) = *(*input_pp)++; 101 } else { 102 *((*output_pp)++) = *(*input_pp)++; 103 *((*output_pp)++) = *(*input_pp)++; 104 } 105 } 106 107 QUEX_INLINE void 108 QUEX_CONVERTER_CHAR_DEF(utf16, utf32)(const uint16_t** input_pp, 109 uint32_t** output_pp) 110 { 111 uint32_t x0 = (uint32_t)0; 112 uint32_t x1 = (uint32_t)0; 113 114 if( **input_pp < (uint16_t)0xD800 || **input_pp >= (uint16_t)0xE000 ) { 115 *((*output_pp)++) = *(*input_pp)++; 116 } else { 117 x0 = (uint32_t)(*(*input_pp)++) - (uint32_t)0xD800; 118 x1 = (uint32_t)(*(*input_pp)++) - (uint32_t)0xDC00; 119 *((*output_pp)++) = (x0 << 10) + x1 + (uint32_t)0x10000; 120 } 121 } 122 123 /* (1b) Derive converters to char and wchar_t from the given set 124 * of converters. (Generator uses __QUEX_FROM and QUEX_FROM_TYPE) */ 125 #include "character-converter-to-char-wchar_t.gi" 126 127 /* (2) Generate string converters to utf8, utf16, utf32 based on the 128 * definitions of the character converters. */ 129 #include "implementations.gi" 130 131 QUEX_NAMESPACE_MAIN_CLOSE 132 133 #endif /* __QUEX_INCLUDE_GUARD__CONVERTER_HELPER__FROM_UTF16_I */