from-utf8.i (7986B)
1 /* -*- C++ -*- vim: set syntax=cpp: 2 * PURPOSE: 3 * 4 * Provide the implementation of character and string converter functions 5 * FROM utf8 to utf8, utf16, utf32, char, and wchar_t. 6 * 7 * STEPS: 8 * 9 * (1) Include the implementation of the character converters from utf8 10 * to utf8, utf16, utf32, char, and wchar_t. 11 * 12 * Use: "character-converter/from-utf8.i" 13 * --> implementation for utf8 14 * 15 * "../generator/character-converter-char-wchar_t.gi" 16 * --> route 'char' and 'wchar_t' conversion to 17 * one of the converters defined before. 18 * 19 * (2) Generate the implementation of the string converters in terms 20 * of those character converters. 21 * 22 * Use: "../generator/implementation-string-converters.gi" 23 * 24 * which uses 25 * 26 * "../generator/string-converter.gi" 27 * 28 * to implement each string converter from the given 29 * character converters. 30 * 31 * All functions are placed in the analyzer's namespace. 32 * 33 * ACKNOWLEDGEMENT: Parts of the following utf8 conversion have been derived from 34 * segments of the utf8 conversion library of Alexey Vatchenko 35 * <av@bsdua.org>. 36 * 37 * 2010 (C) Frank-Rene Schaefer; 38 * ABSOLUTELY NO WARRANTY */ 39 #if ! defined(__QUEX_INCLUDE_GUARD__CONVERTER_HELPER__FROM_UTF8_I) \ 40 || defined(__QUEX_INCLUDE_GUARD__CONVERTER_HELPER__TMP_DISABLED) 41 #if ! defined(__QUEX_INCLUDE_GUARD__CONVERTER_HELPER__TMP_DISABLED) 42 # define __QUEX_INCLUDE_GUARD__CONVERTER_HELPER__FROM_UTF8_I 43 #endif 44 45 #include "from-utf8" 46 47 #define __QUEX_FROM utf8 48 #define __QUEX_FROM_TYPE uint8_t 49 50 QUEX_NAMESPACE_MAIN_OPEN 51 52 /* (1) Implement the character converters utf8 to utf8, utf16, utf32. 53 * (Note, that character converters are generated into namespace 'quex'.)*/ 54 QUEX_INLINE void 55 /* DrainEnd pointer is not returned, since the increment is always '1' */ 56 QUEX_CONVERTER_CHAR_DEF(utf8, utf8)(const uint8_t** input_pp, uint8_t** output_pp) 57 { 58 /* Just for comformity with other encodings: Do nothing but copying. */ 59 if( (**input_pp & (uint8_t)0x80) == (uint8_t)0 ) { 60 *((*output_pp)++) = *(*input_pp)++; 61 } 62 else if( **input_pp < (uint8_t)0xE0 ) { 63 *((*output_pp)++) = *(*input_pp)++; 64 *((*output_pp)++) = *(*input_pp)++; 65 } 66 else if( **input_pp < (uint8_t)0xF0 ) { 67 *((*output_pp)++) = *(*input_pp)++; 68 *((*output_pp)++) = *(*input_pp)++; 69 *((*output_pp)++) = *(*input_pp)++; 70 } 71 else { 72 *((*output_pp)++) = *(*input_pp)++; 73 *((*output_pp)++) = *(*input_pp)++; 74 *((*output_pp)++) = *(*input_pp)++; 75 *((*output_pp)++) = *(*input_pp)++; 76 } 77 } 78 79 QUEX_INLINE void 80 /* DrainEnd pointer is not returned, since the increment is always '1' */ 81 QUEX_CONVERTER_CHAR_DEF(utf8, utf16)(const uint8_t** input_pp, uint16_t** output_pp) 82 { 83 const uint8_t* iterator = *input_pp; 84 uint32_t tmp = 0; 85 86 if( (*iterator & (uint8_t)0x80) == (uint8_t)0 ) { 87 /* Header: 0xxx.xxxx */ 88 **output_pp = (uint16_t)*(iterator++); 89 90 ++(*output_pp); 91 } 92 else if( *iterator < (uint8_t)0xE0 ) { /* ... max: 1101.1111 --> 0xDF, next: 0xE0 */ 93 /* 110x.xxxx 10yy.yyyy 94 * => 0000.0xxx:xxyy.yyyy */ 95 **output_pp = (uint16_t)(( ((uint16_t)*(iterator++)) & (uint16_t)0x1F ) << 6); 96 **output_pp = (uint16_t)((**output_pp) | (( ((uint16_t)*(iterator++)) & (uint16_t)0x3F ))); 97 98 ++(*output_pp); 99 } 100 else if( *iterator < (uint8_t)0xF0 ) { /* ... max: 1110.1111 --> 0xEF, next: 0xF0 */ 101 /* 1110.xxxx 10yy.yyyy 10zz.zzzz 102 * => xxxx.yyyy:yyzz.zzzz */ 103 **output_pp = (uint16_t)(( ((uint16_t)*(iterator++)) & (uint16_t)0x0F ) << 12); 104 **output_pp = (uint16_t)((**output_pp) | (( ((uint16_t)*(iterator++)) & (uint16_t)0x3F ) << 6)); 105 **output_pp = (uint16_t)((**output_pp) | (( ((uint16_t)*(iterator++)) & (uint16_t)0x3F ))); 106 107 ++(*output_pp); 108 } 109 else { 110 /* Unicode standard defines only chars until 0x10ffff, so max(len(utf8char)) == 4. 111 * 112 * NO CHECK: if( *iterator < 0xF8 ) { ... max: 1111.0111 --> 0xF7, next: 0xF8 113 * 114 * 1111.0uuu 10xx.xxxx 10yy.yyyy 10zz.zzzz 115 * => 000u.uuxx:xxxx.yyyy:yyzz.zzzz */ 116 117 /* It happens that the UTF8 domain with 4 bytes is >= 0x10000 which is the 118 * starting domain for surrogates (i.e. what is mapped into 0xD800-0xE000 */ 119 tmp = (uint32_t)( (((uint32_t)*(iterator++)) & (uint32_t)0x07 ) << 18); 120 tmp = (uint32_t)(tmp | (((uint32_t)*(iterator++)) & (uint32_t)0x3F ) << 12); 121 tmp = (uint32_t)(tmp | (((uint32_t)*(iterator++)) & (uint32_t)0x3F ) << 6); 122 tmp = (uint32_t)(tmp | (((uint32_t)*(iterator++)) & (uint32_t)0x3F )); 123 124 tmp = (uint32_t)(tmp - (uint32_t)0x10000); 125 *((*output_pp)++) = (uint16_t)((tmp >> 10) | (uint32_t)0xD800); 126 *((*output_pp)++) = (uint16_t)((tmp & (uint32_t)0x3FF) | (uint32_t)0xDC00); 127 } 128 *input_pp = iterator; 129 } 130 131 QUEX_INLINE void 132 /* DrainEnd pointer is not returned, since the increment is always '1' */ 133 QUEX_CONVERTER_CHAR_DEF(utf8, utf32)(const uint8_t** input_pp, uint32_t** output_pp) 134 { 135 const uint8_t* iterator = *input_pp; 136 137 if( (*iterator & (uint8_t)0x80) == (uint8_t)0 ) { 138 /* Header: 0xxx.xxxx */ 139 **output_pp = (uint32_t)*(iterator++); 140 } 141 else if( *iterator < (uint8_t)0xE0 ) { /* ... max: 1101.1111 --> 0xDF, next: 0xE0 */ 142 /* 110x.xxxx 10yy.yyyy 143 * => 0000.0xxx:xxyy.yyyy */ 144 **output_pp = ( ( ((uint32_t)*(iterator++)) & (uint32_t)0x1F ) << 6); 145 **output_pp = (uint32_t)((**output_pp) | ( ((uint32_t)*(iterator++)) & (uint32_t)0x3F )); 146 } 147 else if( *iterator < (uint8_t)0xF0 ) { /* ... max: 1110.1111 --> 0xEF, next: 0xF0 */ 148 /* 1110.xxxx 10yy.yyyy 10zz.zzzz 149 * => xxxx.yyyy:yyzz.zzzz */ 150 **output_pp = ( ( ((uint32_t)*(iterator++)) & (uint32_t)0x0F ) << 12); 151 **output_pp = (uint32_t)((**output_pp) | ( ((uint32_t)*(iterator++)) & (uint32_t)0x3F ) << 6); 152 **output_pp = (uint32_t)((**output_pp) | ( ((uint32_t)*(iterator++)) & (uint32_t)0x3F )); 153 } 154 else { 155 /* Unicode standard defines only chars until 0x10ffff, so max(len(utf8char)) == 4. 156 * 157 * NO CHECK: if( *iterator < 0xF8 ) { ... max: 1111.0111 --> 0xF7, next: 0xF8 158 * 159 * 1111.0uuu 10xx.xxxx 10yy.yyyy 10zz.zzzz 160 * => 000u.uuxx:xxxx.yyyy:yyzz.zzzz */ 161 **output_pp = ( ( ((uint32_t)*(iterator++)) & (uint32_t)0x07 ) << 18); 162 **output_pp = (uint32_t)((**output_pp) | ( ((uint32_t)*(iterator++)) & (uint32_t)0x3F ) << 12); 163 **output_pp = (uint32_t)((**output_pp) | ( ((uint32_t)*(iterator++)) & (uint32_t)0x3F ) << 6); 164 **output_pp = (uint32_t)((**output_pp) | ( ((uint32_t)*(iterator++)) & (uint32_t)0x3F )); 165 } 166 ++(*output_pp); 167 *input_pp = iterator; 168 } 169 170 171 /* (1b) Derive converters to char and wchar_t from the given set 172 * of converters. (Generator uses __QUEX_FROM and QUEX_FROM_TYPE) */ 173 #include "character-converter-to-char-wchar_t.gi" 174 175 /* (2) Generate string converters to utf8, utf16, utf32 based on the 176 * definitions of the character converters. */ 177 #include "implementations.gi" 178 179 QUEX_NAMESPACE_MAIN_CLOSE 180 181 #endif /* __QUEX_INCLUDE_GUARD__CONVERTER_HELPER__FROM_UTF8_I */