bom.i (8224B)
1 /* This file contains an implementation which can potentially be shared between 2 * multiple different lexical analyzers. See 'multi.i' for further info. */ 3 4 /* -*- C++ -*- vim:set syntax=cpp: 5 * 6 * Byte Order Mark (BOM) Handling. 7 * 8 * The byte order mark (BOM) is a Unicode character used to signal 9 * the endianness (byte order) of a text file or stream. Its code 10 * point is U+FEFF. 11 * [Source: <http://en.wikipedia.org/wiki/Byte_order_mark>] 12 * 13 * This file implements a function to cut the BOM and tell about 14 * the encoding of the data stream. 15 * 16 * (C) 2010 Frank-Rene Schaefer 17 * ABSOLUTELY NO WARRANTY */ 18 19 #ifndef __QUEX_INCLUDE_GUARD__BOM_I 20 #define __QUEX_INCLUDE_GUARD__BOM_I 21 22 #include "bom" 23 24 QUEX_NAMESPACE_QUEX_OPEN 25 26 extern QUEX_TYPE_BOM 27 QUEXED_DEF(bom_snap)(__QUEX_STD_FILE* InputHandle) 28 /* This function can **only** be used with **normally** behaving streams 29 * where the position increases by one with every character being read. If 30 * this is not the case then use the **binary** option of your stream. */ 31 { 32 uint8_t buffer[4] = { 0, 0, 0, 0}; 33 QUEX_TYPE_BOM result = QUEX_BOM_NONE; 34 size_t byte_n = 0; 35 size_t read_n = 0; 36 long p0 = __QUEX_STD_ftell(InputHandle); 37 38 read_n = (size_t)__QUEX_STD_fread((uint8_t*)buffer, 1, 4, InputHandle); 39 result = QUEXED_DEF(__bom_snap_core)(buffer, read_n, &byte_n); 40 41 /* Avoid temporary function argument. Store sum in p0. */ 42 p0 += (long)byte_n; 43 __QUEX_STD_fseek(InputHandle, p0, SEEK_SET); 44 45 return result; 46 } 47 48 #if ! defined(__QUEX_OPTION_PLAIN_C) 49 template <class InputStream> extern QUEX_TYPE_BOM 50 QUEXED_DEF(bom_snap)(InputStream* p_input_stream) 51 /* This function can **only** be used with **normally** behaving streams 52 * where the position increases by one with every character being read. If 53 * this is not the case then use the **binary** option of your stream. */ 54 { 55 uint8_t buffer[4] = { 0, 0, 0, 0}; 56 QUEX_TYPE_BOM result = QUEX_BOM_NONE; 57 size_t byte_n = 0; 58 size_t read_n = 0; 59 /**/ 60 const size_t CharSize = sizeof(typename InputStream::char_type); 61 typename InputStream::pos_type p0 = p_input_stream->tellg() * CharSize; 62 63 p_input_stream->read((typename InputStream::char_type*)buffer, 4 / CharSize); 64 read_n = (size_t)(p_input_stream->gcount()); 65 result = QUEXED_DEF(__bom_snap_core)(buffer, read_n, &byte_n); 66 67 /* Avoid temporary function argument. Store sum in p0. */ 68 p0 += typename InputStream::pos_type(byte_n); 69 p_input_stream->seekg(p0 / CharSize); 70 71 return result; 72 } 73 #endif 74 75 extern QUEX_TYPE_BOM 76 QUEXED_DEF(__bom_snap_core)(uint8_t buffer[4], size_t read_n, size_t* byte_n) 77 { 78 /* For non-existing bytes fill 0x77, because it does not occur 79 * anywhere as a criteria, see 'switch' after that. */ 80 switch( read_n ) { 81 case 0: return QUEX_BOM_NONE; 82 case 1: buffer[1] = 0x77; buffer[2] = 0x77; buffer[3] = 0x77; break; 83 case 2: buffer[2] = 0x77; buffer[3] = 0x77; break; 84 case 3: buffer[3] = 0x77; break; 85 } 86 87 return QUEXED_DEF(bom_identify)(buffer, byte_n); 88 } 89 90 extern QUEX_TYPE_BOM 91 QUEXED_DEF(bom_identify)(const uint8_t* const Buffer, size_t* n) 92 /* Assume, that the buffer contains at least 4 elements! */ 93 { 94 /* Table of byte order marks (BOMs), see file 'quex/code_base/bom' */ 95 const uint8_t B0 = Buffer[0]; 96 const uint8_t B1 = Buffer[1]; 97 const uint8_t B2 = Buffer[2]; 98 const uint8_t B3 = Buffer[3]; 99 QUEX_TYPE_BOM x = QUEX_BOM_NONE; 100 101 switch( B0 ) { 102 case 0x00: if( B1 == 0x00 && B2 == 0xFE && B3 == 0xFF ) { *n = 4; x = QUEX_BOM_UTF_32_BE; } break; 103 case 0x0E: if( B1 == 0xFE && B2 == 0xFF ) { *n = 3; x = QUEX_BOM_SCSU; } break; 104 case 0x0F: if( B1 == 0xFE && B2 == 0xFF ) { *n = 3; x = QUEX_BOM_SCSU_TO_UCS; } break; 105 case 0x18: if( B1 == 0xA5 && B2 == 0xFF ) { *n = 3; x = QUEX_BOM_SCSU_W0_TO_FE80; } break; 106 case 0x19: if( B1 == 0xA5 && B2 == 0xFF ) { *n = 3; x = QUEX_BOM_SCSU_W1_TO_FE80; } break; 107 case 0x1A: if( B1 == 0xA5 && B2 == 0xFF ) { *n = 3; x = QUEX_BOM_SCSU_W2_TO_FE80; } break; 108 case 0x1B: if( B1 == 0xA5 && B2 == 0xFF ) { *n = 3; x = QUEX_BOM_SCSU_W3_TO_FE80; } break; 109 case 0x1C: if( B1 == 0xA5 && B2 == 0xFF ) { *n = 3; x = QUEX_BOM_SCSU_W4_TO_FE80; } break; 110 case 0x1D: if( B1 == 0xA5 && B2 == 0xFF ) { *n = 3; x = QUEX_BOM_SCSU_W5_TO_FE80; } break; 111 case 0x1E: if( B1 == 0xA5 && B2 == 0xFF ) { *n = 3; x = QUEX_BOM_SCSU_W6_TO_FE80; } break; 112 case 0x1F: if( B1 == 0xA5 && B2 == 0xFF ) { *n = 3; x = QUEX_BOM_SCSU_W7_TO_FE80; } break; 113 case 0x2B: 114 /* In any case, the UTF7 BOM is not eaten. 115 * This is too complicated, since it uses a base64 code. It would require 116 * to re-order the whole stream. This shall do the converter (if he wants). */ 117 *n = 0; 118 if( B1 == 0x2F && B2 == 0x76 ) { 119 switch( B3 ) 120 { case 0x2B: case 0x2F: case 0x38: case 0x39: x = QUEX_BOM_UTF_7; } 121 } 122 break; 123 case 0x84: if( B1 == 0x31 && B2 == 0x95 && B3 == 0x33 ) { *n = 4; x = QUEX_BOM_GB_18030; } break; 124 case 0xDD: if( B1 == 0x73 && B2 == 0x66 && B3 == 0x73 ) { *n = 4; x = QUEX_BOM_UTF_EBCDIC; } break; 125 case 0xEF: if( B1 == 0xBB && B2 == 0xBF ) { *n = 3; x = QUEX_BOM_UTF_8; } break; 126 case 0xF7: if( B1 == 0x64 && B2 == 0x4C ) { *n = 3; x = QUEX_BOM_UTF_1; } break; 127 case 0xFB: 128 if( B1 == 0xEE && B2 == 0x28 ) { 129 if( B3 == 0xFF ) { *n = 4; x = QUEX_BOM_BOCU_1; } 130 else { *n = 3; x = QUEX_BOM_BOCU_1; } 131 } 132 break; 133 case 0xFE: 134 if( B1 == 0xFF ) { *n = 2; x = QUEX_BOM_UTF_16_BE; } break; 135 case 0xFF: 136 if( B1 == 0xFE ) { 137 if( B2 == 0x00 && B3 == 0x00 ) { *n = 4; x = QUEX_BOM_UTF_32_LE; } 138 else { *n = 2; x = QUEX_BOM_UTF_16_LE; } 139 } 140 break; 141 default: 142 *n = 0; 143 } 144 145 return x; 146 } 147 148 extern const char* 149 QUEXED_DEF(bom_name)(QUEX_TYPE_BOM BOM) 150 { 151 switch( BOM ) { 152 case QUEX_BOM_UTF_8: return "UTF_8"; 153 case QUEX_BOM_UTF_1: return "UTF_1"; 154 case QUEX_BOM_UTF_EBCDIC: return "UTF_EBCDIC"; 155 case QUEX_BOM_BOCU_1: return "BOCU_1"; 156 case QUEX_BOM_GB_18030: return "GB_18030"; 157 case QUEX_BOM_UTF_7: return "UTF_7"; 158 case QUEX_BOM_UTF_16: return "UTF_16"; 159 case QUEX_BOM_UTF_16_LE: return "UTF_16_LE"; 160 case QUEX_BOM_UTF_16_BE: return "UTF_16_BE"; 161 case QUEX_BOM_UTF_32: return "UTF_32"; 162 case QUEX_BOM_UTF_32_LE: return "UTF_32_LE"; 163 case QUEX_BOM_UTF_32_BE: return "UTF_32_BE"; 164 case QUEX_BOM_SCSU: return "SCSU"; 165 case QUEX_BOM_SCSU_TO_UCS: return "SCSU_TO_UCS"; 166 case QUEX_BOM_SCSU_W0_TO_FE80: return "SCSU_W0_TO_FE80"; 167 case QUEX_BOM_SCSU_W1_TO_FE80: return "SCSU_W1_TO_FE80"; 168 case QUEX_BOM_SCSU_W2_TO_FE80: return "SCSU_W2_TO_FE80"; 169 case QUEX_BOM_SCSU_W3_TO_FE80: return "SCSU_W3_TO_FE80"; 170 case QUEX_BOM_SCSU_W4_TO_FE80: return "SCSU_W4_TO_FE80"; 171 case QUEX_BOM_SCSU_W5_TO_FE80: return "SCSU_W5_TO_FE80"; 172 case QUEX_BOM_SCSU_W6_TO_FE80: return "SCSU_W6_TO_FE80"; 173 case QUEX_BOM_SCSU_W7_TO_FE80: return "SCSU_W7_TO_FE80"; 174 default: 175 case QUEX_BOM_NONE: return "NONE"; 176 } 177 } 178 179 QUEX_NAMESPACE_QUEX_CLOSE 180 181 #endif /* __QUEX_INCLUDE_GUARD__BOM_I */ 182 183