bom (3963B)
1 /* -*- C++ -*- vim:set syntax=cpp: 2 * 3 * Byte Order Mark (BOM) Handling. 4 * 5 * The byte order mark (BOM) is a Unicode character used to signal 6 * the endianness (byte order) of a text file or stream. Its code 7 * point is U+FEFF. 8 * [Source: <http://en.wikipedia.org/wiki/Byte_order_mark>] 9 * 10 * This file implements a function to cut the BOM and tell about 11 * the encoding of the data stream. 12 * 13 * (C) 2010 Frank-Rene Schaefer 14 15 * ABSOLUTELY NO WARRANTY */ 16 #ifndef __QUEX_INCLUDE_GUARD__BOM 17 #define __QUEX_INCLUDE_GUARD__BOM 18 19 #include "definitions" 20 21 typedef enum { 22 QUEX_BOM_NONE = 0x200, /* D9 --> NONE/NOT SURE */ 23 QUEX_BOM_UTF_8 = 0x001, /* D0 --> UTF 8 */ 24 QUEX_BOM_UTF_1 = 0x002, /* D1 --> UTF 1 */ 25 QUEX_BOM_UTF_EBCDIC = 0x004, /* D2 --> UTF EBCDIC */ 26 QUEX_BOM_BOCU_1 = 0x008, /* D3 --> BOCU 1 */ 27 QUEX_BOM_GB_18030 = 0x010, /* D4 --> GB_18030 */ 28 QUEX_BOM_UTF_7 = 0x220, /* D5 --> UTF 7; 29 * D9 --> May be not. */ 30 QUEX_BOM_UTF_16 = 0x040, /* D6 --> UTF 16 */ 31 QUEX_BOM_UTF_16_LE = 0x041, 32 QUEX_BOM_UTF_16_BE = 0x042, 33 QUEX_BOM_UTF_32 = 0x080, /* D7 --> UTF 32 */ 34 QUEX_BOM_UTF_32_LE = 0x081, 35 QUEX_BOM_UTF_32_BE = 0x082, 36 QUEX_BOM_SCSU = 0x100, /* D8 --> SCSU */ 37 QUEX_BOM_SCSU_TO_UCS = 0x101, 38 QUEX_BOM_SCSU_W0_TO_FE80 = 0x102, 39 QUEX_BOM_SCSU_W1_TO_FE80 = 0x103, 40 QUEX_BOM_SCSU_W2_TO_FE80 = 0x104, 41 QUEX_BOM_SCSU_W3_TO_FE80 = 0x105, 42 QUEX_BOM_SCSU_W4_TO_FE80 = 0x106, 43 QUEX_BOM_SCSU_W5_TO_FE80 = 0x107, 44 QUEX_BOM_SCSU_W6_TO_FE80 = 0x108, 45 QUEX_BOM_SCSU_W7_TO_FE80 = 0x109 46 } QUEX_TYPE_BOM; 47 48 /* Table of (known) BOMs _____________________________________________________ 49 * 50 * BOM_UTF_8 { 0xEF, 0xBB, 0xBF } 51 * UTF_16_BE { 0xFE, 0xFF } 52 * UTF_16_LE { 0xFF, 0xFE } 53 * UTF_32_BE { 0x00, 0x00, 0xFE, 0xFF } 54 * UTF_32_LE { 0xFF, 0xFE, 0x00, 0x00 } 55 * UTF_7_38 { 0x2B, 0x2F, 0x76, 0x38 } 56 * UTF_7_39 { 0x2B, 0x2F, 0x76, 0x39 } 57 * UTF_7_2B { 0x2B, 0x2F, 0x76, 0x2B } 58 * UTF_7_2F { 0x2B, 0x2F, 0x76, 0x2F } 59 * UTF_1 { 0xF7, 0x64, 0x4C } 60 * UTF_EBCDIC { 0xDD, 0x73, 0x66, 0x73 } 61 * SCSU { 0x0E, 0xFE, 0xFF } 62 * SCSU_TO_UCS { 0x0F, 0xFE, 0xFF } 63 * SCSU_W0_TO_FE80 { 0x18, 0xA5, 0xFF } 64 * SCSU_W1_TO_FE80 { 0x19, 0xA5, 0xFF } 65 * SCSU_W2_TO_FE80 { 0x1A, 0xA5, 0xFF } 66 * SCSU_W3_TO_FE80 { 0x1B, 0xA5, 0xFF } 67 * SCSU_W4_TO_FE80 { 0x1C, 0xA5, 0xFF } 68 * SCSU_W5_TO_FE80 { 0x1D, 0xA5, 0xFF } 69 * SCSU_W6_TO_FE80 { 0x1E, 0xA5, 0xFF } 70 * SCSU_W7_TO_FE80 { 0x1F, 0xA5, 0xFF } 71 * BOCU_1_x { 0xFB, 0xEE, 0x28, 0xFF } 72 * BOCU_1 { 0xFB, 0xEE, 0x28, } 73 * GB_18030 { 0x84, 0x31, 0x95, 0x33 } 74 *_____________________________________________________________________________*/ 75 76 QUEX_NAMESPACE_QUEX_OPEN 77 78 extern QUEX_TYPE_BOM 79 QUEXED_DEF(bom_snap)(__QUEX_STD_FILE* InputHandle); 80 81 #if ! defined(__QUEX_OPTION_PLAIN_C) 82 template <class InputStream> QUEX_INLINE QUEX_TYPE_BOM 83 QUEXED_DEF(bom_snap)(InputStream* p_input_stream); 84 #endif 85 86 extern QUEX_TYPE_BOM 87 QUEXED_DEF(__bom_snap_core)(uint8_t buffer[4], size_t read_n, size_t* byte_n); 88 89 extern QUEX_TYPE_BOM 90 QUEXED_DEF(bom_identify)(const uint8_t* const Buffer, size_t* n); 91 92 extern const char* 93 QUEXED_DEF(bom_name)(QUEX_TYPE_BOM BOM); 94 95 QUEX_NAMESPACE_QUEX_CLOSE 96 97 #endif /* __QUEX_INCLUDE_GUARD__BOM */ 98