sheepy

build system (sheepy) and package manager (spm) for C
git clone https://spartatek.se/git/sheepy.git
Log | Files | Refs | README | LICENSE

bom.i (8224B)


      1 /* This file contains an implementation which can potentially be shared between
      2  * multiple different lexical analyzers. See 'multi.i' for further info.     */
      3 
      4 /* -*- C++ -*- vim:set syntax=cpp: 
      5  *
      6  * Byte Order Mark (BOM) Handling.
      7  *
      8  * The byte order mark (BOM) is a Unicode character used to signal 
      9  * the endianness (byte order) of a text file or stream. Its code 
     10  * point is U+FEFF. 
     11  * [Source: <http://en.wikipedia.org/wiki/Byte_order_mark>]
     12  *
     13  * This file implements a function to cut the BOM and tell about 
     14  * the encoding of the data stream.
     15  *
     16  * (C) 2010 Frank-Rene Schaefer    
     17  * ABSOLUTELY NO WARRANTY                                                    */
     18 
     19 #ifndef __QUEX_INCLUDE_GUARD__BOM_I
     20 #define __QUEX_INCLUDE_GUARD__BOM_I
     21 
     22 #include "bom"
     23 
     24 QUEX_NAMESPACE_QUEX_OPEN
     25 
     26 extern QUEX_TYPE_BOM
     27 QUEXED_DEF(bom_snap)(__QUEX_STD_FILE* InputHandle)
     28 /* This function can **only** be used with **normally** behaving streams
     29  * where the position increases by one with every character being read. If
     30  * this is not the case then use the **binary** option of your stream.       */
     31 {
     32     uint8_t        buffer[4] = { 0, 0, 0, 0};
     33     QUEX_TYPE_BOM  result    = QUEX_BOM_NONE;
     34     size_t         byte_n    = 0;
     35     size_t         read_n    = 0;
     36     long           p0        = __QUEX_STD_ftell(InputHandle);
     37 
     38     read_n = (size_t)__QUEX_STD_fread((uint8_t*)buffer, 1, 4, InputHandle);
     39     result = QUEXED_DEF(__bom_snap_core)(buffer, read_n, &byte_n);
     40 
     41     /* Avoid temporary function argument. Store sum in p0. */
     42     p0 += (long)byte_n;
     43     __QUEX_STD_fseek(InputHandle, p0, SEEK_SET);
     44 
     45     return result;
     46 }
     47 
     48 #if ! defined(__QUEX_OPTION_PLAIN_C)
     49 template <class InputStream> extern QUEX_TYPE_BOM
     50 QUEXED_DEF(bom_snap)(InputStream* p_input_stream)
     51 /* This function can **only** be used with **normally** behaving streams
     52  * where the position increases by one with every character being read. If
     53  * this is not the case then use the **binary** option of your stream.     */
     54 {
     55     uint8_t               buffer[4] = { 0, 0, 0, 0};
     56     QUEX_TYPE_BOM         result    = QUEX_BOM_NONE;
     57     size_t                byte_n    = 0;
     58     size_t                read_n    = 0;
     59     /**/
     60     const size_t                   CharSize  = sizeof(typename InputStream::char_type);
     61     typename InputStream::pos_type p0 = p_input_stream->tellg() * CharSize;
     62 
     63     p_input_stream->read((typename InputStream::char_type*)buffer, 4 / CharSize);
     64     read_n = (size_t)(p_input_stream->gcount());
     65     result = QUEXED_DEF(__bom_snap_core)(buffer, read_n, &byte_n);
     66 
     67     /* Avoid temporary function argument. Store sum in p0. */
     68     p0 += typename InputStream::pos_type(byte_n);
     69     p_input_stream->seekg(p0 / CharSize); 
     70 
     71     return result;
     72 }
     73 #endif
     74 
     75 extern QUEX_TYPE_BOM
     76 QUEXED_DEF(__bom_snap_core)(uint8_t buffer[4], size_t read_n, size_t* byte_n)
     77 {
     78     /* For non-existing bytes fill 0x77, because it does not occur
     79      * anywhere as a criteria, see 'switch' after that.             */
     80     switch( read_n ) {
     81         case 0: return QUEX_BOM_NONE;
     82         case 1: buffer[1] = 0x77; buffer[2] = 0x77; buffer[3] = 0x77; break; 
     83         case 2:                   buffer[2] = 0x77; buffer[3] = 0x77; break;
     84         case 3:                                     buffer[3] = 0x77; break;
     85     }
     86 
     87     return QUEXED_DEF(bom_identify)(buffer, byte_n);
     88 }
     89 
     90 extern QUEX_TYPE_BOM
     91 QUEXED_DEF(bom_identify)(const uint8_t* const Buffer, size_t* n)
     92     /* Assume, that the buffer contains at least 4 elements!                 */
     93 {
     94     /* Table of byte order marks (BOMs), see file 'quex/code_base/bom'       */
     95     const uint8_t B0 = Buffer[0];
     96     const uint8_t B1 = Buffer[1];
     97     const uint8_t B2 = Buffer[2];
     98     const uint8_t B3 = Buffer[3];
     99     QUEX_TYPE_BOM  x = QUEX_BOM_NONE;
    100 
    101     switch( B0 ) {
    102     case 0x00: if( B1 == 0x00 && B2 == 0xFE && B3 == 0xFF ) { *n = 4; x = QUEX_BOM_UTF_32_BE;        } break; 
    103     case 0x0E: if( B1 == 0xFE && B2 == 0xFF )               { *n = 3; x = QUEX_BOM_SCSU;            } break;
    104     case 0x0F: if( B1 == 0xFE && B2 == 0xFF )               { *n = 3; x = QUEX_BOM_SCSU_TO_UCS;     } break; 
    105     case 0x18: if( B1 == 0xA5 && B2 == 0xFF )               { *n = 3; x = QUEX_BOM_SCSU_W0_TO_FE80; } break; 
    106     case 0x19: if( B1 == 0xA5 && B2 == 0xFF )               { *n = 3; x = QUEX_BOM_SCSU_W1_TO_FE80; } break; 
    107     case 0x1A: if( B1 == 0xA5 && B2 == 0xFF )               { *n = 3; x = QUEX_BOM_SCSU_W2_TO_FE80; } break; 
    108     case 0x1B: if( B1 == 0xA5 && B2 == 0xFF )               { *n = 3; x = QUEX_BOM_SCSU_W3_TO_FE80; } break; 
    109     case 0x1C: if( B1 == 0xA5 && B2 == 0xFF )               { *n = 3; x = QUEX_BOM_SCSU_W4_TO_FE80; } break; 
    110     case 0x1D: if( B1 == 0xA5 && B2 == 0xFF )               { *n = 3; x = QUEX_BOM_SCSU_W5_TO_FE80; } break; 
    111     case 0x1E: if( B1 == 0xA5 && B2 == 0xFF )               { *n = 3; x = QUEX_BOM_SCSU_W6_TO_FE80; } break; 
    112     case 0x1F: if( B1 == 0xA5 && B2 == 0xFF )               { *n = 3; x = QUEX_BOM_SCSU_W7_TO_FE80; } break; 
    113     case 0x2B: 
    114            /* In any case, the UTF7 BOM is not eaten. 
    115             * This is too complicated, since it uses a base64 code. It would require
    116             * to re-order the whole stream. This shall do the converter (if he wants). */
    117            *n = 0;
    118            if( B1 == 0x2F && B2 == 0x76 ) {
    119                switch( B3 ) 
    120                { case 0x2B: case 0x2F: case 0x38: case 0x39: x = QUEX_BOM_UTF_7; } 
    121            }
    122            break;
    123     case 0x84: if( B1 == 0x31 && B2 == 0x95 && B3 == 0x33 ) { *n = 4; x = QUEX_BOM_GB_18030;   } break;
    124     case 0xDD: if( B1 == 0x73 && B2 == 0x66 && B3 == 0x73 ) { *n = 4; x = QUEX_BOM_UTF_EBCDIC; } break;
    125     case 0xEF: if( B1 == 0xBB && B2 == 0xBF )               { *n = 3; x = QUEX_BOM_UTF_8;      } break;
    126     case 0xF7: if( B1 == 0x64 && B2 == 0x4C )               { *n = 3; x = QUEX_BOM_UTF_1;      } break;
    127     case 0xFB: 
    128            if( B1 == 0xEE && B2 == 0x28 ) {
    129                if( B3 == 0xFF )  { *n = 4; x = QUEX_BOM_BOCU_1; } 
    130                else              { *n = 3; x = QUEX_BOM_BOCU_1; }
    131            }
    132            break;
    133     case 0xFE: 
    134            if( B1 == 0xFF ) { *n = 2; x = QUEX_BOM_UTF_16_BE; } break;
    135     case 0xFF: 
    136            if( B1 == 0xFE ) {
    137                if( B2 == 0x00 && B3 == 0x00 ) { *n = 4; x = QUEX_BOM_UTF_32_LE; }
    138                else                           { *n = 2; x = QUEX_BOM_UTF_16_LE; } 
    139            }
    140            break;
    141     default: 
    142            *n = 0;
    143     }
    144 
    145     return x;
    146 }           
    147 
    148 extern const char*
    149 QUEXED_DEF(bom_name)(QUEX_TYPE_BOM BOM)
    150 {
    151     switch( BOM ) {
    152     case QUEX_BOM_UTF_8:           return "UTF_8";                      
    153     case QUEX_BOM_UTF_1:           return "UTF_1";                      
    154     case QUEX_BOM_UTF_EBCDIC:      return "UTF_EBCDIC";            
    155     case QUEX_BOM_BOCU_1:          return "BOCU_1";                    
    156     case QUEX_BOM_GB_18030:        return "GB_18030";                
    157     case QUEX_BOM_UTF_7:           return "UTF_7";                      
    158     case QUEX_BOM_UTF_16:          return "UTF_16";                                  
    159     case QUEX_BOM_UTF_16_LE:       return "UTF_16_LE";              
    160     case QUEX_BOM_UTF_16_BE:       return "UTF_16_BE";              
    161     case QUEX_BOM_UTF_32:          return "UTF_32";                    
    162     case QUEX_BOM_UTF_32_LE:       return "UTF_32_LE";              
    163     case QUEX_BOM_UTF_32_BE:       return "UTF_32_BE";              
    164     case QUEX_BOM_SCSU:            return "SCSU";                        
    165     case QUEX_BOM_SCSU_TO_UCS:     return "SCSU_TO_UCS";          
    166     case QUEX_BOM_SCSU_W0_TO_FE80: return "SCSU_W0_TO_FE80";  
    167     case QUEX_BOM_SCSU_W1_TO_FE80: return "SCSU_W1_TO_FE80";  
    168     case QUEX_BOM_SCSU_W2_TO_FE80: return "SCSU_W2_TO_FE80";  
    169     case QUEX_BOM_SCSU_W3_TO_FE80: return "SCSU_W3_TO_FE80";  
    170     case QUEX_BOM_SCSU_W4_TO_FE80: return "SCSU_W4_TO_FE80";  
    171     case QUEX_BOM_SCSU_W5_TO_FE80: return "SCSU_W5_TO_FE80";  
    172     case QUEX_BOM_SCSU_W6_TO_FE80: return "SCSU_W6_TO_FE80";  
    173     case QUEX_BOM_SCSU_W7_TO_FE80: return "SCSU_W7_TO_FE80";  
    174     default:
    175     case QUEX_BOM_NONE:            return "NONE";                        
    176     }
    177 }
    178 
    179 QUEX_NAMESPACE_QUEX_CLOSE
    180 
    181 #endif /* __QUEX_INCLUDE_GUARD__BOM_I */
    182 
    183