sheepy

build system (sheepy) and package manager (spm) for C
git clone https://spartatek.se/git/sheepy.git
Log | Files | Refs | README | LICENSE

from-utf8.i (7986B)


      1 /* -*- C++ -*- vim: set syntax=cpp:
      2  * PURPOSE: 
      3  *
      4  * Provide the implementation of character and string converter functions
      5  * FROM utf8 to utf8, utf16, utf32, char, and wchar_t.
      6  *
      7  * STEPS:
      8  *
      9  * (1) Include the implementation of the character converters from utf8 
     10  *     to utf8, utf16, utf32, char, and wchar_t.
     11  *
     12  *     Use: "character-converter/from-utf8.i"
     13  *             --> implementation for utf8
     14  *
     15  *          "../generator/character-converter-char-wchar_t.gi"
     16  *             --> route 'char' and 'wchar_t' conversion to
     17  *                 one of the converters defined before.
     18  *
     19  * (2) Generate the implementation of the string converters in terms
     20  *     of those character converters.
     21  *
     22  *     Use: "../generator/implementation-string-converters.gi"
     23  *
     24  *          which uses
     25  *
     26  *              "../generator/string-converter.gi"
     27  *
     28  *          to implement each string converter from the given 
     29  *          character converters. 
     30  *
     31  * All functions are placed in the analyzer's namespace.
     32  *
     33  * ACKNOWLEDGEMENT: Parts of the following utf8 conversion have been derived from 
     34  *                  segments of the utf8 conversion library of Alexey Vatchenko 
     35  *                  <av@bsdua.org>.    
     36  *
     37  * 2010 (C) Frank-Rene Schaefer; 
     38  * ABSOLUTELY NO WARRANTY                                                    */
     39 #if    ! defined(__QUEX_INCLUDE_GUARD__CONVERTER_HELPER__FROM_UTF8_I) \
     40     ||   defined(__QUEX_INCLUDE_GUARD__CONVERTER_HELPER__TMP_DISABLED)
     41 #if    ! defined(__QUEX_INCLUDE_GUARD__CONVERTER_HELPER__TMP_DISABLED)
     42 #        define  __QUEX_INCLUDE_GUARD__CONVERTER_HELPER__FROM_UTF8_I
     43 #endif
     44 
     45 #include "from-utf8"
     46 
     47 #define __QUEX_FROM       utf8
     48 #define __QUEX_FROM_TYPE  uint8_t
     49 
     50 QUEX_NAMESPACE_MAIN_OPEN
     51 
     52 /* (1) Implement the character converters utf8 to utf8, utf16, utf32.
     53  *     (Note, that character converters are generated into namespace 'quex'.)*/
     54 QUEX_INLINE void
     55 /* DrainEnd pointer is not returned, since the increment is always '1' */
     56 QUEX_CONVERTER_CHAR_DEF(utf8, utf8)(const uint8_t** input_pp, uint8_t** output_pp)
     57 {
     58     /* Just for comformity with other encodings: Do nothing but copying. */
     59     if( (**input_pp & (uint8_t)0x80) == (uint8_t)0 ) {
     60         *((*output_pp)++) = *(*input_pp)++;
     61     }
     62     else if( **input_pp < (uint8_t)0xE0 ) { 
     63         *((*output_pp)++) = *(*input_pp)++;
     64         *((*output_pp)++) = *(*input_pp)++;
     65     }
     66     else if( **input_pp < (uint8_t)0xF0 ) { 
     67         *((*output_pp)++) = *(*input_pp)++;
     68         *((*output_pp)++) = *(*input_pp)++; 
     69         *((*output_pp)++) = *(*input_pp)++;
     70     }
     71     else {
     72         *((*output_pp)++) = *(*input_pp)++; 
     73         *((*output_pp)++) = *(*input_pp)++; 
     74         *((*output_pp)++) = *(*input_pp)++; 
     75         *((*output_pp)++) = *(*input_pp)++;
     76     }
     77 }
     78 
     79 QUEX_INLINE void
     80 /* DrainEnd pointer is not returned, since the increment is always '1' */
     81 QUEX_CONVERTER_CHAR_DEF(utf8, utf16)(const uint8_t** input_pp, uint16_t** output_pp)
     82 {
     83     const uint8_t*  iterator = *input_pp;
     84     uint32_t        tmp = 0;
     85 
     86     if( (*iterator & (uint8_t)0x80) == (uint8_t)0 ) {
     87         /* Header: 0xxx.xxxx */
     88         **output_pp = (uint16_t)*(iterator++);
     89 
     90         ++(*output_pp);
     91     }
     92     else if( *iterator < (uint8_t)0xE0 ) { /* ... max: 1101.1111 --> 0xDF, next: 0xE0               */
     93         /*    110x.xxxx 10yy.yyyy 
     94          * => 0000.0xxx:xxyy.yyyy                                                          */
     95         **output_pp = (uint16_t)(( ((uint16_t)*(iterator++)) & (uint16_t)0x1F ) << 6);
     96         **output_pp = (uint16_t)((**output_pp) | (( ((uint16_t)*(iterator++)) & (uint16_t)0x3F )));
     97 
     98         ++(*output_pp);
     99     }
    100     else if( *iterator < (uint8_t)0xF0 ) { /* ... max: 1110.1111 --> 0xEF, next: 0xF0               */
    101         /*    1110.xxxx 10yy.yyyy 10zz.zzzz
    102          * => xxxx.yyyy:yyzz.zzzz                                                          */
    103         **output_pp = (uint16_t)(( ((uint16_t)*(iterator++)) & (uint16_t)0x0F ) << 12);
    104         **output_pp = (uint16_t)((**output_pp) | (( ((uint16_t)*(iterator++)) & (uint16_t)0x3F ) << 6)); 
    105         **output_pp = (uint16_t)((**output_pp) | (( ((uint16_t)*(iterator++)) & (uint16_t)0x3F )));
    106 
    107         ++(*output_pp);
    108     }
    109     else {
    110         /* Unicode standard defines only chars until 0x10ffff, so max(len(utf8char)) == 4.
    111          *
    112          * NO CHECK: if( *iterator < 0xF8 ) { ... max: 1111.0111 --> 0xF7, next: 0xF8 
    113          *
    114          *    1111.0uuu 10xx.xxxx 10yy.yyyy 10zz.zzzz
    115          * => 000u.uuxx:xxxx.yyyy:yyzz.zzzz                                                */
    116 
    117         /* It happens that the UTF8 domain with 4 bytes is >= 0x10000 which is the
    118          * starting domain for surrogates (i.e. what is mapped into 0xD800-0xE000         */
    119         tmp = (uint32_t)(      (((uint32_t)*(iterator++)) & (uint32_t)0x07 ) << 18); 
    120         tmp = (uint32_t)(tmp | (((uint32_t)*(iterator++)) & (uint32_t)0x3F ) << 12); 
    121         tmp = (uint32_t)(tmp | (((uint32_t)*(iterator++)) & (uint32_t)0x3F ) << 6); 
    122         tmp = (uint32_t)(tmp | (((uint32_t)*(iterator++)) & (uint32_t)0x3F ));
    123 
    124         tmp               = (uint32_t)(tmp - (uint32_t)0x10000);
    125         *((*output_pp)++) = (uint16_t)((tmp >> 10)             | (uint32_t)0xD800);
    126         *((*output_pp)++) = (uint16_t)((tmp & (uint32_t)0x3FF) | (uint32_t)0xDC00);
    127     }
    128     *input_pp = iterator;
    129 }
    130 
    131 QUEX_INLINE void
    132 /* DrainEnd pointer is not returned, since the increment is always '1' */
    133 QUEX_CONVERTER_CHAR_DEF(utf8, utf32)(const uint8_t** input_pp, uint32_t** output_pp)
    134 {
    135     const uint8_t*  iterator = *input_pp;
    136 
    137     if( (*iterator & (uint8_t)0x80) == (uint8_t)0 ) {
    138         /* Header: 0xxx.xxxx */
    139         **output_pp = (uint32_t)*(iterator++);
    140     }
    141     else if( *iterator < (uint8_t)0xE0 ) { /* ... max: 1101.1111 --> 0xDF, next: 0xE0               */
    142         /*    110x.xxxx 10yy.yyyy 
    143          * => 0000.0xxx:xxyy.yyyy                                                          */
    144         **output_pp = (                          ( ((uint32_t)*(iterator++)) & (uint32_t)0x1F ) << 6);
    145         **output_pp = (uint32_t)((**output_pp) | ( ((uint32_t)*(iterator++)) & (uint32_t)0x3F ));
    146     }
    147     else if( *iterator < (uint8_t)0xF0 ) { /* ... max: 1110.1111 --> 0xEF, next: 0xF0               */
    148         /*    1110.xxxx 10yy.yyyy 10zz.zzzz
    149          * => xxxx.yyyy:yyzz.zzzz                                                          */
    150         **output_pp = (                          ( ((uint32_t)*(iterator++)) & (uint32_t)0x0F ) << 12);
    151         **output_pp = (uint32_t)((**output_pp) | ( ((uint32_t)*(iterator++)) & (uint32_t)0x3F ) << 6); 
    152         **output_pp = (uint32_t)((**output_pp) | ( ((uint32_t)*(iterator++)) & (uint32_t)0x3F ));
    153     }
    154     else {
    155         /* Unicode standard defines only chars until 0x10ffff, so max(len(utf8char)) == 4.
    156          *
    157          * NO CHECK: if( *iterator < 0xF8 ) { ... max: 1111.0111 --> 0xF7, next: 0xF8 
    158          *
    159          *    1111.0uuu 10xx.xxxx 10yy.yyyy 10zz.zzzz
    160          * => 000u.uuxx:xxxx.yyyy:yyzz.zzzz                                                */
    161         **output_pp = (                          ( ((uint32_t)*(iterator++)) & (uint32_t)0x07 ) << 18); 
    162         **output_pp = (uint32_t)((**output_pp) | ( ((uint32_t)*(iterator++)) & (uint32_t)0x3F ) << 12); 
    163         **output_pp = (uint32_t)((**output_pp) | ( ((uint32_t)*(iterator++)) & (uint32_t)0x3F ) << 6); 
    164         **output_pp = (uint32_t)((**output_pp) | ( ((uint32_t)*(iterator++)) & (uint32_t)0x3F ));
    165     }
    166     ++(*output_pp);
    167     *input_pp = iterator;
    168 }
    169 
    170 
    171 /* (1b) Derive converters to char and wchar_t from the given set 
    172  *      of converters. (Generator uses __QUEX_FROM and __QUEX_FROM_TYPE)    */
    173 #include "character-converter-to-char-wchar_t.gi"
    174 
    175 /* (2) Generate string converters to utf8, utf16, utf32 based on the
    176  *     definitions of the character converters.                             */
    177 #include "implementations.gi"
    178 
    179 QUEX_NAMESPACE_MAIN_CLOSE
    180 
    181 #endif /* __QUEX_INCLUDE_GUARD__CONVERTER_HELPER__FROM_UTF8_I */