libsheepy

C lib for handling text files, strings and json like data structure with an object oriented system
git clone https://spartatek.se/git/libsheepy.git
Log | Files | Refs | README | LICENSE

ymlReader.c (16680B)


      1 
      2 #include "yaml_private.h"
      3 
      4 /*
      5  * Declarations.
      6  */
      7 
      8 static int
      9 yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
     10         size_t offset, int value);
     11 
     12 static int
     13 yaml_parser_update_raw_buffer(yaml_parser_t *parser);
     14 
     15 static int
     16 yaml_parser_determine_encoding(yaml_parser_t *parser);
     17 
     18 YAML_DECLARE(int)
     19 yaml_parser_update_buffer(yaml_parser_t *parser, size_t length);
     20 
     21 /*
     22  * Set the reader error and return 0.
     23  */
     24 
     25 static int
     26 yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
     27         size_t offset, int value)
     28 {
     29     parser->error = YAML_READER_ERROR;
     30     parser->problem = problem;
     31     parser->problem_offset = offset;
     32     parser->problem_value = value;
     33 
     34     return 0;
     35 }
     36 
     37 /*
     38  * Byte order marks.
     39  */
     40 
     41 #define BOM_UTF8    "\xef\xbb\xbf"
     42 #define BOM_UTF16LE "\xff\xfe"
     43 #define BOM_UTF16BE "\xfe\xff"
     44 
     45 /*
     46  * Determine the input stream encoding by checking the BOM symbol. If no BOM is
     47  * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
     48  */
     49 
     50 static int
     51 yaml_parser_determine_encoding(yaml_parser_t *parser)
     52 {
     53     /* Ensure that we had enough bytes in the raw buffer. */
     54 
     55     while (!parser->eof
     56             && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) {
     57         if (!yaml_parser_update_raw_buffer(parser)) {
     58             return 0;
     59         }
     60     }
     61 
     62     /* Determine the encoding. */
     63 
     64     if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
     65             && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) {
     66         parser->encoding = YAML_UTF16LE_ENCODING;
     67         parser->raw_buffer.pointer += 2;
     68         parser->offset += 2;
     69     }
     70     else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
     71             && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) {
     72         parser->encoding = YAML_UTF16BE_ENCODING;
     73         parser->raw_buffer.pointer += 2;
     74         parser->offset += 2;
     75     }
     76     else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3
     77             && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) {
     78         parser->encoding = YAML_UTF8_ENCODING;
     79         parser->raw_buffer.pointer += 3;
     80         parser->offset += 3;
     81     }
     82     else {
     83         parser->encoding = YAML_UTF8_ENCODING;
     84     }
     85 
     86     return 1;
     87 }
     88 
     89 /*
     90  * Update the raw buffer.
     91  */
     92 
     93 static int
     94 yaml_parser_update_raw_buffer(yaml_parser_t *parser)
     95 {
     96     size_t size_read = 0;
     97 
     98     /* Return if the raw buffer is full. */
     99 
    100     if (parser->raw_buffer.start == parser->raw_buffer.pointer
    101             && parser->raw_buffer.last == parser->raw_buffer.end)
    102         return 1;
    103 
    104     /* Return on EOF. */
    105 
    106     if (parser->eof) return 1;
    107 
    108     /* Move the remaining bytes in the raw buffer to the beginning. */
    109 
    110     if (parser->raw_buffer.start < parser->raw_buffer.pointer
    111             && parser->raw_buffer.pointer < parser->raw_buffer.last) {
    112         memmove(parser->raw_buffer.start, parser->raw_buffer.pointer,
    113                 parser->raw_buffer.last - parser->raw_buffer.pointer);
    114     }
    115     parser->raw_buffer.last -=
    116         parser->raw_buffer.pointer - parser->raw_buffer.start;
    117     parser->raw_buffer.pointer = parser->raw_buffer.start;
    118 
    119     /* Call the read handler to fill the buffer. */
    120 
    121     if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last,
    122                 parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) {
    123         return yaml_parser_set_reader_error(parser, "input error",
    124                 parser->offset, -1);
    125     }
    126     parser->raw_buffer.last += size_read;
    127     if (!size_read) {
    128         parser->eof = 1;
    129     }
    130 
    131     return 1;
    132 }
    133 
    134 /*
    135  * Ensure that the buffer contains at least `length` characters.
    136  * Return 1 on success, 0 on failure.
    137  *
    138  * The length is supposed to be significantly less that the buffer size.
    139  */
    140 
    141 YAML_DECLARE(int)
    142 yaml_parser_update_buffer(yaml_parser_t *parser, size_t length)
    143 {
    144     int first = 1;
    145 
    146     assert(parser->read_handler);   /* Read handler must be set. */
    147 
    148     /* If the EOF flag is set and the raw buffer is empty, do nothing. */
    149 
    150     if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last)
    151         return 1;
    152 
    153     /* Return if the buffer contains enough characters. */
    154 
    155     if (parser->unread >= length)
    156         return 1;
    157 
    158     /* Determine the input encoding if it is not known yet. */
    159 
    160     if (!parser->encoding) {
    161         if (!yaml_parser_determine_encoding(parser))
    162             return 0;
    163     }
    164 
    165     /* Move the unread characters to the beginning of the buffer. */
    166 
    167     if (parser->buffer.start < parser->buffer.pointer
    168             && parser->buffer.pointer < parser->buffer.last) {
    169         size_t size = parser->buffer.last - parser->buffer.pointer;
    170         memmove(parser->buffer.start, parser->buffer.pointer, size);
    171         parser->buffer.pointer = parser->buffer.start;
    172         parser->buffer.last = parser->buffer.start + size;
    173     }
    174     else if (parser->buffer.pointer == parser->buffer.last) {
    175         parser->buffer.pointer = parser->buffer.start;
    176         parser->buffer.last = parser->buffer.start;
    177     }
    178 
    179     /* Fill the buffer until it has enough characters. */
    180 
    181     while (parser->unread < length)
    182     {
    183         /* Fill the raw buffer if necessary. */
    184 
    185         if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) {
    186             if (!yaml_parser_update_raw_buffer(parser)) return 0;
    187         }
    188         first = 0;
    189 
    190         /* Decode the raw buffer. */
    191 
    192         while (parser->raw_buffer.pointer != parser->raw_buffer.last)
    193         {
    194             unsigned int value = 0, value2 = 0;
    195             int incomplete = 0;
    196             unsigned char octet;
    197             unsigned int width = 0;
    198             int low, high;
    199             size_t k;
    200             size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer;
    201 
    202             /* Decode the next character. */
    203 
    204             switch (parser->encoding)
    205             {
    206                 case YAML_UTF8_ENCODING:
    207 
    208                     /*
    209                      * Decode a UTF-8 character.  Check RFC 3629
    210                      * (http://www.ietf.org/rfc/rfc3629.txt) for more details.
    211                      *
    212                      * The following table (taken from the RFC) is used for
    213                      * decoding.
    214                      *
    215                      *    Char. number range |        UTF-8 octet sequence
    216                      *      (hexadecimal)    |              (binary)
    217                      *   --------------------+------------------------------------
    218                      *   0000 0000-0000 007F | 0xxxxxxx
    219                      *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
    220                      *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
    221                      *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    222                      *
    223                      * Additionally, the characters in the range 0xD800-0xDFFF
    224                      * are prohibited as they are reserved for use with UTF-16
    225                      * surrogate pairs.
    226                      */
    227 
    228                     /* Determine the length of the UTF-8 sequence. */
    229 
    230                     octet = parser->raw_buffer.pointer[0];
    231                     width = (octet & 0x80) == 0x00 ? 1 :
    232                             (octet & 0xE0) == 0xC0 ? 2 :
    233                             (octet & 0xF0) == 0xE0 ? 3 :
    234                             (octet & 0xF8) == 0xF0 ? 4 : 0;
    235 
    236                     /* Check if the leading octet is valid. */
    237 
    238                     if (!width)
    239                         return yaml_parser_set_reader_error(parser,
    240                                 "invalid leading UTF-8 octet",
    241                                 parser->offset, octet);
    242 
    243                     /* Check if the raw buffer contains an incomplete character. */
    244 
    245                     if (width > raw_unread) {
    246                         if (parser->eof) {
    247                             return yaml_parser_set_reader_error(parser,
    248                                     "incomplete UTF-8 octet sequence",
    249                                     parser->offset, -1);
    250                         }
    251                         incomplete = 1;
    252                         break;
    253                     }
    254 
    255                     /* Decode the leading octet. */
    256 
    257                     value = (octet & 0x80) == 0x00 ? octet & 0x7F :
    258                             (octet & 0xE0) == 0xC0 ? octet & 0x1F :
    259                             (octet & 0xF0) == 0xE0 ? octet & 0x0F :
    260                             (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
    261 
    262                     /* Check and decode the trailing octets. */
    263 
    264                     for (k = 1; k < width; k ++)
    265                     {
    266                         octet = parser->raw_buffer.pointer[k];
    267 
    268                         /* Check if the octet is valid. */
    269 
    270                         if ((octet & 0xC0) != 0x80)
    271                             return yaml_parser_set_reader_error(parser,
    272                                     "invalid trailing UTF-8 octet",
    273                                     parser->offset+k, octet);
    274 
    275                         /* Decode the octet. */
    276 
    277                         value = (value << 6) + (octet & 0x3F);
    278                     }
    279 
    280                     /* Check the length of the sequence against the value. */
    281 
    282                     if (!((width == 1) ||
    283                             (width == 2 && value >= 0x80) ||
    284                             (width == 3 && value >= 0x800) ||
    285                             (width == 4 && value >= 0x10000)))
    286                         return yaml_parser_set_reader_error(parser,
    287                                 "invalid length of a UTF-8 sequence",
    288                                 parser->offset, -1);
    289 
    290                     /* Check the range of the value. */
    291 
    292                     if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF)
    293                         return yaml_parser_set_reader_error(parser,
    294                                 "invalid Unicode character",
    295                                 parser->offset, value);
    296 
    297                     break;
    298 
    299                 case YAML_UTF16LE_ENCODING:
    300                 case YAML_UTF16BE_ENCODING:
    301 
    302                     low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1);
    303                     high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0);
    304 
    305                     /*
    306                      * The UTF-16 encoding is not as simple as one might
    307                      * naively think.  Check RFC 2781
    308                      * (http://www.ietf.org/rfc/rfc2781.txt).
    309                      *
    310                      * Normally, two subsequent bytes describe a Unicode
    311                      * character.  However a special technique (called a
    312                      * surrogate pair) is used for specifying character
    313                      * values larger than 0xFFFF.
    314                      *
    315                      * A surrogate pair consists of two pseudo-characters:
    316                      *      high surrogate area (0xD800-0xDBFF)
    317                      *      low surrogate area (0xDC00-0xDFFF)
    318                      *
    319                      * The following formulas are used for decoding
    320                      * and encoding characters using surrogate pairs:
    321                      *
    322                      *  U  = U' + 0x10000   (0x01 00 00 <= U <= 0x10 FF FF)
    323                      *  U' = yyyyyyyyyyxxxxxxxxxx   (0 <= U' <= 0x0F FF FF)
    324                      *  W1 = 110110yyyyyyyyyy
    325                      *  W2 = 110111xxxxxxxxxx
    326                      *
    327                      * where U is the character value, W1 is the high surrogate
    328                      * area, W2 is the low surrogate area.
    329                      */
    330 
    331                     /* Check for incomplete UTF-16 character. */
    332 
    333                     if (raw_unread < 2) {
    334                         if (parser->eof) {
    335                             return yaml_parser_set_reader_error(parser,
    336                                     "incomplete UTF-16 character",
    337                                     parser->offset, -1);
    338                         }
    339                         incomplete = 1;
    340                         break;
    341                     }
    342 
    343                     /* Get the character. */
    344 
    345                     value = parser->raw_buffer.pointer[low]
    346                         + (parser->raw_buffer.pointer[high] << 8);
    347 
    348                     /* Check for unexpected low surrogate area. */
    349 
    350                     if ((value & 0xFC00) == 0xDC00)
    351                         return yaml_parser_set_reader_error(parser,
    352                                 "unexpected low surrogate area",
    353                                 parser->offset, value);
    354 
    355                     /* Check for a high surrogate area. */
    356 
    357                     if ((value & 0xFC00) == 0xD800) {
    358 
    359                         width = 4;
    360 
    361                         /* Check for incomplete surrogate pair. */
    362 
    363                         if (raw_unread < 4) {
    364                             if (parser->eof) {
    365                                 return yaml_parser_set_reader_error(parser,
    366                                         "incomplete UTF-16 surrogate pair",
    367                                         parser->offset, -1);
    368                             }
    369                             incomplete = 1;
    370                             break;
    371                         }
    372 
    373                         /* Get the next character. */
    374 
    375                         value2 = parser->raw_buffer.pointer[low+2]
    376                             + (parser->raw_buffer.pointer[high+2] << 8);
    377 
    378                         /* Check for a low surrogate area. */
    379 
    380                         if ((value2 & 0xFC00) != 0xDC00)
    381                             return yaml_parser_set_reader_error(parser,
    382                                     "expected low surrogate area",
    383                                     parser->offset+2, value2);
    384 
    385                         /* Generate the value of the surrogate pair. */
    386 
    387                         value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
    388                     }
    389 
    390                     else {
    391                         width = 2;
    392                     }
    393 
    394                     break;
    395 
    396                 default:
    397                     assert(1);      /* Impossible. */
    398             }
    399 
    400             /* Check if the raw buffer contains enough bytes to form a character. */
    401 
    402             if (incomplete) break;
    403 
    404             /*
    405              * Check if the character is in the allowed range:
    406              *      #x9 | #xA | #xD | [#x20-#x7E]               (8 bit)
    407              *      | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD]    (16 bit)
    408              *      | [#x10000-#x10FFFF]                        (32 bit)
    409              */
    410 
    411             if (! (value == 0x09 || value == 0x0A || value == 0x0D
    412                         || (value >= 0x20 && value <= 0x7E)
    413                         || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF)
    414                         || (value >= 0xE000 && value <= 0xFFFD)
    415                         || (value >= 0x10000 && value <= 0x10FFFF)))
    416                 return yaml_parser_set_reader_error(parser,
    417                         "control characters are not allowed",
    418                         parser->offset, value);
    419 
    420             /* Move the raw pointers. */
    421 
    422             parser->raw_buffer.pointer += width;
    423             parser->offset += width;
    424 
    425             /* Finally put the character into the buffer. */
    426 
    427             /* 0000 0000-0000 007F -> 0xxxxxxx */
    428             if (value <= 0x7F) {
    429                 *(parser->buffer.last++) = value;
    430             }
    431             /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
    432             else if (value <= 0x7FF) {
    433                 *(parser->buffer.last++) = 0xC0 + (value >> 6);
    434                 *(parser->buffer.last++) = 0x80 + (value & 0x3F);
    435             }
    436             /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
    437             else if (value <= 0xFFFF) {
    438                 *(parser->buffer.last++) = 0xE0 + (value >> 12);
    439                 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
    440                 *(parser->buffer.last++) = 0x80 + (value & 0x3F);
    441             }
    442             /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    443             else {
    444                 *(parser->buffer.last++) = 0xF0 + (value >> 18);
    445                 *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F);
    446                 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
    447                 *(parser->buffer.last++) = 0x80 + (value & 0x3F);
    448             }
    449 
    450             parser->unread ++;
    451         }
    452 
    453         /* On EOF, put NUL into the buffer and return. */
    454 
    455         if (parser->eof) {
    456             *(parser->buffer.last++) = '\0';
    457             parser->unread ++;
    458             return 1;
    459         }
    460 
    461     }
    462 
    463     if (parser->offset >= MAX_FILE_SIZE) {
    464         return yaml_parser_set_reader_error(parser, "input is too long",
    465             parser->offset, -1);
    466     }
    467 
    468     return 1;
    469 }