Mercurial > embed
diff libjansson/src/utf.c @ 82:3c05ebba5cb8
uriparser: removal, closes #1716
jansson: removal, closes #1715
author | David Demelier <markand@malikania.fr> |
---|---|
date | Thu, 25 Jul 2019 21:03:12 +0200 |
parents | 7fdaf1411e6f |
children | 7937ccab4d14 |
line wrap: on
line diff
--- a/libjansson/src/utf.c Thu Jul 25 20:30:00 2019 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,187 +0,0 @@ -/* - * Copyright (c) 2009-2014 Petri Lehtinen <petri@digip.org> - * - * Jansson is free software; you can redistribute it and/or modify - * it under the terms of the MIT license. See LICENSE for details. - */ - -#include <string.h> -#include "utf.h" - -int utf8_encode(int32_t codepoint, char *buffer, size_t *size) -{ - if(codepoint < 0) - return -1; - else if(codepoint < 0x80) - { - buffer[0] = (char)codepoint; - *size = 1; - } - else if(codepoint < 0x800) - { - buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6); - buffer[1] = 0x80 + ((codepoint & 0x03F)); - *size = 2; - } - else if(codepoint < 0x10000) - { - buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12); - buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6); - buffer[2] = 0x80 + ((codepoint & 0x003F)); - *size = 3; - } - else if(codepoint <= 0x10FFFF) - { - buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18); - buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12); - buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6); - buffer[3] = 0x80 + ((codepoint & 0x00003F)); - *size = 4; - } - else - return -1; - - return 0; -} - -size_t utf8_check_first(char byte) -{ - unsigned char u = (unsigned char)byte; - - if(u < 0x80) - return 1; - - if(0x80 <= u && u <= 0xBF) { - /* second, third or fourth byte of a multi-byte - sequence, i.e. a "continuation byte" */ - return 0; - } - else if(u == 0xC0 || u == 0xC1) { - /* overlong encoding of an ASCII byte */ - return 0; - } - else if(0xC2 <= u && u <= 0xDF) { - /* 2-byte sequence */ - return 2; - } - - else if(0xE0 <= u && u <= 0xEF) { - /* 3-byte sequence */ - return 3; - } - else if(0xF0 <= u && u <= 0xF4) { - /* 4-byte sequence */ - return 4; - } - else { /* u >= 0xF5 */ - /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid - UTF-8 */ - return 0; - } -} - -size_t utf8_check_full(const char *buffer, size_t size, int32_t *codepoint) -{ - size_t i; - int32_t value = 0; - unsigned char u = (unsigned char)buffer[0]; - - if(size == 2) - { - value = u & 0x1F; - } - else if(size == 3) - { - value = u & 0xF; - } - else if(size == 4) - { - value = u & 0x7; - } - else - return 0; - - for(i = 1; i < size; i++) - { - u = (unsigned char)buffer[i]; - - if(u < 0x80 || u > 0xBF) { - /* not a continuation byte */ - return 0; - } - - value = (value << 6) + (u & 0x3F); - } - - if(value > 0x10FFFF) { - /* not in Unicode range */ - return 0; - } - - else if(0xD800 <= value && value <= 0xDFFF) { - /* invalid code point (UTF-16 surrogate halves) */ - return 0; - } - - else if((size == 2 && value < 0x80) || - (size == 3 && value < 0x800) || - (size == 4 && value < 0x10000)) { - /* overlong encoding */ - return 0; - } - - if(codepoint) - *codepoint = value; - - return 1; -} - -const char *utf8_iterate(const char *buffer, size_t bufsize, int32_t *codepoint) -{ - size_t count; - int32_t value; - - if(!bufsize) - return buffer; - - count = utf8_check_first(buffer[0]); - if(count <= 0) - return NULL; - - if(count == 1) - value = (unsigned char)buffer[0]; - else - { - if(count > bufsize || !utf8_check_full(buffer, count, &value)) - return NULL; - } - - if(codepoint) - *codepoint = value; - - return buffer + count; -} - -int utf8_check_string(const char *string, size_t length) -{ - size_t i; - - for(i = 0; i < length; i++) - { - size_t count = utf8_check_first(string[i]); - if(count == 0) - return 0; - else if(count > 1) - { - if(count > length - i) - return 0; - - if(!utf8_check_full(&string[i], count, NULL)) - return 0; - - i += count - 1; - } - } - - return 1; -}