view generator/make-unicode/src/mkunicode.c @ 2:84765c6f4872

New style
author David Demelier <markand@malikania.fr>
date Thu, 02 Feb 2017 18:07:27 +0100
parents f94206b2e05e
children d9d3406c1250
line wrap: on
line source

/*
 * Tool to create our unicode.cpp and unicode.h files.
 *
 * Current version: 7.0.0
 *
 * Based on mkrunetype from the Go language.
 *
 * Adapted to generate C++ code.
 */

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

/*
 * make is(upper|lower|title|space|alpha)rune and
 * to(upper|lower|title)rune from a UnicodeData.txt file.
 * these can be found at unicode.org
 *
 * with -c, runs a check of the existing runetype functions vs.
 * those extracted from UnicodeData.
 *
 * with -p, generates tables for pairs of chars, as well as for ranges
 * and singletons.
 *
 * UnicodeData defines 4 fields of interest:
 * 1) a category
 * 2) an upper case mapping
 * 3) a lower case mapping
 * 4) a title case mapping
 *
 * toupper, tolower, and totitle are defined directly from the mapping.
 *
 * isalpharune(c) is true iff c is a "letter" category
 * isupperrune(c) is true iff c is the target of toupperrune,
 *  or is in the uppercase letter category
 * similarly for islowerrune and istitlerune.
 * isspacerune is true for space category chars, "C" locale white space chars,
 *  and two additions:
 *  0085    "next line" control char
 *  feff    "zero-width non-break space"
 * isdigitrune is true iff c is a numeric-digit category.
 */

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "utf.h"
#include "utfdef.h"

#define nelem(x) (sizeof(x)/sizeof((x)[0]))

enum {
    /*
     * fields in the unicode data file
     */
    FIELD_CODE,
    FIELD_NAME,
    FIELD_CATEGORY,
    FIELD_COMBINING,
    FIELD_BIDIR,
    FIELD_DECOMP,
    FIELD_DECIMAL_DIG,
    FIELD_DIG,
    FIELD_NUMERIC_VAL,
    FIELD_MIRRORED,
    FIELD_UNICODE_1_NAME,
    FIELD_COMMENT,
    FIELD_UPPER,
    FIELD_LOWER,
    FIELD_TITLE,
    NFIELDS,

    MAX_LINE    = 1024,

    TO_OFFSET   = 1 << 20,

    NRUNES      = 1 << 21,
};

/*
 * Difference between a mapped code point and its source, biased by TO_OFFSET.
 * The generated to<label>() functions subtract TO_OFFSET again when they
 * apply a table entry.
 */
#define TO_DELTA(xmapped,x) (TO_OFFSET + (xmapped) - (x))

/* stream the generated source is written to; opened in main() */
static FILE *out;

/* one flag per code point, non-zero when the property holds */
static char myisspace[NRUNES];
static char myisalpha[NRUNES];
static char myisdigit[NRUNES];
static char myisupper[NRUNES];
static char myislower[NRUNES];
static char myistitle[NRUNES];

/* one target per code point; main() starts them as the identity mapping */
static int  mytoupper[NRUNES];
static int  mytolower[NRUNES];
static int  mytotitle[NRUNES];

/* NOTE(review): check() is declared here but no definition is visible in this file */
static void check(void);
static void mktables(char *src, int usepairs);
static void fatal(const char *fmt, ...);
static int  mygetfields(char **fields, int nfields, char *str, const char *delim);
static int  getunicodeline(FILE *in, char **fields, char *buf);
static int  getcode(char *s);

/*
 * Print the command-line synopsis on stderr and exit with status 1.
 */
static void
usage(void)
{
    /*
     * main() takes exactly two positional arguments and parses no
     * options, so none are advertised here.
     */
    fprintf(stderr, "usage: mkunicode output UnicodeData.txt\n");
    exit(1);
}

/*
 * Read the unicode data file, fill the property and case-mapping tables,
 * then write the generated source.
 *
 * The first command-line argument names the output file and the second the
 * UnicodeData.txt input; any other argument count prints the usage message.
 * Malformed input and I/O failures are reported through fatal(), which
 * exits with status 1.  Exits with status 0 on success.
 */
int
main(int argc, char *argv[])
{
    FILE *in;
    char buf[MAX_LINE], buf2[MAX_LINE];
    char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
    char *p;
    int i, code, last, upper, lower, title, usepairs;

    /* no option is parsed, so the pair tables are never requested */
    usepairs = 0;

    --argc;
    ++argv;

    if(argc != 2){
        usage();
    }

    out = fopen(argv[0], "w");
    if(out == NULL){
        fatal("can't open %s", argv[0]);
    }

    in = fopen(argv[1], "r");
    if(in == NULL){
        fatal("can't open %s", argv[1]);
    }

    /* every code point maps to itself until the data says otherwise */
    for(i = 0; i < NRUNES; i++){
        mytoupper[i] = i;
        mytolower[i] = i;
        mytotitle[i] = i;
    }

    /*
     * make sure isspace has all of the "C" locale whitespace chars
     */
    myisspace['\t'] = 1;
    myisspace['\n'] = 1;
    myisspace['\r'] = 1;
    myisspace['\f'] = 1;
    myisspace['\v'] = 1;

    /*
     * a couple of other exceptions
     */
    myisspace[0x85] = 1;    /* control char, "next line" */
    myisspace[0xfeff] = 1;  /* zero-width non-break space */

    last = -1;
    while(getunicodeline(in, fields, buf)){
        code = getcode(fields[FIELD_CODE]);
        if(code >= NRUNES)
            fatal("code-point value too big: %x", code);
        if(code <= last)
            fatal("bad code sequence: %x then %x", last, code);
        last = code;

        /*
         * check for ranges
         */
        p = fields[FIELD_CATEGORY];
        if(strstr(fields[FIELD_NAME], ", First>") != NULL){
            if(!getunicodeline(in, fields2, buf2))
                fatal("range start at eof");
            if(strstr(fields2[FIELD_NAME], ", Last>") == NULL)
                fatal("range start not followed by range end");
            last = getcode(fields2[FIELD_CODE]);
            /* the range end indexes the tables too, so it needs the same bound */
            if(last >= NRUNES)
                fatal("code-point value too big: %x", last);
            if(last <= code)
                fatal("range out of sequence: %x then %x", code, last);
            if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
                fatal("range with mismatched category");
        }

        /*
         * parse the mappings once per record; -1 means "no mapping".
         * the targets are used as table indices further down, so they
         * must lie inside the tables as well.
         */
        upper = -1;
        lower = -1;
        title = -1;
        if(fields[FIELD_UPPER][0] != '\0'){
            upper = getcode(fields[FIELD_UPPER]);
            if(upper < 0 || upper >= NRUNES)
                fatal("upper case mapping too big: %x", upper);
        }
        if(fields[FIELD_LOWER][0] != '\0'){
            lower = getcode(fields[FIELD_LOWER]);
            if(lower < 0 || lower >= NRUNES)
                fatal("lower case mapping too big: %x", lower);
        }
        if(fields[FIELD_TITLE][0] != '\0'){
            title = getcode(fields[FIELD_TITLE]);
            if(title < 0 || title >= NRUNES)
                fatal("title case mapping too big: %x", title);
        }

        /*
         * set properties and conversions
         */
        for(; code <= last; code++){
            if(p[0] == 'L')
                myisalpha[code] = 1;
            if(p[0] == 'Z')
                myisspace[code] = 1;

            if(strcmp(p, "Lu") == 0)
                myisupper[code] = 1;
            if(strcmp(p, "Ll") == 0)
                myislower[code] = 1;

            if(strcmp(p, "Lt") == 0)
                myistitle[code] = 1;

            if(strcmp(p, "Nd") == 0)
                myisdigit[code] = 1;

            /*
             * when finding conversions, also need to mark
             * upper/lower case, since some chars, like
             * "III" (0x2162), aren't defined as letters but have a
             * lower case mapping ("iii" (0x2172)).
             */
            if(upper >= 0)
                mytoupper[code] = upper;
            if(lower >= 0)
                mytolower[code] = lower;
            if(title >= 0)
                mytotitle[code] = title;
        }
    }

    fclose(in);

    /*
     * check for codes with no totitle mapping but a toupper mapping.
     * these appear in UnicodeData-2.0.14.txt, but are almost certainly
     * erroneous.
     */
    for(i = 0; i < NRUNES; i++){
        if(mytotitle[i] == i
        && mytoupper[i] != i
        && !myistitle[i])
            fprintf(stderr, "warning: code=%.4x not istitle, totitle is same, toupper=%.4x\n", i, mytoupper[i]);
    }

    /*
     * make sure isupper[c] is true if for some x toupper[x]  == c
     * ditto for islower and istitle
     */
    for(i = 0; i < NRUNES; i++){
        if(mytoupper[i] != i)
            myisupper[mytoupper[i]] = 1;
        if(mytolower[i] != i)
            myislower[mytolower[i]] = 1;
        if(mytotitle[i] != i)
            myistitle[mytotitle[i]] = 1;
    }

    mktables(argv[0], usepairs);

    /* a truncated output file must not pass for a successful run */
    if(ferror(out) || fclose(out) != 0)
        fatal("can't write %s", argv[0]);

    exit(0);
}

/*
 * Write the is<label>r table: one "first, last" pair for every run of
 * consecutive code points whose flag is set in prop.
 *
 * Runs of a single code point are only written when force is non-zero.
 * The flags of every code point written to the table are cleared.
 * Returns non-zero when a table was written.
 */
static int
mkisrange(const char* label, char* prop, int force)
{
    int first, end, emitted;

    emitted = 0;
    first = 0;
    while (first < NRUNES) {
        if (prop[first] == 0) {
            first++;
            continue;
        }

        /* walk to the end of the run, clearing every flag after the first */
        end = first + 1;
        while (end < NRUNES && prop[end] != 0) {
            prop[end] = 0;
            end++;
        }

        if (force || end - first > 1) {
            if (emitted == 0) {
                fputs("namespace {\n\n", out);
                fprintf(out, "const char32_t is%sr[] = {\n", label);
                emitted = 1;
            }
            prop[first] = 0;
            fprintf(out, "    0x%.4x, 0x%.4x,\n", first, end - 1);
        }

        first = end;
    }

    if (emitted != 0) {
        fputs("};\n\n", out);
        fputs("} // !namespace\n\n", out);
    }

    return emitted;
}

/*
 * Write the is<label>p table: one "first, last" pair for every run of
 * flagged code points that are two apart.
 *
 * Only runs of at least two code points are written; their flags are
 * cleared.  Returns non-zero when a table was written.
 */
static int
mkispair(const char *label, char *prop)
{
    int first, next, emitted;

    emitted = 0;
    first = 0;
    while (first + 2 < NRUNES) {
        if (prop[first] == 0) {
            first++;
            continue;
        }

        /* follow the stride-2 run, clearing every flag after the first */
        next = first + 2;
        while (next < NRUNES && prop[next] != 0) {
            prop[next] = 0;
            next += 2;
        }

        if (next - first > 2) {
            if (emitted == 0) {
                fputs("namespace {\n\n", out);
                fprintf(out, "const char32_t is%sp[] = {\n", label);
                emitted = 1;
            }
            prop[first] = 0;
            fprintf(out, "    0x%.4x, 0x%.4x,\n", first, next - 2);
        }

        /* the scan resumes where the run stopped, whether it was written or not */
        first = next;
    }

    if (emitted != 0) {
        fputs("};\n\n", out);
        fputs("} // !namespace\n\n", out);
    }

    return emitted;
}

/*
 * Write the is<label>s table: one entry for every code point whose flag is
 * still set in prop, clearing the flag as it goes.
 *
 * Returns non-zero when a table was written.
 */
static int
mkissingle(const char *label, char *prop)
{
    int cp, emitted;

    emitted = 0;
    for (cp = 0; cp < NRUNES; cp++) {
        if (prop[cp] != 0) {
            if (emitted == 0) {
                fputs("namespace {\n\n", out);
                fprintf(out, "const char32_t is%ss[] = {\n", label);
                emitted = 1;
            }
            prop[cp] = 0;
            fprintf(out, "    0x%.4x,\n", cp);
        }
    }

    if (emitted != 0) {
        fputs("};\n\n", out);
        fputs("} // !namespace\n\n", out);
    }

    return emitted;
}

/*
 * Write the tables for one property followed by the is<label>() function
 * that searches them.
 *
 * The range table is built first, then the pair table when usepairs is
 * non-zero, then the singleton table for whatever is left in prop.  The
 * generated function only searches the tables that were actually written.
 */
static void
mkis(const char* label, char* prop, int usepairs)
{
    int haveranges, havepairs, havesingles;

    /* the order matters: each step clears the flags it has covered */
    haveranges = mkisrange(label, prop, 0);
    havepairs = usepairs ? mkispair(label, prop) : 0;
    havesingles = mkissingle(label, prop);

    fprintf(out,
        "bool is%s(char32_t c) noexcept\n"
        "{\n"
        "    const char32_t* p;\n"
        "\n",
        label);

    if (haveranges) {
        fprintf(out,
            "    p = rbsearch(c, is%sr, nelem (is%sr) / 2, 2);\n\n"
            "    if (p && c >= p[0] && c <= p[1]) {\n"
            "        return true;\n"
            "    }\n",
            label, label);
    }

    if (havepairs) {
        fprintf(out,
            "\n"
            "    p = rbsearch(c, is%sp, nelem (is%sp) / 2, 2);\n\n"
            "    if (p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1)) {\n"
            "        return true;\n"
            "    }\n",
            label, label);
    }

    if (havesingles) {
        fprintf(out,
            "\n"
            "    p = rbsearch(c, is%ss, nelem (is%ss), 1);\n\n"
            "    if (p && c == p[0]) {\n"
            "        return true;\n"
            "    }\n",
            label, label);
    }

    fputs(
        "\n"
        "    return false;\n"
        "}\n"
        "\n",
        out);
}

/*
 * generate a mapping array for ranges, clearing those entries covered.
 * if force, generate one-entry ranges for singletons.
 *
 * Each table row is "first, last, delta", where delta is the TO_DELTA()
 * value shared by every code point of the run.  Entries written to the
 * table are reset to the identity mapping in map.
 * Returns non-zero when a table was written.
 */
static int
mktorange(const char* label, int* map, int force)
{
    int start, stop, delta, some;

    some = 0;
    for(start = 0; start < NRUNES; ) {
        if(map[start] == start){
            start++;
            continue;
        }

        delta = TO_DELTA(map[start], start);
        if(delta != (Rune)delta)
            fatal("bad map delta %d", delta);

        /* extend the run while the delta stays the same */
        for(stop = start + 1; stop < NRUNES; stop++){
            if(TO_DELTA(map[stop], stop) != delta){
                break;
            }
            map[stop] = stop;
        }

        /* honour force, as the header comment promises and mkisrange does */
        if(force || stop != start + 1){
            if(!some){
                fprintf(out, "namespace {\n\n");
                fprintf(out, "const char32_t to%sr[] = {\n", label);
                some = 1;
            }
            map[start] = start;
            fprintf(out, "    0x%.4x, 0x%.4x, %d,\n", start, stop - 1, delta);
        }

        start = stop;
    }
    if(some) {
        fprintf(out, "};\n\n");
        fprintf(out, "} // !namespace\n\n");
    }

    return some;
}

/*
 * generate a mapping array for pairs with a skip between,
 * clearing those entries covered.
 */
static int
mktopair(const char* label, int* map)
{
    int start, stop, delta, some;

    some = 0;
    for(start = 0; start + 2 < NRUNES; ) {
        if(map[start] == start){
            start++;
            continue;
        }

        delta = TO_DELTA(map[start], start);
        if(delta != (Rune)delta)
            fatal("bad map delta %d", delta);
        for(stop = start + 2; stop < NRUNES; stop += 2){
            if(TO_DELTA(map[stop], stop) != delta){
                break;
            }
            map[stop] = stop;
        }
        if(stop != start + 2){
            if(!some){
                fprintf(out, "namespace {\n\n");
                fprintf(out, "const char32_t to%sp[] = {\n", label);
                some = 1;
            }
            map[start] = start;
            fprintf(out, "    0x%.4x, 0x%.4x, %d,\n", start, stop - 2, delta);
        }

        start = stop;
    }
    if(some) {
        fprintf(out, "};\n\n");
        fprintf(out, "} // !namespace\n\n");
    }

    return some;
}

/*
 * generate a mapping array for singletons, clearing those entries covered.
 */
static int
mktosingle(const char* label, int* map)
{
    int start, delta, some;

    some = 0;
    for(start = 0; start < NRUNES; start++) {
        if(map[start] == start){
            continue;
        }

        delta = TO_DELTA(map[start], start);
        if(delta != (Rune)delta)
            fatal("bad map delta %d", delta);
        if(!some){
            fprintf(out, "namespace {\n\n");
            fprintf(out, "const char32_t to%ss[] = {\n", label);
            some = 1;
        }
        map[start] = start;
        fprintf(out, "    0x%.4x, %d,\n", start, delta);
    }
    if(some) {
        fprintf(out, "};\n\n");
        fprintf(out, "} // !namespace\n\n");
    }

    return some;
}

/*
 * Write the tables for one case mapping followed by the to<label>()
 * function that searches them.
 *
 * The range table is built first, then the pair table when usepairs is
 * non-zero, then the singleton table for whatever is left in map.  The
 * generated function only searches the tables that were actually written
 * and removes the TO_OFFSET bias from the stored delta.
 */
static void
mkto(const char* label, int* map, int usepairs)
{
    int haveranges, havepairs, havesingles;

    /* the order matters: each step resets the entries it has covered */
    haveranges = mktorange(label, map, 0);
    havepairs = usepairs ? mktopair(label, map) : 0;
    havesingles = mktosingle(label, map);

    fprintf(out,
        "char32_t to%s(char32_t c) noexcept\n"
        "{\n"
        "    const char32_t* p;\n"
        "\n",
        label);

    if (haveranges) {
        fprintf(out,
            "    p = rbsearch(c, to%sr, nelem (to%sr) / 3, 3);\n\n"
            "    if (p && c >= p[0] && c <= p[1]) {\n"
            "        return c + p[2] - %d;\n"
            "    }\n",
            label, label, TO_OFFSET);
    }

    if (havepairs) {
        fprintf(out,
            "\n"
            "    p = rbsearch(c, to%sp, nelem (to%sp) / 3, 3);\n\n"
            "    if (p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1)) {\n"
            "        return c + p[2] - %d;\n"
            "    }\n",
            label, label, TO_OFFSET);
    }

    if (havesingles) {
        fprintf(out,
            "\n"
            "    p = rbsearch(c, to%ss, nelem (to%ss) / 2, 2);\n\n"
            "    if (p && c == p[0]) {\n"
            "        return c + p[1] - %d;\n"
            "    }\n\n",
            label, label, TO_OFFSET);
    }

    fputs(
        "    return c;\n"
        "}\n"
        "\n",
        out);
}

/*
 * Make only a range table and a function for is<label>.
 *
 * Single code points are forced into the range table as one-entry ranges,
 * so no pair or singleton table is needed.  When no code point has the
 * property, no table is written and the generated function simply returns
 * false instead of referring to a table that does not exist.
 */
static void
mkisronly(const char* label, char* prop)
{
    if(!mkisrange(label, prop, 1)){
        /* the parameter is left unnamed because the body never reads it */
        fprintf(out,
            "bool is%s(char32_t) noexcept\n"
            "{\n"
            "    return false;\n"
            "}\n"
            "\n",
            label);
        return;
    }

    fprintf(out,
        "bool is%s(char32_t c) noexcept\n"
        "{\n"
        "    const char32_t* p;\n"
        "\n"
        "    p = rbsearch(c, is%sr, nelem (is%sr) / 2, 2);\n\n"
        "    if (p && c >= p[0] && c <= p[1]) {\n"
        "        return true;\n"
        "    }\n\n"
        "    return false;\n"
        "}\n"
        "\n",
            label, label, label);
}

/*
 * Write the whole body of the generated source to out: the nelem macro,
 * the rbsearch() helper the generated functions rely on, then the tables
 * and functions for every property and case mapping.
 *
 * src is not used.  usepairs is passed on for the upper/lower/title
 * properties and for all three case mappings.
 */
static void
mktables(char *src, int usepairs)
{
    (void)src;

    /* the generated lookups size their tables with nelem */
    fputs(
        "#define nelem(x) (sizeof (x) / sizeof ((x)[0]))\n\n",
        out);

    /* binary search over rows of ne values, keyed on the first value of a row */
    fputs(
        "namespace {\n\n"
        "const char32_t *rbsearch(char32_t c, const char32_t* t, int n, int ne) noexcept\n"
        "{\n"
        "    const char32_t* p;\n"
        "    int m;\n\n"
        "    while (n > 1) {\n"
        "        m = n >> 1;\n"
        "        p = t + m * ne;\n\n"
        "        if (c >= p[0]) {\n"
        "            t = p;\n"
        "            n = n - m;\n"
        "        } else {\n"
        "            n = m;\n"
        "        }\n"
        "    }\n\n"
        "    if (n && c >= t[0]) {\n"
        "        return t;\n"
        "    }\n\n"
        "    return nullptr;\n"
        "}\n\n"
        "} // !namespace\n\n",
        out);

    /*
     * space and digit are special-cased: they are assumed to be small
     * with several ranges, so they only get a range table.
     */
    mkisronly("space", myisspace);
    mkisronly("digit", myisdigit);

    /* alpha never gets a pair table, whatever usepairs says */
    mkis("alpha", myisalpha, 0);
    mkis("upper", myisupper, usepairs);
    mkis("lower", myislower, usepairs);
    mkis("title", myistitle, usepairs);

    mkto("upper", mytoupper, usepairs);
    mkto("lower", mytolower, usepairs);
    mkto("title", mytotitle, usepairs);
}

/*
 * Split str in place into at most nfields fields.
 *
 * Every character of str that appears in delim is overwritten with a NUL and
 * ends a field; fields[] receives a pointer to the start of each field.
 * Splitting stops once nfields pointers have been stored, so the last field
 * keeps any remaining delimiters.  fields[0] is always set to str.
 *
 * Returns the number of pointers stored in fields.
 */
static int
mygetfields(char **fields, int nfields, char *str, const char *delim)
{
    char *cursor;
    int count;

    fields[0] = str;
    count = 1;
    if (count >= nfields) {
        return count;
    }

    cursor = str;
    while ((cursor = strpbrk(cursor, delim)) != NULL) {
        /* cut the field here and start the next one just after the cut */
        *cursor++ = '\0';
        fields[count++] = cursor;
        if (count >= nfields) {
            break;
        }
    }

    return count;
}

/*
 * Read the next line of in into buf and split it into its fields.
 *
 * buf must hold MAX_LINE bytes and fields must have room for NFIELDS + 1
 * pointers; the pointers stored in fields point into buf.  The line ending
 * (LF or CRLF) is removed before splitting, and a last line that ends at
 * end of file without a newline is accepted.
 *
 * Returns 1 when a line was read and 0 when fgets() returns NULL.
 * A line that does not fit in buf, or that does not have exactly NFIELDS
 * fields, is reported through fatal().
 */
static int
getunicodeline(FILE *in, char **fields, char *buf)
{
    char *p;

    if(fgets(buf, MAX_LINE, in) == NULL)
        return 0;

    p = strchr(buf, '\n');
    if(p != NULL){
        *p = '\0';
        /* a CR left over from a CRLF ending would corrupt the last field */
        if(p > buf && p[-1] == '\r')
            p[-1] = '\0';
    } else if(!feof(in)){
        /* fgets() stopped because buf is full, not because the input ended */
        fatal("line too long");
    }

    if(mygetfields(fields, NFIELDS + 1, buf, ";") != NFIELDS)
        fatal("bad number of fields");

    return 1;
}

static int
getcode(char *s)
{
    int i, code;

    code = 0;
    i = 0;
    /* Parse a hex number */
    while(s[i]) {
        code <<= 4;
        if(s[i] >= '0' && s[i] <= '9')
            code += s[i] - '0';
        else if(s[i] >= 'A' && s[i] <= 'F')
            code += s[i] - 'A' + 10;
        else
            fatal("bad code char '%c'", s[i]);
        i++;
    }
    return code;
}

/*
 * Report a fatal error and terminate the program.
 *
 * Writes "mkunicode: fatal error: ", the printf-style message built from fmt
 * and the following arguments, and a newline to stderr, then exits with
 * status 1.  Never returns.
 */
static void
fatal(const char *fmt, ...)
{
    va_list ap;

    fputs("mkunicode: fatal error: ", stderr);

    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);

    fputc('\n', stderr);

    exit(1);
}