mercury/runtime/mercury_string.c

/*
** vim: ts=4 sw=4 expandtab
*/
/*
** Copyright (C) 2000-2002, 2006, 2011-2012 The University of Melbourne.
** This file may only be copied under the terms of the GNU Library General
** Public License - see the file COPYING.LIB in the Mercury distribution.
*/

/* mercury_string.c - string handling */

#include "mercury_imp.h"
#include "mercury_string.h"

#if defined(MR_HAVE__VSNPRINTF) && ! defined(MR_HAVE_VSNPRINTF)
  #define vsnprintf _vsnprintf
#endif

#if defined(MR_HAVE_VSNPRINTF) || defined(MR_HAVE__VSNPRINTF)
  #define MR_HAVE_A_VSNPRINTF
#endif

#define BUFFER_SIZE 4096

MR_String
MR_make_string(MR_AllocSiteInfoPtr alloc_id, const char *fmt, ...)
{
    va_list     ap;
    MR_String   result;
    int         n;
    char        *p;

#ifdef MR_HAVE_A_VSNPRINTF
    int         size = BUFFER_SIZE;
    char        fixed[BUFFER_SIZE];
    MR_bool     dynamically_allocated = MR_FALSE;

    /*
    ** On the first iteration we try with a fixed-size buffer.
    ** If that didn't work, use a dynamically allocated array twice
    ** the size of the fixed array and keep growing the array until
    ** the string fits.
    */
    p = fixed;

    while (1) {
        /* Try to print in the allocated space. */
        va_start(ap, fmt);
        n = vsnprintf(p, size, fmt, ap);
        va_end(ap);

        /* If that worked, return the string.  */
        if (n > -1 && n < size) {
            break;
        }

        /* Else try again with more space.  */
        if (n > -1) {   /* glibc 2.1 */
            size = n + 1; /* precisely what is needed */
        } else {        /* glibc 2.0 */
            size *= 2;  /* twice the old size */
        }

        if (!dynamically_allocated) {
            p = MR_NEW_ARRAY(char, size);
            dynamically_allocated = MR_TRUE;
        } else {
            p = MR_RESIZE_ARRAY(p, char, size);
        }
    }

#else
    /*
    ** It is possible for this buffer to overflow,
    ** and then bad things may happen.
    */
    char fixed[40960];

    va_start(ap, fmt);
    n = vsprintf(fixed, fmt, ap);
    va_end(ap);

    p = fixed;
#endif
    MR_restore_transient_hp();
    MR_allocate_aligned_string_msg(result, strlen(p), alloc_id);
    MR_save_transient_hp();
    strcpy(result, p);

#ifdef MR_HAVE_A_VSNPRINTF
    if (dynamically_allocated) {
        MR_free(p);
    }
#endif

    return result;
}

/*
**  Note that MR_hash_string{,2,3} are actually defined as macros in
**  mercury_string.h, if we're using GNU C.
**  We define them here whether or not we're using gcc, so that users
**  can easily switch between gcc and cc without rebuilding the libraries.
*/

#undef MR_hash_string
#undef MR_hash_string2
#undef MR_hash_string3

MR_Integer
MR_hash_string(MR_ConstString s)
{
    MR_HASH_STRING_FUNC_BODY
}

MR_Integer
MR_hash_string2(MR_ConstString s)
{
    MR_HASH_STRING2_FUNC_BODY
}

MR_Integer
MR_hash_string3(MR_ConstString s)
{
    MR_HASH_STRING3_FUNC_BODY
}

MR_bool
MR_utf8_next(const MR_String s_, MR_Integer *pos)
{
    const unsigned char *s = (const unsigned char *)s_;
    int c;

    if (s[*pos] == '\0') {
        /* End of string. */
        return MR_FALSE;
    }

    for (;;) {
        ++(*pos);
        c = s[*pos];
        if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
            break;
        }
    }

    return MR_TRUE;
}

MR_bool
MR_utf8_prev(const MR_String s_, MR_Integer *pos)
{
    const unsigned char *s = (const unsigned char *)s_;
    int c;

    while (*pos > 0) {
        (*pos)--;
        c = s[*pos];
        if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
            return MR_TRUE;
        }
    }

    return MR_FALSE;
}

MR_int_least32_t
MR_utf8_get(const MR_String s_, MR_Integer pos)
{
    const unsigned char *s = (const unsigned char *)s_;
    int c;
    int width;

    c = s[pos];
    if (MR_is_ascii(c)) {
        return c;
    } else {
        return MR_utf8_get_mb(s_, pos, &width);
    }
}

MR_int_least32_t
MR_utf8_get_mb(const MR_String s_, MR_Integer pos, int *width)
{
    const unsigned char *s = (const unsigned char *)s_;
    int c;
    int d;
    int minc;

    c = s[pos];

    /* c <= 0x7f (ASCII) must be handled before calling this function. */

    if (c <= 0xC1) {
        /* Trailing byte of multi-byte sequence or an overlong encoding for
         * code point <= 127.
         */
        return -2;
    }

    if (c <= 0xDF) {
        /* 2-byte sequence. */
        c &= 0x1F;
        *width = 2;
        minc = 0x80;
    }
    else if (c <= 0xEF) {
        /* 3-byte sequence. */
        c &= 0x0F;
        *width = 3;
        minc = 0x800;
    }
    else if (c <= 0xF4) {
        /* 4-byte sequence. */
        c &= 0x07;
        *width = 4;
        minc = 0x10000;
    }
    else {
        /* Otherwise invalid. */
        return -2;
    }

    switch (*width) {
        case 4:
            d = s[++pos];
            if (!MR_utf8_is_trail_byte(d)) {
                return -2;
            }
            c = (c << 6) | (d & 0x3F);
            /* fall through */
        case 3:
            d = s[++pos];
            if (!MR_utf8_is_trail_byte(d)) {
                return -2;
            }
            c = (c << 6) | (d & 0x3F);
            /* fall through */
        case 2:
            d = s[++pos];
            if (!MR_utf8_is_trail_byte(d)) {
                return -2;
            }
            c = (c << 6) | (d & 0x3F);
            break;
    }

    /* Check for overlong forms, which could be used to bypass security
     * validations. We could also check code points aren't above U+10FFFF or in
     * the surrogate ranges, but we don't.
     */

    if (c < minc) {
        return -2;
    }

    return c;
}

MR_int_least32_t
MR_utf8_get_next(const MR_String s, MR_Integer *pos)
{
    int c;

    c = s[*pos];
    if (MR_is_ascii(c)) {
        (*pos)++;
        return c;
    }

    return MR_utf8_get_next_mb(s, pos);
}

MR_int_least32_t
MR_utf8_get_next_mb(const MR_String s, MR_Integer *pos)
{
    int c, width;

    c = MR_utf8_get_mb(s, *pos, &width);
    if (c >= 0) {
        /* Multibyte code point. */
        (*pos) += width;
        return c;
    }

    /* Some invalid byte sequence. */
    MR_utf8_next(s, pos);
    return c;
}

MR_int_least32_t
MR_utf8_prev_get(const MR_String s, MR_Integer *pos)
{
    int c, width;

    if (MR_utf8_prev(s, pos)) {
        c = s[*pos];
        if (MR_is_ascii(c)) {
            return c;
        } else {
            return MR_utf8_get_mb(s, *pos, &width);
        }
    }

    /* Past beginning. */
    return -1;
}

size_t
MR_utf8_width(MR_Char c)
{
    /* So we don't need to check for negative values nor use unsigned ints
     * in the interface, which are a pain.
     */
    MR_UnsignedChar uc = c;

    if (uc <= 0x7f) {
        return 1;
    }
    if (uc <= 0x7ff) {
        return 2;
    }
    if (uc <= 0xffff) {
        return (MR_is_surrogate(uc)) ? 0 : 3;
    }
    if (uc <= 0x10ffff) {
        return 4;
    }
    /* The rest are illegal. */
    return 0;
}

size_t
MR_utf8_encode(char s_[], MR_Char c)
{
    unsigned char *s = (unsigned char *)s_;
    MR_UnsignedChar uc = c;

    if (uc <= 0x7f) {
        s[0] = uc;
        return 1;
    }

    if (uc <= 0x7ff) {
        s[0] = 0xC0 | ((uc >> 6) & 0x1F);
        s[1] = 0x80 |  (uc       & 0x3F);
        return 2;
    }

    if (uc <= 0xffff) {
        if (MR_is_surrogate(uc)) {
            return 0;
        }
        s[0] = 0xE0 | ((uc >> 12) & 0x0F);
        s[1] = 0x80 | ((uc >>  6) & 0x3F);
        s[2] = 0x80 |  (uc        & 0x3F);
        return 3;
    }

    if (uc <= 0x10ffff) {
        s[0] = 0xF0 | ((uc >> 18) & 0x07);
        s[1] = 0x80 | ((uc >> 12) & 0x3F);
        s[2] = 0x80 | ((uc >>  6) & 0x3F);
        s[3] = 0x80 |  (uc        & 0x3F);
        return 4;
    }

    /* Otherwise is illegal. */
    return 0;
}

MR_bool
MR_utf8_verify(const MR_String s)
{
    MR_Integer pos = 0;

    for (;;) {
        MR_int_least32_t c;

        c = MR_utf8_get_next(s, &pos);
        if (c == 0) {
            return MR_TRUE;
        }
        if (c < 0) {
            return MR_FALSE;
        }
    }
}