mercury/runtime/mercury_string.h

// vim: ts=4 sw=4 expandtab ft=c

// Copyright (C) 1995-2005, 2007, 2011-2012 The University of Melbourne.
// Copyright (C) 2015-2016, 2018-2019, 2021, 2023-2024 The Mercury team.
// This file is distributed under the terms specified in COPYING.LIB.

// mercury_string.h - string handling

#ifndef MERCURY_STRING_H
#define MERCURY_STRING_H

#include "mercury_heap.h"       // for MR_offset_incr_hp_atomic

#include <string.h>             // for strcmp() etc.
#include <stdarg.h>
#include <stdio.h>
#if defined(MR_WIN32)
   #include <wchar.h>
#endif

// On Windows, snprintf/vsnprintf may be synonyms for _snprintf/_vsnprintf and
// thus not conform to the C99 specification. Since it is next to impossible to
// tell at compile time which implementation we are getting, just define a
// wrapper over _vsnprintf if it exists.
// Beginning with the UCRT in Visual Studio 2015 and Windows 10, snprintf and
// vsnprintf are C99 standard compliant so we may be able to drop this code
// eventually.

#if defined(MR_HAVE__VSNPRINTF)
    extern int
    MR_vsnprintf(char *str, size_t size, const char *format, va_list ap);
#elif defined(MR_HAVE_VSNPRINTF)
    #define MR_vsnprintf vsnprintf
#else
    #error "Missing both vsnprintf and _vsnprintf"
#endif

#if defined(MR_HAVE__SNPRINTF)
    extern int MR_snprintf(char *str, size_t size, const char *format, ...);
#elif defined(MR_HAVE_SNPRINTF)
    #define MR_snprintf snprintf
#else
    #error "Missing both snprintf and _snprintf"
#endif

// Mercury characters (Unicode code points) are given type `MR_Char',
// which is a typedef for `MR_int_least32_t'.
// Mercury strings are stored as pointers to '\0'-terminated arrays of `char'.
// Strings are UTF-8 encoded.
// Mercury strings must not contain null characters. Unexpected null characters
// are a source of security vulnerabilities.
//
// The actual typedefs are in mercury_types.h to avoid problems with
// circular #includes.
//
// typedef MR_int_least32_t     MR_Char;
// typedef MR_uint_least32_t    MR_UnsignedChar;
//
// typedef char                 *MR_String;
// typedef const char           *MR_ConstString;

// MR_string_const("...", len):
//
// Given a C string literal and its length, returns a Mercury string.

#define MR_string_const(string, len) ((MR_String) string)

#define MR_make_string_const(string)                                    \
                MR_string_const((string), sizeof(string) - 1)

// MR_bool MR_string_equal(MR_ConstString s1, MR_ConstString s2):
// Return true iff the two Mercury strings s1 and s2 are equal.

#define MR_string_equal(s1,s2) (strcmp((char*)(s1),(char*)(s2))==0)

// void MR_make_aligned_string(MR_String ptr, const char *string):
//
// Given a C string `string', set `ptr' to be a Mercury string with the
// same contents. (`ptr' must be an lvalue.) If the resulting Mercury string
// is to be used by Mercury code, then the string pointed to by `string'
// should have been either statically allocated or allocated on the
// Mercury heap.
//
// BEWARE: this may modify `MR_hp', so it must only be called from places
// where `MR_hp' is valid. If calling it from inside a C function, rather than
// inside Mercury code, you may need to call MR_{save/restore}_transient_hp().
//
// Algorithm: if the string is aligned, just set ptr equal to it.
// Otherwise, allocate space on the heap and copy the C string to
// the Mercury string.

#define MR_make_aligned_string(ptr, string)                             \
    do {                                                                \
        if (MR_tag((MR_Word) (string)) != 0) {                          \
            MR_make_aligned_string_copy((ptr), (string));               \
        } else {                                                        \
            /* The cast is there to cast away const, if needed. */      \
            (ptr) = (MR_String) (string);                               \
        }                                                               \
    } while (0)

// void MR_make_aligned_string_copy(MR_String ptr, const char * string):
//
// Same as MR_make_aligned_string(ptr, string), except that the string
// is guaranteed to be copied. This is useful for copying C strings
// onto the Mercury heap.
//
// BEWARE: this may modify `MR_hp', so it must only be called from places
// where `MR_hp' is valid. If calling it from inside a C function, rather than
// inside Mercury code, you may need to call MR_{save/restore}_transient_hp().

#define MR_make_aligned_string_copy(ptr, string)                        \
    MR_make_aligned_string_copy_msg((ptr), (string), NULL)

#define MR_make_aligned_string_copy_msg(ptr, string, alloc_id)          \
    do {                                                                \
        MR_Word make_aligned_string_tmp;                                \
        char    *make_aligned_string_ptr;                               \
                                                                        \
        MR_offset_incr_hp_atomic_msg(make_aligned_string_tmp, 0,        \
            (strlen(string) + sizeof(MR_Word)) / sizeof(MR_Word),       \
            (alloc_id), "string.string/0");                             \
        make_aligned_string_ptr = (char *) make_aligned_string_tmp;     \
        strcpy(make_aligned_string_ptr, (string));                      \
        (ptr) = make_aligned_string_ptr;                                \
    } while (0)

// void MR_make_aligned_string_copy_saved_hp(MR_String ptr,
//          const char * string):
//
// Same as MR_make_aligned_string_copy(ptr, string), except that it uses
// MR_offset_incr_saved_hp_atomic instead of MR_offset_incr_hp_atomic.

#define MR_make_aligned_string_copy_saved_hp(ptr, string, alloc_id)     \
    do {                                                                \
        MR_Word make_aligned_string_tmp;                                \
        char    *make_aligned_string_ptr;                               \
                                                                        \
        MR_offset_incr_saved_hp_atomic(make_aligned_string_tmp, 0,      \
            (strlen(string) + sizeof(MR_Word)) / sizeof(MR_Word),       \
            (alloc_id), "string.string/0");                             \
        make_aligned_string_ptr = (char *) make_aligned_string_tmp;     \
        strcpy(make_aligned_string_ptr, (string));                      \
        (ptr) = make_aligned_string_ptr;                                \
    } while (0)

// void MR_make_aligned_string_copy_saved_hp_quote(MR_String ptr,
//          const char * string):
//
// Same as MR_make_aligned_string_copy_saved_hp(ptr, string), except that
// it puts double quote marks at the start and end of the string.

#define MR_make_aligned_string_copy_saved_hp_quote(ptr, string, alloc_id) \
    do {                                                                  \
        MR_Word make_aligned_string_tmp;                                  \
        char    *make_aligned_string_ptr;                                 \
                                                                          \
        MR_offset_incr_saved_hp_atomic(make_aligned_string_tmp, 0,        \
            (strlen(string) + 2 + sizeof(MR_Word)) / sizeof(MR_Word),     \
            (alloc_id), "string.string/0");                               \
        make_aligned_string_ptr = (char *) make_aligned_string_tmp;       \
        sprintf(make_aligned_string_ptr, "%c%s%c", '"', string, '"');     \
        (ptr) = make_aligned_string_ptr;                                  \
    } while (0)

// void MR_allocate_aligned_string_msg(MR_String ptr, size_t len,
//          MR_Code *proclabel):
//
// Allocate enough word aligned memory to hold len characters. Also record
// the location, proclabel, of the allocation if profiling is enabled.
//
// BEWARE: this may modify `MR_hp', so it must only be called from places
// where `MR_hp' is valid. If calling it from inside a C function, rather than
// inside Mercury code, you may need to call MR_{save/restore}_transient_hp().

#define MR_allocate_aligned_string_msg(ptr, len, alloc_id)              \
    do {                                                                \
        MR_Word make_aligned_string_tmp;                                \
        char    *make_aligned_string_ptr;                               \
                                                                        \
        MR_offset_incr_hp_atomic_msg(make_aligned_string_tmp, 0,        \
            ((len) + sizeof(MR_Word)) / sizeof(MR_Word),                \
            (alloc_id), "string.string/0");                             \
        make_aligned_string_ptr = (char *) make_aligned_string_tmp;     \
        (ptr) = make_aligned_string_ptr;                                \
    } while (0)

#define MR_allocate_aligned_string_saved_hp(ptr, len, alloc_id)         \
    do {                                                                \
        MR_Word make_aligned_string_tmp;                                \
        char    *make_aligned_string_ptr;                               \
                                                                        \
        MR_offset_incr_saved_hp_atomic(make_aligned_string_tmp, 0,      \
            ((len) + sizeof(MR_Word)) / sizeof(MR_Word),                \
            (alloc_id), "string.string/0");                             \
        make_aligned_string_ptr = (char *) make_aligned_string_tmp;     \
        (ptr) = make_aligned_string_ptr;                                \
    } while (0)

// MR_do_hash_string{,2,3,4,5,6}(int & hash, MR_Word string):
//
// Given a Mercury string `string', set `hash' to the hash value
// for that string. (`hash' must be an lvalue.)
//
// This is an implementation detail used to implement
// MR_hash_string{,2,3,4,5,6}(). It should not be used directly.
// Use MR_hash_string{,2,3,4,5,6}() instead.
//
// Note that these functions are also defined in library/string.m.
// The definition here and in string.m must be kept equivalent.

#define MR_do_hash_string(hash, s)                                      \
    {                                                                   \
        int len;                                                        \
        MR_CHECK_EXPR_TYPE(hash, int);                                  \
        MR_CHECK_EXPR_TYPE(s, MR_ConstString);                          \
        len = 0;                                                        \
        hash = 0;                                                       \
        while (((const unsigned char *)(s))[len]) {                     \
            hash ^= (hash << 5);                                        \
            hash ^= ((const unsigned char *)(s))[len];                  \
            len++;                                                      \
        }                                                               \
        hash ^= len;                                                    \
    }

#define MR_do_hash_string2(hash, s)                                     \
    {                                                                   \
        int len;                                                        \
        MR_CHECK_EXPR_TYPE(hash, int);                                  \
        MR_CHECK_EXPR_TYPE(s, MR_ConstString);                          \
        len = 0;                                                        \
        hash = 0;                                                       \
        while (((const unsigned char *)(s))[len]) {                     \
            hash = hash * 37;                                           \
            hash += ((const unsigned char *)(s))[len];                  \
            len++;                                                      \
        }                                                               \
        hash ^= len;                                                    \
    }

#define MR_do_hash_string3(hash, s)                                     \
    {                                                                   \
        int len;                                                        \
        MR_CHECK_EXPR_TYPE(hash, int);                                  \
        MR_CHECK_EXPR_TYPE(s, MR_ConstString);                          \
        len = 0;                                                        \
        hash = 0;                                                       \
        while (((const unsigned char *)(s))[len]) {                     \
            hash = hash * 49;                                           \
            hash += ((const unsigned char *)(s))[len];                  \
            len++;                                                      \
        }                                                               \
        hash ^= len;                                                    \
    }

#define MR_keep_30_bits(x)                                              \
    ((x) & ((1 << 30) - 1))

#define MR_do_hash_string4(hash, s)                                     \
    {                                                                   \
        int i;                                                          \
        MR_CHECK_EXPR_TYPE(hash, int);                                  \
        MR_CHECK_EXPR_TYPE(s, MR_ConstString);                          \
        hash = 0;                                                       \
        for (i = 0; ((const unsigned char *)(s))[i] != 0; i++) {        \
            hash = MR_keep_30_bits(hash ^ (hash << 5));                 \
            hash = hash ^ ((const unsigned char *)(s))[i];              \
        }                                                               \
        hash ^= i;                                                      \
    }

#define MR_do_hash_string5(hash, s)                                         \
    {                                                                       \
        int i;                                                              \
        MR_CHECK_EXPR_TYPE(hash, int);                                      \
        MR_CHECK_EXPR_TYPE(s, MR_ConstString);                              \
        hash = 0;                                                           \
        for (i = 0; ((const unsigned char *)(s))[i] != 0; i++) {            \
            hash = MR_keep_30_bits(hash * 37);                              \
            hash = MR_keep_30_bits(hash + ((const unsigned char *)(s))[i]); \
        }                                                                   \
        hash ^= i;                                                          \
    }

#define MR_do_hash_string6(hash, s)                                         \
    {                                                                       \
        int i;                                                              \
        MR_CHECK_EXPR_TYPE(hash, int);                                      \
        MR_CHECK_EXPR_TYPE(s, MR_ConstString);                              \
        hash = 0;                                                           \
        for (i = 0; ((const unsigned char *)(s))[i] != 0; i++) {            \
            hash = MR_keep_30_bits(hash * 49);                              \
            hash = MR_keep_30_bits(hash + ((const unsigned char *)(s))[i]); \
        }                                                                   \
        hash ^= i;                                                          \
    }

// MR_hash_string{,2,3,4,5,6}(s):
//
// Given a Mercury string `s', return a hash value for that string.

MR_Integer      MR_hash_string(MR_ConstString);
MR_Integer      MR_hash_string2(MR_ConstString);
MR_Integer      MR_hash_string3(MR_ConstString);
MR_Integer      MR_hash_string4(MR_ConstString);
MR_Integer      MR_hash_string5(MR_ConstString);
MR_Integer      MR_hash_string6(MR_ConstString);

#if defined(MR_GNUC)
#define MR_hash_string(s)                                               \
    ({                                                                  \
        MR_Integer hash_string_result;                                  \
        MR_CHECK_EXPR_TYPE(s, MR_ConstString);                          \
        MR_do_hash_string(hash_string_result, s);                       \
        hash_string_result;                                             \
    })

#define MR_hash_string2(s)                                              \
    ({                                                                  \
        MR_Integer hash_string_result;                                  \
        MR_CHECK_EXPR_TYPE(s, MR_ConstString);                          \
        MR_do_hash_string2(hash_string_result, s);                      \
        hash_string_result;                                             \
    })

#define MR_hash_string3(s)                                              \
    ({                                                                  \
        MR_Integer hash_string_result;                                  \
        MR_CHECK_EXPR_TYPE(s, MR_ConstString);                          \
        MR_do_hash_string3(hash_string_result, s);                      \
        hash_string_result;                                             \
    })

#define MR_hash_string4(s)                                              \
    ({                                                                  \
        MR_Integer hash_string_result;                                  \
        MR_CHECK_EXPR_TYPE(s, MR_ConstString);                          \
        MR_do_hash_string4(hash_string_result, s);                      \
        hash_string_result;                                             \
    })

#define MR_hash_string5(s)                                              \
    ({                                                                  \
        MR_Integer hash_string_result;                                  \
        MR_CHECK_EXPR_TYPE(s, MR_ConstString);                          \
        MR_do_hash_string5(hash_string_result, s);                      \
        hash_string_result;                                             \
    })

#define MR_hash_string6(s)                                              \
    ({                                                                  \
        MR_Integer hash_string_result;                                  \
        MR_CHECK_EXPR_TYPE(s, MR_ConstString);                          \
        MR_do_hash_string6(hash_string_result, s);                      \
        hash_string_result;                                             \
    })
#endif

// If we are not using gcc, the actual definitions of these functions
// are runtime/mercury_string.c; they use the macros below.

#define MR_HASH_STRING_FUNC_BODY                                        \
       MR_Integer hash_string_result;                                   \
       MR_do_hash_string(hash_string_result, s);                        \
       return hash_string_result;
#define MR_HASH_STRING2_FUNC_BODY                                       \
       MR_Integer hash_string_result;                                   \
       MR_do_hash_string2(hash_string_result, s);                       \
       return hash_string_result;
#define MR_HASH_STRING3_FUNC_BODY                                       \
       MR_Integer hash_string_result;                                   \
       MR_do_hash_string3(hash_string_result, s);                       \
       return hash_string_result;
#define MR_HASH_STRING4_FUNC_BODY                                       \
       MR_Integer hash_string_result;                                   \
       MR_do_hash_string4(hash_string_result, s);                       \
       return hash_string_result;
#define MR_HASH_STRING5_FUNC_BODY                                       \
       MR_Integer hash_string_result;                                   \
       MR_do_hash_string5(hash_string_result, s);                       \
       return hash_string_result;
#define MR_HASH_STRING6_FUNC_BODY                                       \
       MR_Integer hash_string_result;                                   \
       MR_do_hash_string6(hash_string_result, s);                       \
       return hash_string_result;

// A version of strcmp to which we can pass Mercury words
// without having to cast the arguments first.

#define MR_strcmp(s, t)         strcmp((const char *)(s), (const char *)(t))

// Assuming that the first o (offset) code units of sa and sb are equal,
// are the rest of sa and sb equal?
#define MR_offset_streq(o, sa, sb)                                      \
    (strcmp((const char *) ((sa)+(o)), (const char *) ((sb)+(o))) == 0)
// Assuming that the first o (offset) code units of sa and sb are equal,
// are the next sz (size) code units of sa and sb equal?
#define MR_offset_strn_eq(o, sz, sa, sb)                                \
    (strncmp((const char *) ((sa)+(o)), (const char *) ((sb)+(o)), sz) == 0)

// Return the kth code unit in a string.

#define MR_nth_code_unit(s, k)                                          \
    ((unsigned) (((const unsigned char *) (s))[(k)]))

// Return an MR_String which has been created using the format string, fmt,
// passed to sprintf. If memory profiling is turned on, record the allocation
// as coming from proclabel. The MR_String returned has been allocated
// on the mercury heap using MR_allocate_aligned_string_msg.
//
// BEWARE: this may modify the saved copy of `MR_hp', so it must only be called
// from places where the saved copy of `MR_hp' is valid. You will generally
// need to call MR_{save/restore}_transient_hp() before/after calling
// this function.

MR_String MR_make_string(MR_AllocSiteInfoPtr alloc_id, const char *fmt, ...);

// Given a Mercury string `string', make a copy that inserts any required
// character escapes and places double quote marks at the start and end of
// the copy. On success, returns MR_TRUE and sets `ptr' to point to the copy
// of the string. Returns MR_FALSE if `string' is not a valid UTF-8 encoded
// string.

extern MR_bool MR_escape_string_quote(MR_String *ptr, const char * string);

// True if c is an ASCII code point, i.e. U+0000..U+007f.

#define MR_is_ascii(c)              ((unsigned) (c) <= 0x7f)

// True if c is a Unicode surrogate code point, i.e. U+D800..U+DFFF.
// This code is sort-of duplicated, in mercury, in char.char_int_is_surrogate
// in the Mercury standard library.

#define MR_is_surrogate(c)          (((unsigned) (c) & 0x1FF800) == 0xD800)

// True if c is a Unicode control code point, i.e. U+0000..U+001f,
// U+007f..U+009f.

#define MR_is_control(c) \
    ((0x00 <= (unsigned) (c) && (unsigned) (c) <= 0x1f) || \
     (0x7f <= (unsigned) (c) && (unsigned) (c) <= 0x9f))

// UTF-8 manipulation

#define MR_utf8_is_single_byte(c)   (((unsigned) (c) & 0x80) == 0)
#define MR_utf8_is_lead_byte(c)     (((unsigned) (c) - 0xC0) < 0x3E)
#define MR_utf8_is_trail_byte(c)    (((unsigned) (c) & 0xC0) == 0x80)

// XXX ILSEQ The following functions should be rethought to make dealing
// with ill-formed code unit sequences easier.

// Advance `*pos' to the beginning of the next code point in `s'.
// If `*pos' is already at the end of the string, return MR_FALSE
// without modifying `*pos'.
//
// This function simply searches for either a single byte code point
// or for the lead byte of a multi-byte code point; it does no decoding.
// It may therefore skip over bytes in ill-formed sequences.

extern MR_bool          MR_utf8_next(const MR_String s_, MR_Integer *pos);

// Rewind `*pos' to the beginning of the previous code point in `s'.
// If `*pos' is already at the beginning of the string, return MR_FALSE
// without modifying `*pos'.
//
// This function simply searches for either a single byte code point
// or for the lead byte of a multi-byte code point; it does no decoding.
// It may therefore skip over bytes in ill-formed sequences.

extern MR_bool          MR_utf8_prev(const MR_String s_, MR_Integer *pos);

// Decode and return the code point beginning at `pos' in `s'.
// Return 0 if at the end of the string (i.e. the NUL terminator).
// Return -2 if the code unit sequence beginning at that offset is ill-formed.
//
// The _mb version requires s[pos] to be the lead byte of a multibyte code
// point.

extern MR_int_least32_t MR_utf8_get(const MR_String s, MR_Integer pos);
extern MR_int_least32_t MR_utf8_get_mb(const MR_String s, MR_Integer pos,
                            int *width);

// Decode the code point beginning at `pos' in `s', and set `*pos'
// as follows.
//
// If there is a well-formed code point at `pos' then return it,
// and set pos to the position of the first byte following the decoded
// code point.
//
// WARNING: if the code point is the NUL terminator, then pos will end up
// pointing past the end of the string.
//
// If there is an ill-formed code point at pos, then return -2, and set pos
// to the position of the next byte that is either
//
// - a single-byte code point (which may be the string-ending NUL), or
// - the lead byte of a multi-byte code point
//
// The _mb version requires s[pos] to be the lead byte of a multibyte code
// point.

extern MR_int_least32_t MR_utf8_get_next(const MR_String s, MR_Integer *pos);
extern MR_int_least32_t MR_utf8_get_next_mb(const MR_String s,
                            MR_Integer *pos);

// Rewind `*pos' to the beginning of the previous code point in `s'
// and return that code point.
// Return -1 if `*pos' is already at the beginning of the string.
// Return -2 if the code unit sequence beginning at the rewound `*pos'
// is ill-formed.

extern MR_int_least32_t MR_utf8_prev_get(const MR_String s, MR_Integer *pos);

// Return the number of bytes required to encode the code point `c' in UTF-8.
// Returns 0 for surrogate code points or illegal values.

extern size_t           MR_utf8_width(MR_Char c);

// Encode the code point `c' into the buffer `s'.
// Return the number of bytes used if this is successful.
// If encoding fails because `c' is not a valid code point or is a surrogate,
// then leave the buffer unchanged and return 0.

extern size_t           MR_utf8_encode(char s[], MR_Char c);

// Return MR_TRUE iff `s' contains a valid UTF-8 encoded string.

extern MR_bool          MR_utf8_verify(const MR_String s);

// If `s' contains an invalid UTF-8 encoded string, return the number of
// UTF-8 code units between its start and the first invalid UTF-8 character.
// This number may be zero or positive.
// If `s' contains a valid UTF-8 encoded string, return a negative value.

extern MR_Integer       MR_utf8_find_ill_formed_char(const MR_String s);

// Accessing Unicode file names on Windows requires that we use the functions
// taking wide character strings. The following functions handle the
// conversions.
#if defined(MR_WIN32)
extern wchar_t          *MR_utf8_to_wide(const char *s);
extern char             *MR_wide_to_utf8(const wchar_t *ws,
                            MR_AllocSiteInfoPtr alloc_id);
#endif

#endif // not MERCURY_STRING_H