mirror of
https://github.com/Mercury-Language/mercury.git
synced 2026-04-15 17:33:38 +00:00
614 lines
14 KiB
C
614 lines
14 KiB
C
// vim: ts=4 sw=4 expandtab ft=c
|
|
|
|
// Copyright (C) 2000-2002, 2006, 2011-2012 The University of Melbourne.
|
|
// Copyright (C) 2015-2016, 2018-2019, 2023 The Mercury team.
|
|
// This file is distributed under the terms specified in COPYING.LIB.
|
|
|
|
// mercury_string.c - string handling
|
|
|
|
#include "mercury_imp.h"
|
|
#include "mercury_string.h"
|
|
#include "mercury_windows.h"
|
|
|
|
#if defined(_MSC_VER)

// MSVC flags _vsnprintf as deprecated; silence that warning (C4996).
#pragma warning(disable:4996)

// va_copy only exists from VC 2013 (_MSC_VER 1800) onwards.
// On older compilers, fall back to plain assignment of the va_list.
#if (_MSC_VER < 1800)
#define va_copy(a, b) ((a) = (b))
#endif

#endif // _MSC_VER
|
|
|
|
#if defined(MR_HAVE__VSNPRINTF)
|
|
// A C99-style vsnprintf built on top of the Microsoft _vsnprintf.
//
// Formats into `str', which has room for `size' bytes. Unlike a bare
// _vsnprintf, the output is null terminated whenever size > 0, and when
// the output is truncated the return value is the number of characters
// the complete output would have needed (as C99 vsnprintf does),
// rather than -1.
int
MR_vsnprintf(char *str, size_t size, const char *format, va_list ap)
{
    va_list     args;
    int         num_chars;

    if (size > 0) {
        // _vsnprintf does not append a null terminator if the output is
        // truncated. Follow the MS advice: zero the whole buffer first,
        // then let _vsnprintf write strictly fewer characters than the
        // buffer holds, so that a terminating null always survives.
        memset(str, 0, size);

        // Format through a copy, so that `ap' is still unconsumed
        // if we need to measure the output below.
        va_copy(args, ap);
        num_chars = _vsnprintf(str, size - 1, format, args);
        va_end(args);

        if (num_chars != -1) {
            return num_chars;
        }
    }

    // Either there is no buffer at all, or the output did not fit:
    // report how many characters the full output requires.
    return _vsnprintf(NULL, 0, format, ap);
}
|
|
#endif
|
|
|
|
#if defined(MR_HAVE__SNPRINTF)
|
|
// Variadic front end for MR_vsnprintf: packages the trailing arguments
// into a va_list, and returns whatever MR_vsnprintf returns.
int
MR_snprintf(char *str, size_t size, const char *format, ...)
{
    va_list     args;
    int         num_chars;

    va_start(args, format);
    num_chars = MR_vsnprintf(str, size, format, args);
    va_end(args);

    return num_chars;
}
|
|
#endif
|
|
|
|
// Size in bytes of the fixed on-stack buffer that MR_make_string tries
// before it falls back to a dynamically allocated one.
#define BUFFER_SIZE 4096
|
|
|
|
MR_String
|
|
MR_make_string(MR_AllocSiteInfoPtr alloc_id, const char *fmt, ...)
|
|
{
|
|
va_list ap;
|
|
MR_String result;
|
|
int n;
|
|
char *p;
|
|
|
|
int size = BUFFER_SIZE;
|
|
char fixed[BUFFER_SIZE];
|
|
MR_bool dynamically_allocated = MR_FALSE;
|
|
|
|
// On the first iteration we try with a fixed-size buffer.
|
|
// If that didn't work, use a dynamically allocated array twice
|
|
// the size of the fixed array and keep growing the array until
|
|
// the string fits.
|
|
|
|
p = fixed;
|
|
|
|
while (1) {
|
|
// Try to print in the allocated space.
|
|
va_start(ap, fmt);
|
|
n = MR_vsnprintf(p, size, fmt, ap);
|
|
va_end(ap);
|
|
|
|
// If that worked, return the string.
|
|
if (n > -1 && n < size) {
|
|
break;
|
|
}
|
|
|
|
// Else try again with more space.
|
|
if (n > -1) { // glibc 2.1
|
|
size = n + 1; // precisely what is needed
|
|
} else { // glibc 2.0
|
|
size *= 2; // twice the old size
|
|
}
|
|
|
|
if (!dynamically_allocated) {
|
|
p = MR_NEW_ARRAY(char, size);
|
|
dynamically_allocated = MR_TRUE;
|
|
} else {
|
|
p = MR_RESIZE_ARRAY(p, char, size);
|
|
}
|
|
}
|
|
|
|
MR_restore_transient_hp();
|
|
MR_allocate_aligned_string_msg(result, strlen(p), alloc_id);
|
|
MR_save_transient_hp();
|
|
strcpy(result, p);
|
|
|
|
if (dynamically_allocated) {
|
|
MR_free(p);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
// The code for this function should be kept in sync with that of the
|
|
// quote_string predicates in library/term_io.m.
|
|
MR_bool
|
|
MR_escape_string_quote(MR_String *ptr, const char * string)
|
|
{
|
|
MR_Integer pos = 0;
|
|
size_t num_code_units = 0;
|
|
MR_Char ch;
|
|
MR_bool must_escape = MR_FALSE;
|
|
|
|
// Check if we need to add character escapes to the string.
|
|
// XXX ILSEQ Check for surrogate code points.
|
|
while ((ch = MR_utf8_get_next((MR_String) string, &pos)) > 0) {
|
|
switch (ch) {
|
|
case '\a':
|
|
case '\b':
|
|
case '\f':
|
|
case '\n':
|
|
case '\t':
|
|
case '\r':
|
|
case '\v':
|
|
case '\"':
|
|
case '\\':
|
|
num_code_units += 2;
|
|
must_escape = MR_TRUE;
|
|
break;
|
|
default:
|
|
if (MR_is_control(ch)) {
|
|
// All control characters that do not have a specific
|
|
// backslash escape are octal escaped.
|
|
// This takes five code units.
|
|
num_code_units += 5;
|
|
must_escape = MR_TRUE;
|
|
} else {
|
|
num_code_units += MR_utf8_width(ch);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check that the string's encoding was valid.
|
|
if (ch < 0) {
|
|
*ptr = NULL;
|
|
return MR_FALSE;
|
|
}
|
|
|
|
if (must_escape) {
|
|
char *dst;
|
|
|
|
MR_allocate_aligned_string_saved_hp(*ptr,
|
|
num_code_units + 2 /* quotes */ + 1 /* \0 */,
|
|
NULL);
|
|
|
|
dst = *ptr;
|
|
dst[0] = '\"';
|
|
dst++;
|
|
pos = 0;
|
|
while ((ch = MR_utf8_get_next((MR_String) string, &pos)) > 0) {
|
|
switch (ch) {
|
|
case '\a':
|
|
dst[0] = '\\';
|
|
dst[1] = 'a';
|
|
dst += 2;
|
|
break;
|
|
case '\b':
|
|
dst[0] = '\\';
|
|
dst[1] = 'b';
|
|
dst += 2;
|
|
break;
|
|
case '\f':
|
|
dst[0] = '\\';
|
|
dst[1] = 'f';
|
|
dst += 2;
|
|
break;
|
|
case '\n':
|
|
dst[0] = '\\';
|
|
dst[1] = 'n';
|
|
dst += 2;
|
|
break;
|
|
case '\t':
|
|
dst[0] = '\\';
|
|
dst[1] = 't';
|
|
dst += 2;
|
|
break;
|
|
case '\r':
|
|
dst[0] = '\\';
|
|
dst[1] = 'r';
|
|
dst += 2;
|
|
break;
|
|
case '\v':
|
|
dst[0] = '\\';
|
|
dst[1] = 'v';
|
|
dst += 2;
|
|
break;
|
|
case '\"':
|
|
dst[0] = '\\';
|
|
dst[1] = '\"';
|
|
dst += 2;
|
|
break;
|
|
case '\\':
|
|
dst[0] = '\\';
|
|
dst[1] = '\\';
|
|
dst += 2;
|
|
break;
|
|
default:
|
|
if (MR_is_control(ch)) {
|
|
sprintf(dst, "\\%03" MR_INTEGER_LENGTH_MODIFIER "o\\",
|
|
(MR_Integer) ch);
|
|
dst += 5;
|
|
} else {
|
|
dst += MR_utf8_encode(dst, ch);
|
|
}
|
|
}
|
|
}
|
|
dst[0] = '\"';
|
|
dst[1] = '\0';
|
|
} else {
|
|
MR_make_aligned_string_copy_saved_hp_quote(*ptr, string, NULL);
|
|
}
|
|
return MR_TRUE;
|
|
}
|
|
|
|
// Note that MR_hash_string{,2,3,4,5,6} are actually defined as macros in
|
|
// mercury_string.h, if we are using GNU C.
|
|
// We define them here whether or not we are using gcc, so that users
|
|
// can easily switch between gcc and cc without rebuilding the libraries.
|
|
|
|
#undef MR_hash_string
|
|
#undef MR_hash_string2
|
|
#undef MR_hash_string3
|
|
#undef MR_hash_string4
|
|
#undef MR_hash_string5
|
|
#undef MR_hash_string6
|
|
|
|
MR_Integer
|
|
MR_hash_string(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING_FUNC_BODY
|
|
}
|
|
|
|
MR_Integer
|
|
MR_hash_string2(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING2_FUNC_BODY
|
|
}
|
|
|
|
MR_Integer
|
|
MR_hash_string3(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING3_FUNC_BODY
|
|
}
|
|
|
|
MR_Integer
|
|
MR_hash_string4(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING4_FUNC_BODY
|
|
}
|
|
|
|
MR_Integer
|
|
MR_hash_string5(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING5_FUNC_BODY
|
|
}
|
|
|
|
MR_Integer
|
|
MR_hash_string6(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING6_FUNC_BODY
|
|
}
|
|
|
|
MR_bool
|
|
MR_utf8_next(const MR_String s_, MR_Integer *pos)
|
|
{
|
|
const unsigned char *s = (const unsigned char *) s_;
|
|
int c;
|
|
|
|
if (s[*pos] == '\0') {
|
|
// End of string.
|
|
return MR_FALSE;
|
|
}
|
|
|
|
for (;;) {
|
|
++(*pos);
|
|
c = s[*pos];
|
|
// This won't run off the end of the string, because
|
|
// MR_utf8_is_single_byte('\0') succeeds.
|
|
if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
return MR_TRUE;
|
|
}
|
|
|
|
MR_bool
|
|
MR_utf8_prev(const MR_String s_, MR_Integer *pos)
|
|
{
|
|
const unsigned char *s = (const unsigned char *) s_;
|
|
int c;
|
|
|
|
while (*pos > 0) {
|
|
(*pos)--;
|
|
c = s[*pos];
|
|
if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
|
|
return MR_TRUE;
|
|
}
|
|
}
|
|
|
|
return MR_FALSE;
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_get(const MR_String s_, MR_Integer pos)
|
|
{
|
|
const unsigned char *s = (const unsigned char *) s_;
|
|
int c;
|
|
int width;
|
|
|
|
c = s[pos];
|
|
if (MR_is_ascii(c)) {
|
|
return c;
|
|
} else {
|
|
return MR_utf8_get_mb(s_, pos, &width);
|
|
}
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_get_mb(const MR_String s_, MR_Integer pos, int *width)
|
|
{
|
|
const unsigned char *s = (const unsigned char *) s_;
|
|
int c;
|
|
int d;
|
|
int minc;
|
|
|
|
c = s[pos];
|
|
|
|
// c <= 0x7f (ASCII) must be handled before calling this function.
|
|
|
|
if (c <= 0xC1) {
|
|
// Trailing byte of multi-byte sequence or an overlong encoding for
|
|
// code point <= 127.
|
|
|
|
return -2;
|
|
}
|
|
|
|
if (c <= 0xDF) {
|
|
// 2-byte sequence.
|
|
c &= 0x1F;
|
|
*width = 2;
|
|
minc = 0x80;
|
|
}
|
|
else if (c <= 0xEF) {
|
|
// 3-byte sequence.
|
|
c &= 0x0F;
|
|
*width = 3;
|
|
minc = 0x800;
|
|
}
|
|
else if (c <= 0xF4) {
|
|
// 4-byte sequence.
|
|
c &= 0x07;
|
|
*width = 4;
|
|
minc = 0x10000;
|
|
}
|
|
else {
|
|
// Otherwise invalid.
|
|
return -2;
|
|
}
|
|
|
|
switch (*width) {
|
|
case 4:
|
|
d = s[++pos];
|
|
if (!MR_utf8_is_trail_byte(d)) {
|
|
return -2;
|
|
}
|
|
c = (c << 6) | (d & 0x3F);
|
|
// fall through
|
|
case 3:
|
|
d = s[++pos];
|
|
if (!MR_utf8_is_trail_byte(d)) {
|
|
return -2;
|
|
}
|
|
c = (c << 6) | (d & 0x3F);
|
|
// fall through
|
|
case 2:
|
|
d = s[++pos];
|
|
if (!MR_utf8_is_trail_byte(d)) {
|
|
return -2;
|
|
}
|
|
c = (c << 6) | (d & 0x3F);
|
|
break;
|
|
}
|
|
|
|
// Check for overlong forms or code point out of range.
|
|
if (c < minc || c > 0x10FFFF) {
|
|
return -2;
|
|
}
|
|
|
|
// Check for surrogate code points.
|
|
if (MR_is_surrogate(c)) {
|
|
return -2;
|
|
}
|
|
|
|
return c;
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_get_next(const MR_String s, MR_Integer *pos)
|
|
{
|
|
int c;
|
|
|
|
c = s[*pos];
|
|
if (MR_is_ascii(c)) {
|
|
(*pos)++;
|
|
return c;
|
|
}
|
|
|
|
return MR_utf8_get_next_mb(s, pos);
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_get_next_mb(const MR_String s, MR_Integer *pos)
|
|
{
|
|
int c, width;
|
|
|
|
c = MR_utf8_get_mb(s, *pos, &width);
|
|
if (c >= 0) {
|
|
// Multibyte code point.
|
|
(*pos) += width;
|
|
return c;
|
|
}
|
|
|
|
// Some invalid byte sequence. Skip to the start of the next character,
|
|
// but return the ill-formed character.
|
|
MR_utf8_next(s, pos);
|
|
return c;
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_prev_get(const MR_String s, MR_Integer *pos)
|
|
{
|
|
int c, width;
|
|
|
|
if (MR_utf8_prev(s, pos)) {
|
|
c = s[*pos];
|
|
if (MR_is_ascii(c)) {
|
|
return c;
|
|
} else {
|
|
return MR_utf8_get_mb(s, *pos, &width);
|
|
}
|
|
}
|
|
|
|
// Past beginning.
|
|
return -1;
|
|
}
|
|
|
|
size_t
|
|
MR_utf8_width(MR_Char c)
|
|
{
|
|
// So we don't need to check for negative values nor use unsigned ints
|
|
// in the interface, which are a pain.
|
|
|
|
MR_UnsignedChar uc = c;
|
|
|
|
if (uc <= 0x7f) {
|
|
return 1;
|
|
}
|
|
if (uc <= 0x7ff) {
|
|
return 2;
|
|
}
|
|
if (uc <= 0xffff) {
|
|
return (MR_is_surrogate(uc)) ? 0 : 3;
|
|
}
|
|
if (uc <= 0x10ffff) {
|
|
return 4;
|
|
}
|
|
// The rest are illegal.
|
|
return 0;
|
|
}
|
|
|
|
size_t
|
|
MR_utf8_encode(char s_[], MR_Char c)
|
|
{
|
|
unsigned char *s = (unsigned char *) s_;
|
|
MR_UnsignedChar uc = c;
|
|
|
|
if (uc <= 0x7f) {
|
|
s[0] = uc;
|
|
return 1;
|
|
}
|
|
|
|
if (uc <= 0x7ff) {
|
|
s[0] = 0xC0 | ((uc >> 6) & 0x1F);
|
|
s[1] = 0x80 | (uc & 0x3F);
|
|
return 2;
|
|
}
|
|
|
|
if (uc <= 0xffff) {
|
|
if (MR_is_surrogate(uc)) {
|
|
return 0;
|
|
}
|
|
s[0] = 0xE0 | ((uc >> 12) & 0x0F);
|
|
s[1] = 0x80 | ((uc >> 6) & 0x3F);
|
|
s[2] = 0x80 | (uc & 0x3F);
|
|
return 3;
|
|
}
|
|
|
|
if (uc <= 0x10ffff) {
|
|
s[0] = 0xF0 | ((uc >> 18) & 0x07);
|
|
s[1] = 0x80 | ((uc >> 12) & 0x3F);
|
|
s[2] = 0x80 | ((uc >> 6) & 0x3F);
|
|
s[3] = 0x80 | (uc & 0x3F);
|
|
return 4;
|
|
}
|
|
|
|
// Otherwise is illegal.
|
|
return 0;
|
|
}
|
|
|
|
MR_bool
|
|
MR_utf8_verify(const MR_String s)
|
|
{
|
|
MR_Integer pos = 0;
|
|
|
|
for (;;) {
|
|
MR_int_least32_t c;
|
|
|
|
c = MR_utf8_get_next(s, &pos);
|
|
if (c == 0) {
|
|
return MR_TRUE;
|
|
}
|
|
if (c < 0) {
|
|
return MR_FALSE;
|
|
}
|
|
}
|
|
}
|
|
|
|
MR_Integer
|
|
MR_utf8_find_ill_formed_char(const MR_String s)
|
|
{
|
|
MR_Integer pos = 0;
|
|
|
|
for (;;) {
|
|
MR_int_least32_t c;
|
|
|
|
c = MR_utf8_get_next(s, &pos);
|
|
if (c == 0) {
|
|
return -1;
|
|
}
|
|
if (c < 0) {
|
|
return pos;
|
|
}
|
|
}
|
|
}
|
|
|
|
#if defined(MR_WIN32)
|
|
wchar_t *
|
|
MR_utf8_to_wide(const char *s)
|
|
{
|
|
int wslen;
|
|
wchar_t *ws;
|
|
|
|
wslen = MultiByteToWideChar(CP_UTF8, 0, s, -1, NULL, 0);
|
|
if (wslen == 0) {
|
|
MR_fatal_error("MR_utf8_to_wide: MultiByteToWideChar failed");
|
|
}
|
|
ws = MR_GC_NEW_ARRAY(wchar_t, wslen);
|
|
if (0 == MultiByteToWideChar(CP_UTF8, 0, s, -1, ws, wslen)) {
|
|
MR_fatal_error("MR_utf8_to_wide: MultiByteToWideChar failed");
|
|
}
|
|
return ws;
|
|
}
|
|
|
|
char *
|
|
MR_wide_to_utf8(const wchar_t *ws, MR_AllocSiteInfoPtr alloc_id)
|
|
{
|
|
char *s;
|
|
int bytes;
|
|
|
|
bytes = WideCharToMultiByte(CP_UTF8, 0, ws, -1, NULL, 0, NULL, NULL);
|
|
if (bytes == 0) {
|
|
MR_fatal_error("MR_wide_to_utf8: WideCharToMultiByte failed");
|
|
}
|
|
MR_allocate_aligned_string_msg(s, bytes, alloc_id);
|
|
if (0 == WideCharToMultiByte(CP_UTF8, 0, ws, -1, s, bytes, NULL, NULL)) {
|
|
MR_fatal_error("MR_wide_to_utf8: WideCharToMultiByte failed");
|
|
}
|
|
return s;
|
|
}
|
|
#endif // MR_WIN32
|