mirror of
https://github.com/Mercury-Language/mercury.git
synced 2025-12-11 20:03:28 +00:00
Branches: main, 11.07 Optimise some UTF-8 routines in C grades and fix a few bugs. library/string.m: Avoid function calls in unsafe_index, unsafe_index_next, and unsafe_prev_index in the ASCII case. Handle illegal code unit at start of string in first_char(in, uo, in) and first_char(in, uo, uo) modes. runtime/mercury_string.c: runtime/mercury_string.h: Fix a bug where MR_utf8_next would not advance from pos 0. Fortunately MR_utf8_next is only rarely called, to skip past illegal code units. Delete redundant initial test in MR_utf8_prev. Add MR_utf8_get_mb to extract multibyte code points only. Unroll a loop. Add MR_utf8_get_next_mb to extract multibyte code points only. Make MR_utf8_prev_get avoid an extra function call in the ASCII case. Use MR_Integer consistently for string offsets instead of int.
389 lines
7.9 KiB
C
389 lines
7.9 KiB
C
/*
|
|
** vim: ts=4 sw=4 expandtab
|
|
*/
|
|
/*
|
|
** Copyright (C) 2000-2002, 2006, 2011-2012 The University of Melbourne.
|
|
** This file may only be copied under the terms of the GNU Library General
|
|
** Public License - see the file COPYING.LIB in the Mercury distribution.
|
|
*/
|
|
|
|
/* mercury_string.c - string handling */
|
|
|
|
#include "mercury_imp.h"
|
|
#include "mercury_string.h"
|
|
|
|
/*
** If only the underscore-prefixed _vsnprintf is available, make it usable
** under the name vsnprintf.
*/
#if !defined(MR_HAVE_VSNPRINTF) && defined(MR_HAVE__VSNPRINTF)
  #define vsnprintf _vsnprintf
#endif

/*
** MR_HAVE_A_VSNPRINTF is defined whenever either variant is available.
*/
#if defined(MR_HAVE__VSNPRINTF) || defined(MR_HAVE_VSNPRINTF)
  #define MR_HAVE_A_VSNPRINTF
#endif

/*
** Size in bytes of the fixed buffer that MR_make_string tries first.
*/
#define BUFFER_SIZE 4096
|
|
|
|
MR_String
|
|
MR_make_string(MR_AllocSiteInfoPtr alloc_id, const char *fmt, ...)
|
|
{
|
|
va_list ap;
|
|
MR_String result;
|
|
int n;
|
|
char *p;
|
|
|
|
#ifdef MR_HAVE_A_VSNPRINTF
|
|
int size = BUFFER_SIZE;
|
|
char fixed[BUFFER_SIZE];
|
|
MR_bool dynamically_allocated = MR_FALSE;
|
|
|
|
/*
|
|
** On the first iteration we try with a fixed-size buffer.
|
|
** If that didn't work, use a dynamically allocated array twice
|
|
** the size of the fixed array and keep growing the array until
|
|
** the string fits.
|
|
*/
|
|
p = fixed;
|
|
|
|
while (1) {
|
|
/* Try to print in the allocated space. */
|
|
va_start(ap, fmt);
|
|
n = vsnprintf(p, size, fmt, ap);
|
|
va_end(ap);
|
|
|
|
/* If that worked, return the string. */
|
|
if (n > -1 && n < size) {
|
|
break;
|
|
}
|
|
|
|
/* Else try again with more space. */
|
|
if (n > -1) { /* glibc 2.1 */
|
|
size = n + 1; /* precisely what is needed */
|
|
} else { /* glibc 2.0 */
|
|
size *= 2; /* twice the old size */
|
|
}
|
|
|
|
if (!dynamically_allocated) {
|
|
p = MR_NEW_ARRAY(char, size);
|
|
dynamically_allocated = MR_TRUE;
|
|
} else {
|
|
p = MR_RESIZE_ARRAY(p, char, size);
|
|
}
|
|
}
|
|
|
|
#else
|
|
/*
|
|
** It is possible for this buffer to overflow,
|
|
** and then bad things may happen.
|
|
*/
|
|
char fixed[40960];
|
|
|
|
va_start(ap, fmt);
|
|
n = vsprintf(fixed, fmt, ap);
|
|
va_end(ap);
|
|
|
|
p = fixed;
|
|
#endif
|
|
MR_restore_transient_hp();
|
|
MR_allocate_aligned_string_msg(result, strlen(p), alloc_id);
|
|
MR_save_transient_hp();
|
|
strcpy(result, p);
|
|
|
|
#ifdef MR_HAVE_A_VSNPRINTF
|
|
if (dynamically_allocated) {
|
|
MR_free(p);
|
|
}
|
|
#endif
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
** When compiling with GNU C, mercury_string.h defines MR_hash_string,
** MR_hash_string2 and MR_hash_string3 as macros.
** Real functions are nevertheless defined below regardless of the compiler,
** so that users can switch between gcc and cc without having to rebuild
** the libraries. The macro versions must be removed first so that the
** names denote those functions.
*/

#undef MR_hash_string
#undef MR_hash_string2
#undef MR_hash_string3
|
|
|
|
MR_Integer
|
|
MR_hash_string(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING_FUNC_BODY
|
|
}
|
|
|
|
MR_Integer
|
|
MR_hash_string2(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING2_FUNC_BODY
|
|
}
|
|
|
|
MR_Integer
|
|
MR_hash_string3(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING3_FUNC_BODY
|
|
}
|
|
|
|
MR_bool
|
|
MR_utf8_next(const MR_String s_, MR_Integer *pos)
|
|
{
|
|
const unsigned char *s = (const unsigned char *)s_;
|
|
int c;
|
|
|
|
if (s[*pos] == '\0') {
|
|
/* End of string. */
|
|
return MR_FALSE;
|
|
}
|
|
|
|
for (;;) {
|
|
++(*pos);
|
|
c = s[*pos];
|
|
if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
return MR_TRUE;
|
|
}
|
|
|
|
MR_bool
|
|
MR_utf8_prev(const MR_String s_, MR_Integer *pos)
|
|
{
|
|
const unsigned char *s = (const unsigned char *)s_;
|
|
int c;
|
|
|
|
while (*pos > 0) {
|
|
(*pos)--;
|
|
c = s[*pos];
|
|
if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
|
|
return MR_TRUE;
|
|
}
|
|
}
|
|
|
|
return MR_FALSE;
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_get(const MR_String s_, MR_Integer pos)
|
|
{
|
|
const unsigned char *s = (const unsigned char *)s_;
|
|
int c;
|
|
int width;
|
|
|
|
c = s[pos];
|
|
if (MR_is_ascii(c)) {
|
|
return c;
|
|
} else {
|
|
return MR_utf8_get_mb(s_, pos, &width);
|
|
}
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_get_mb(const MR_String s_, MR_Integer pos, int *width)
|
|
{
|
|
const unsigned char *s = (const unsigned char *)s_;
|
|
int c;
|
|
int d;
|
|
int minc;
|
|
|
|
c = s[pos];
|
|
|
|
/* c <= 0x7f (ASCII) must be handled before calling this function. */
|
|
|
|
if (c <= 0xC1) {
|
|
/* Trailing byte of multi-byte sequence or an overlong encoding for
|
|
* code point <= 127.
|
|
*/
|
|
return -2;
|
|
}
|
|
|
|
if (c <= 0xDF) {
|
|
/* 2-byte sequence. */
|
|
c &= 0x1F;
|
|
*width = 2;
|
|
minc = 0x80;
|
|
}
|
|
else if (c <= 0xEF) {
|
|
/* 3-byte sequence. */
|
|
c &= 0x0F;
|
|
*width = 3;
|
|
minc = 0x800;
|
|
}
|
|
else if (c <= 0xF4) {
|
|
/* 4-byte sequence. */
|
|
c &= 0x07;
|
|
*width = 4;
|
|
minc = 0x10000;
|
|
}
|
|
else {
|
|
/* Otherwise invalid. */
|
|
return -2;
|
|
}
|
|
|
|
switch (*width) {
|
|
case 4:
|
|
d = s[++pos];
|
|
if (!MR_utf8_is_trail_byte(d)) {
|
|
return -2;
|
|
}
|
|
c = (c << 6) | (d & 0x3F);
|
|
/* fall through */
|
|
case 3:
|
|
d = s[++pos];
|
|
if (!MR_utf8_is_trail_byte(d)) {
|
|
return -2;
|
|
}
|
|
c = (c << 6) | (d & 0x3F);
|
|
/* fall through */
|
|
case 2:
|
|
d = s[++pos];
|
|
if (!MR_utf8_is_trail_byte(d)) {
|
|
return -2;
|
|
}
|
|
c = (c << 6) | (d & 0x3F);
|
|
break;
|
|
}
|
|
|
|
/* Check for overlong forms, which could be used to bypass security
|
|
* validations. We could also check code points aren't above U+10FFFF or in
|
|
* the surrogate ranges, but we don't.
|
|
*/
|
|
|
|
if (c < minc) {
|
|
return -2;
|
|
}
|
|
|
|
return c;
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_get_next(const MR_String s, MR_Integer *pos)
|
|
{
|
|
int c;
|
|
|
|
c = s[*pos];
|
|
if (MR_is_ascii(c)) {
|
|
(*pos)++;
|
|
return c;
|
|
}
|
|
|
|
return MR_utf8_get_next_mb(s, pos);
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_get_next_mb(const MR_String s, MR_Integer *pos)
|
|
{
|
|
int c, width;
|
|
|
|
c = MR_utf8_get_mb(s, *pos, &width);
|
|
if (c >= 0) {
|
|
/* Multibyte code point. */
|
|
(*pos) += width;
|
|
return c;
|
|
}
|
|
|
|
/* Some invalid byte sequence. */
|
|
MR_utf8_next(s, pos);
|
|
return c;
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_prev_get(const MR_String s, MR_Integer *pos)
|
|
{
|
|
int c, width;
|
|
|
|
if (MR_utf8_prev(s, pos)) {
|
|
c = s[*pos];
|
|
if (MR_is_ascii(c)) {
|
|
return c;
|
|
} else {
|
|
return MR_utf8_get_mb(s, *pos, &width);
|
|
}
|
|
}
|
|
|
|
/* Past beginning. */
|
|
return -1;
|
|
}
|
|
|
|
size_t
|
|
MR_utf8_width(MR_Char c)
|
|
{
|
|
/* So we don't need to check for negative values nor use unsigned ints
|
|
* in the interface, which are a pain.
|
|
*/
|
|
MR_UnsignedChar uc = c;
|
|
|
|
if (uc <= 0x7f) {
|
|
return 1;
|
|
}
|
|
if (uc <= 0x7ff) {
|
|
return 2;
|
|
}
|
|
if (uc <= 0xffff) {
|
|
return (MR_is_surrogate(uc)) ? 0 : 3;
|
|
}
|
|
if (uc <= 0x10ffff) {
|
|
return 4;
|
|
}
|
|
/* The rest are illegal. */
|
|
return 0;
|
|
}
|
|
|
|
size_t
|
|
MR_utf8_encode(char s_[], MR_Char c)
|
|
{
|
|
unsigned char *s = (unsigned char *)s_;
|
|
MR_UnsignedChar uc = c;
|
|
|
|
if (uc <= 0x7f) {
|
|
s[0] = uc;
|
|
return 1;
|
|
}
|
|
|
|
if (uc <= 0x7ff) {
|
|
s[0] = 0xC0 | ((uc >> 6) & 0x1F);
|
|
s[1] = 0x80 | (uc & 0x3F);
|
|
return 2;
|
|
}
|
|
|
|
if (uc <= 0xffff) {
|
|
if (MR_is_surrogate(uc)) {
|
|
return 0;
|
|
}
|
|
s[0] = 0xE0 | ((uc >> 12) & 0x0F);
|
|
s[1] = 0x80 | ((uc >> 6) & 0x3F);
|
|
s[2] = 0x80 | (uc & 0x3F);
|
|
return 3;
|
|
}
|
|
|
|
if (uc <= 0x10ffff) {
|
|
s[0] = 0xF0 | ((uc >> 18) & 0x07);
|
|
s[1] = 0x80 | ((uc >> 12) & 0x3F);
|
|
s[2] = 0x80 | ((uc >> 6) & 0x3F);
|
|
s[3] = 0x80 | (uc & 0x3F);
|
|
return 4;
|
|
}
|
|
|
|
/* Otherwise is illegal. */
|
|
return 0;
|
|
}
|
|
|
|
MR_bool
|
|
MR_utf8_verify(const MR_String s)
|
|
{
|
|
MR_Integer pos = 0;
|
|
|
|
for (;;) {
|
|
MR_int_least32_t c;
|
|
|
|
c = MR_utf8_get_next(s, &pos);
|
|
if (c == 0) {
|
|
return MR_TRUE;
|
|
}
|
|
if (c < 0) {
|
|
return MR_FALSE;
|
|
}
|
|
}
|
|
}
|