mirror of
https://github.com/Mercury-Language/mercury.git
synced 2026-04-15 17:33:38 +00:00
library/mercury_term_lexer.m:
Convert any occurrences of the \e escape sequence to the escape character.
List all the places in the library, compiler and runtime that also handle
escape sequences, some of which handle all these sequences,
and some of which handle only subsets.
Sort the letters in recognized escape sequences.
compiler/parse_tree_out_pragma.m:
library/rtti_implementation.m:
library/term_io.m:
Add comments to all the other places that handle escape sequences
that direct readers to mercury_term_lexer.m as containing the master list
of such sequences.
Add commented-out code that, after stage 1 has been installed,
stage 2 should enable.
runtime/mercury_ml_expand_body.h:
runtime/mercury_string.c:
Turn escape characters back into their escape sequence form
for characters and strings.
tests/valid_seq/char_escape_opt_helper_1.m:
Test whether the compiler accepts \e as an escape sequence.
compiler/options.m:
Add a mechanism for detecting the presence of this diff in the
installed compiler.
630 lines
15 KiB
C
630 lines
15 KiB
C
// vim: ts=4 sw=4 expandtab ft=c
|
|
|
|
// Copyright (C) 2000-2002, 2006, 2011-2012 The University of Melbourne.
|
|
// Copyright (C) 2015-2016, 2018-2019, 2023-2024 The Mercury team.
|
|
// This file is distributed under the terms specified in COPYING.LIB.
|
|
|
|
// mercury_string.c - string handling
|
|
|
|
#include "mercury_imp.h"
|
|
#include "mercury_string.h"
|
|
#include "mercury_windows.h"
|
|
|
|
#if defined(_MSC_VER)
// This file calls _vsnprintf on purpose; silence the MSVC warning (C4996)
// about it being deprecated.
#pragma warning(disable:4996)
#endif

#if defined(_MSC_VER) && _MSC_VER < 1800
// va_copy only exists from VC 2013 onwards, so on older versions of
// the compiler we fall back to plain assignment of the va_list.
#define va_copy(a, b) ((a) = (b))
#endif
|
|
|
|
#if defined(MR_HAVE__VSNPRINTF)
|
|
// MR_vsnprintf(str, size, format, ap):
//
// A vsnprintf look-alike implemented on top of _vsnprintf.
// At most size - 1 characters are written to str, and the result is always
// null terminated when size is nonzero. If the first attempt reports -1,
// the return value is instead obtained from a second _vsnprintf call
// that is given no buffer, so as to match the behaviour of C99 vsnprintf
// (which returns the number of characters that would have been written
// without truncation).
int
MR_vsnprintf(char *str, size_t size, const char *format, va_list ap)
{
    va_list first_try_ap;
    int     num_chars;

    if (size != 0) {
        // _vsnprintf does not append a null terminator when it truncates
        // the output. Following the MS advice, we clear the whole buffer
        // first, and then pass a count that is strictly less than
        // the buffer length, so the last byte always remains null.
        memset(str, 0, size);

        // Format using a copy, so that `ap' itself is still untouched
        // if we need it for the measuring call below.
        va_copy(first_try_ap, ap);
        num_chars = _vsnprintf(str, size - 1, format, first_try_ap);
        va_end(first_try_ap);

        if (num_chars != -1) {
            return num_chars;
        }
    }

    // Either the caller gave us no space at all, or the attempt above
    // returned -1. In both cases, the answer is whatever _vsnprintf reports
    // when it is called without a buffer.
    return _vsnprintf(NULL, 0, format, ap);
}
|
|
#endif
|
|
|
|
#if defined(MR_HAVE__SNPRINTF)
|
|
// MR_snprintf(str, size, format, ...):
//
// The variadic front end of MR_vsnprintf: package up the trailing arguments
// in a va_list, and return whatever MR_vsnprintf returns for them.
int
MR_snprintf(char *str, size_t size, const char *format, ...)
{
    va_list args;
    int     result;

    va_start(args, format);
    result = MR_vsnprintf(str, size, format, args);
    va_end(args);

    return result;
}
|
|
#endif
|
|
|
|
#define BUFFER_SIZE 4096
|
|
|
|
MR_String
|
|
MR_make_string(MR_AllocSiteInfoPtr alloc_id, const char *fmt, ...)
|
|
{
|
|
va_list ap;
|
|
MR_String result;
|
|
int n;
|
|
char *p;
|
|
|
|
int size = BUFFER_SIZE;
|
|
char fixed[BUFFER_SIZE];
|
|
MR_bool dynamically_allocated = MR_FALSE;
|
|
|
|
// On the first iteration we try with a fixed-size buffer.
|
|
// If that didn't work, use a dynamically allocated array twice
|
|
// the size of the fixed array and keep growing the array until
|
|
// the string fits.
|
|
|
|
p = fixed;
|
|
|
|
while (1) {
|
|
// Try to print in the allocated space.
|
|
va_start(ap, fmt);
|
|
n = MR_vsnprintf(p, size, fmt, ap);
|
|
va_end(ap);
|
|
|
|
// If that worked, return the string.
|
|
if (n > -1 && n < size) {
|
|
break;
|
|
}
|
|
|
|
// Else try again with more space.
|
|
if (n > -1) { // glibc 2.1
|
|
size = n + 1; // precisely what is needed
|
|
} else { // glibc 2.0
|
|
size *= 2; // twice the old size
|
|
}
|
|
|
|
if (!dynamically_allocated) {
|
|
p = MR_NEW_ARRAY(char, size);
|
|
dynamically_allocated = MR_TRUE;
|
|
} else {
|
|
p = MR_RESIZE_ARRAY(p, char, size);
|
|
}
|
|
}
|
|
|
|
MR_restore_transient_hp();
|
|
MR_allocate_aligned_string_msg(result, strlen(p), alloc_id);
|
|
MR_save_transient_hp();
|
|
strcpy(result, p);
|
|
|
|
if (dynamically_allocated) {
|
|
MR_free(p);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
// The code for this function should be kept in sync with that of the
|
|
// quote_string predicates in library/term_io.m.
|
|
MR_bool
|
|
MR_escape_string_quote(MR_String *ptr, const char * string)
|
|
{
|
|
MR_Integer pos = 0;
|
|
size_t num_code_units = 0;
|
|
MR_Char ch;
|
|
MR_bool must_escape = MR_FALSE;
|
|
|
|
// Check if we need to add character escapes to the string,
|
|
// and in case the answer turns out to be "yes", compute the number
|
|
// of code units that the escaped string would need space for.
|
|
// XXX ILSEQ Check for surrogate code points.
|
|
while ((ch = MR_utf8_get_next((MR_String) string, &pos)) > 0) {
|
|
switch (ch) {
|
|
case '\a':
|
|
case '\b':
|
|
case '\f':
|
|
case '\n':
|
|
case '\r':
|
|
case '\t':
|
|
case '\v':
|
|
// While gcc and clang support '\e', some other C compilers do not.
|
|
case '\x1B':
|
|
case '\"':
|
|
case '\\':
|
|
num_code_units += 2;
|
|
must_escape = MR_TRUE;
|
|
break;
|
|
default:
|
|
if (MR_is_control(ch)) {
|
|
// All control characters that do not have a specific
|
|
// backslash escape are octal escaped.
|
|
// This takes five code units (see below).
|
|
num_code_units += 5;
|
|
must_escape = MR_TRUE;
|
|
} else {
|
|
num_code_units += MR_utf8_width(ch);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check that the string's encoding was valid.
|
|
if (ch < 0) {
|
|
*ptr = NULL;
|
|
return MR_FALSE;
|
|
}
|
|
|
|
if (must_escape) {
|
|
char *dst;
|
|
|
|
// We need two code units of space for the initial and final quotes,
|
|
// and one for the final NUL char.
|
|
num_code_units += 3;
|
|
MR_allocate_aligned_string_saved_hp(*ptr, num_code_units, NULL);
|
|
|
|
dst = *ptr;
|
|
dst[0] = '\"';
|
|
dst++;
|
|
pos = 0;
|
|
while ((ch = MR_utf8_get_next((MR_String) string, &pos)) > 0) {
|
|
switch (ch) {
|
|
case '\a':
|
|
dst[0] = '\\';
|
|
dst[1] = 'a';
|
|
dst += 2;
|
|
break;
|
|
case '\b':
|
|
dst[0] = '\\';
|
|
dst[1] = 'b';
|
|
dst += 2;
|
|
break;
|
|
case '\f':
|
|
dst[0] = '\\';
|
|
dst[1] = 'f';
|
|
dst += 2;
|
|
break;
|
|
case '\n':
|
|
dst[0] = '\\';
|
|
dst[1] = 'n';
|
|
dst += 2;
|
|
break;
|
|
case '\r':
|
|
dst[0] = '\\';
|
|
dst[1] = 'r';
|
|
dst += 2;
|
|
break;
|
|
case '\t':
|
|
dst[0] = '\\';
|
|
dst[1] = 't';
|
|
dst += 2;
|
|
break;
|
|
case '\v':
|
|
dst[0] = '\\';
|
|
dst[1] = 'v';
|
|
dst += 2;
|
|
break;
|
|
// See the comment above.
|
|
case '\x1B':
|
|
dst[0] = '\\';
|
|
dst[1] = 'e';
|
|
dst += 2;
|
|
break;
|
|
case '\"':
|
|
dst[0] = '\\';
|
|
dst[1] = '\"';
|
|
dst += 2;
|
|
break;
|
|
case '\\':
|
|
dst[0] = '\\';
|
|
dst[1] = '\\';
|
|
dst += 2;
|
|
break;
|
|
default:
|
|
if (MR_is_control(ch)) {
|
|
sprintf(dst, "\\%03" MR_INTEGER_LENGTH_MODIFIER "o\\",
|
|
(MR_Integer) ch);
|
|
dst += 5;
|
|
} else {
|
|
dst += MR_utf8_encode(dst, ch);
|
|
}
|
|
}
|
|
}
|
|
dst[0] = '\"';
|
|
dst[1] = '\0';
|
|
} else {
|
|
MR_make_aligned_string_copy_saved_hp_quote(*ptr, string, NULL);
|
|
}
|
|
return MR_TRUE;
|
|
}
|
|
|
|
// Note that MR_hash_string{,2,3,4,5,6} are actually defined as macros in
|
|
// mercury_string.h, if we are using GNU C.
|
|
// We define them here whether or not we are using gcc, so that users
|
|
// can easily switch between gcc and cc without rebuilding the libraries.
|
|
|
|
#undef MR_hash_string
|
|
#undef MR_hash_string2
|
|
#undef MR_hash_string3
|
|
#undef MR_hash_string4
|
|
#undef MR_hash_string5
|
|
#undef MR_hash_string6
|
|
|
|
MR_Integer
|
|
MR_hash_string(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING_FUNC_BODY
|
|
}
|
|
|
|
MR_Integer
|
|
MR_hash_string2(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING2_FUNC_BODY
|
|
}
|
|
|
|
MR_Integer
|
|
MR_hash_string3(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING3_FUNC_BODY
|
|
}
|
|
|
|
MR_Integer
|
|
MR_hash_string4(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING4_FUNC_BODY
|
|
}
|
|
|
|
MR_Integer
|
|
MR_hash_string5(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING5_FUNC_BODY
|
|
}
|
|
|
|
MR_Integer
|
|
MR_hash_string6(MR_ConstString s)
|
|
{
|
|
MR_HASH_STRING6_FUNC_BODY
|
|
}
|
|
|
|
MR_bool
|
|
MR_utf8_next(const MR_String s_, MR_Integer *pos)
|
|
{
|
|
// XXX Several functions have this cast from const MR_String. Why?
|
|
// With gcc 7.5.0 a least, things work fine without the cast.
|
|
const unsigned char *s = (const unsigned char *) s_;
|
|
int c;
|
|
|
|
if (s[*pos] == '\0') {
|
|
// End of string.
|
|
return MR_FALSE;
|
|
}
|
|
|
|
// NOTE In situations where the input string contains many
|
|
// multi-byte code points, it would be faster to replace all references
|
|
// to *pos in this loop with a local variable, copying the values of *pos
|
|
// to it and from it before loop entry and after loop exit respectively.
|
|
// However, since we expect the bulk of our input strings to consist
|
|
// of ASCII characters, there is no point.
|
|
//
|
|
// This consideration also applies to many of the functions below.
|
|
for (;;) {
|
|
++(*pos);
|
|
c = s[*pos];
|
|
// This won't run off the end of the string, because
|
|
// MR_utf8_is_single_byte('\0') succeeds.
|
|
if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
return MR_TRUE;
|
|
}
|
|
|
|
MR_bool
|
|
MR_utf8_prev(const MR_String s_, MR_Integer *pos)
|
|
{
|
|
const unsigned char *s = (const unsigned char *) s_;
|
|
int c;
|
|
|
|
while (*pos > 0) {
|
|
(*pos)--;
|
|
c = s[*pos];
|
|
if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
|
|
return MR_TRUE;
|
|
}
|
|
}
|
|
|
|
return MR_FALSE;
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_get(const MR_String s_, MR_Integer pos)
|
|
{
|
|
const unsigned char *s = (const unsigned char *) s_;
|
|
int c;
|
|
int width;
|
|
|
|
c = s[pos];
|
|
if (MR_is_ascii(c)) {
|
|
return c;
|
|
} else {
|
|
return MR_utf8_get_mb(s_, pos, &width);
|
|
}
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_get_mb(const MR_String s_, MR_Integer pos, int *width)
|
|
{
|
|
const unsigned char *s = (const unsigned char *) s_;
|
|
int c;
|
|
int d;
|
|
int min_c;
|
|
|
|
c = s[pos];
|
|
|
|
// c <= 0x7f (ASCII) must be handled before calling this function.
|
|
|
|
if (c <= 0xC1) {
|
|
// Trailing byte of multi-byte sequence or an overlong encoding for
|
|
// code point <= 127.
|
|
return -2;
|
|
}
|
|
|
|
if (c <= 0xDF) {
|
|
// 2-byte sequence.
|
|
c &= 0x1F;
|
|
*width = 2;
|
|
min_c = 0x80;
|
|
}
|
|
else if (c <= 0xEF) {
|
|
// 3-byte sequence.
|
|
c &= 0x0F;
|
|
*width = 3;
|
|
min_c = 0x800;
|
|
}
|
|
else if (c <= 0xF4) {
|
|
// 4-byte sequence.
|
|
c &= 0x07;
|
|
*width = 4;
|
|
min_c = 0x10000;
|
|
}
|
|
else {
|
|
// Otherwise invalid.
|
|
return -2;
|
|
}
|
|
|
|
switch (*width) {
|
|
case 4:
|
|
d = s[++pos];
|
|
if (!MR_utf8_is_trail_byte(d)) {
|
|
return -2;
|
|
}
|
|
c = (c << 6) | (d & 0x3F);
|
|
// fall through
|
|
case 3:
|
|
d = s[++pos];
|
|
if (!MR_utf8_is_trail_byte(d)) {
|
|
return -2;
|
|
}
|
|
c = (c << 6) | (d & 0x3F);
|
|
// fall through
|
|
case 2:
|
|
d = s[++pos];
|
|
if (!MR_utf8_is_trail_byte(d)) {
|
|
return -2;
|
|
}
|
|
c = (c << 6) | (d & 0x3F);
|
|
break;
|
|
}
|
|
|
|
// Check for an overlong form, for a code point out of range, and
|
|
// for a surrogate code point.
|
|
if (c < min_c || c > 0x10FFFF || MR_is_surrogate(c)) {
|
|
return -2;
|
|
}
|
|
|
|
return c;
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_get_next(const MR_String s, MR_Integer *pos)
|
|
{
|
|
int c;
|
|
|
|
c = s[*pos];
|
|
if (MR_is_ascii(c)) {
|
|
(*pos)++;
|
|
return c;
|
|
}
|
|
|
|
return MR_utf8_get_next_mb(s, pos);
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_get_next_mb(const MR_String s, MR_Integer *pos)
|
|
{
|
|
int c, width;
|
|
|
|
c = MR_utf8_get_mb(s, *pos, &width);
|
|
if (c >= 0) {
|
|
// Multibyte code point.
|
|
(*pos) += width;
|
|
return c;
|
|
}
|
|
|
|
// Some invalid byte sequence. Skip to the start of the next character,
|
|
// but return the indication of the presence of an ill-formed character.
|
|
MR_utf8_next(s, pos);
|
|
return c;
|
|
}
|
|
|
|
MR_int_least32_t
|
|
MR_utf8_prev_get(const MR_String s, MR_Integer *pos)
|
|
{
|
|
int c, width;
|
|
|
|
if (MR_utf8_prev(s, pos)) {
|
|
c = s[*pos];
|
|
if (MR_is_ascii(c)) {
|
|
return c;
|
|
} else {
|
|
return MR_utf8_get_mb(s, *pos, &width);
|
|
}
|
|
}
|
|
|
|
// Past beginning.
|
|
return -1;
|
|
}
|
|
|
|
size_t
|
|
MR_utf8_width(MR_Char c)
|
|
{
|
|
// So we don't need to check for negative values *or* use unsigned ints
|
|
// in the interface, which are a pain.
|
|
MR_UnsignedChar uc = c;
|
|
|
|
if (uc <= 0x7f) {
|
|
return 1;
|
|
}
|
|
if (uc <= 0x7ff) {
|
|
return 2;
|
|
}
|
|
if (uc <= 0xffff) {
|
|
return (MR_is_surrogate(uc)) ? 0 : 3;
|
|
}
|
|
if (uc <= 0x10ffff) {
|
|
return 4;
|
|
}
|
|
|
|
// The rest are illegal.
|
|
return 0;
|
|
}
|
|
|
|
size_t
|
|
MR_utf8_encode(char s_[], MR_Char c)
|
|
{
|
|
unsigned char *s = (unsigned char *) s_;
|
|
MR_UnsignedChar uc = c;
|
|
|
|
if (uc <= 0x7f) {
|
|
s[0] = uc;
|
|
return 1;
|
|
}
|
|
|
|
if (uc <= 0x7ff) {
|
|
s[0] = 0xC0 | ((uc >> 6) & 0x1F);
|
|
s[1] = 0x80 | (uc & 0x3F);
|
|
return 2;
|
|
}
|
|
|
|
if (uc <= 0xffff) {
|
|
if (MR_is_surrogate(uc)) {
|
|
return 0;
|
|
}
|
|
s[0] = 0xE0 | ((uc >> 12) & 0x0F);
|
|
s[1] = 0x80 | ((uc >> 6) & 0x3F);
|
|
s[2] = 0x80 | (uc & 0x3F);
|
|
return 3;
|
|
}
|
|
|
|
if (uc <= 0x10ffff) {
|
|
s[0] = 0xF0 | ((uc >> 18) & 0x07);
|
|
s[1] = 0x80 | ((uc >> 12) & 0x3F);
|
|
s[2] = 0x80 | ((uc >> 6) & 0x3F);
|
|
s[3] = 0x80 | (uc & 0x3F);
|
|
return 4;
|
|
}
|
|
|
|
// Otherwise is illegal.
|
|
return 0;
|
|
}
|
|
|
|
MR_bool
|
|
MR_utf8_verify(const MR_String s)
|
|
{
|
|
MR_Integer pos = 0;
|
|
|
|
for (;;) {
|
|
MR_int_least32_t c;
|
|
|
|
c = MR_utf8_get_next(s, &pos);
|
|
if (c == 0) {
|
|
return MR_TRUE;
|
|
}
|
|
if (c < 0) {
|
|
return MR_FALSE;
|
|
}
|
|
}
|
|
}
|
|
|
|
MR_Integer
|
|
MR_utf8_find_ill_formed_char(const MR_String s)
|
|
{
|
|
MR_Integer pos = 0;
|
|
|
|
for (;;) {
|
|
MR_int_least32_t c;
|
|
|
|
c = MR_utf8_get_next(s, &pos);
|
|
if (c == 0) {
|
|
return -1;
|
|
}
|
|
if (c < 0) {
|
|
return pos;
|
|
}
|
|
}
|
|
}
|
|
|
|
#if defined(MR_WIN32)
|
|
wchar_t *
|
|
MR_utf8_to_wide(const char *s)
|
|
{
|
|
int wslen;
|
|
wchar_t *ws;
|
|
|
|
wslen = MultiByteToWideChar(CP_UTF8, 0, s, -1, NULL, 0);
|
|
if (wslen == 0) {
|
|
MR_fatal_error("MR_utf8_to_wide: MultiByteToWideChar failed");
|
|
}
|
|
ws = MR_GC_NEW_ARRAY(wchar_t, wslen);
|
|
if (0 == MultiByteToWideChar(CP_UTF8, 0, s, -1, ws, wslen)) {
|
|
MR_fatal_error("MR_utf8_to_wide: MultiByteToWideChar failed");
|
|
}
|
|
return ws;
|
|
}
|
|
|
|
char *
|
|
MR_wide_to_utf8(const wchar_t *ws, MR_AllocSiteInfoPtr alloc_id)
|
|
{
|
|
char *s;
|
|
int bytes;
|
|
|
|
bytes = WideCharToMultiByte(CP_UTF8, 0, ws, -1, NULL, 0, NULL, NULL);
|
|
if (bytes == 0) {
|
|
MR_fatal_error("MR_wide_to_utf8: WideCharToMultiByte failed");
|
|
}
|
|
MR_allocate_aligned_string_msg(s, bytes, alloc_id);
|
|
if (0 == WideCharToMultiByte(CP_UTF8, 0, ws, -1, s, bytes, NULL, NULL)) {
|
|
MR_fatal_error("MR_wide_to_utf8: WideCharToMultiByte failed");
|
|
}
|
|
return s;
|
|
}
|
|
#endif // MR_WIN32
|