mercury/runtime/mercury_string.h

/*
** Copyright (C) 1995-2005, 2007, 2011-2012 The University of Melbourne.
** This file may only be copied under the terms of the GNU Library General
** Public License - see the file COPYING.LIB in the Mercury distribution.
*/

/* mercury_string.h - string handling */

#ifndef MERCURY_STRING_H
#define MERCURY_STRING_H

#include "mercury_heap.h"	/* for MR_offset_incr_hp_atomic */

#include <string.h>	/* for strcmp() etc. */
#include <stdarg.h>

/*
** Mercury characters (Unicode code points) are given type `MR_Char', which is
** a typedef for `MR_int_least32_t'.
** Mercury strings are stored as pointers to '\0'-terminated arrays of `char'.
** Strings are UTF-8 encoded.
** Mercury strings must not contain null characters. Unexpected null characters
** are a source of security vulnerabilities.
**
** The actual typedefs are in mercury_types.h to avoid problems with
** circular #includes.
**
** typedef MR_int_least32_t MR_Char;
** typedef MR_uint_least32_t MR_UnsignedChar;
**
** typedef char *MR_String;
** typedef const char *MR_ConstString;
*/

/*
** MR_string_const("...", len):
**	Given a C string literal and its length, returns a Mercury string.
**	The `len' argument is accepted but not used by this definition.
**	The argument is parenthesized before the cast so that an expression
**	argument (e.g. a conditional expression) is cast as a whole.
**
** MR_make_string_const("..."):
**	Same, but the length is computed from the argument with sizeof
**	(minus one for the terminating NUL), so the argument must be
**	a string literal or a char array, not a pointer.
*/

#define MR_string_const(string, len) ((MR_String) (string))

#define MR_make_string_const(string) \
		MR_string_const((string), sizeof(string) - 1)

/*
** MR_bool MR_string_equal(MR_ConstString s1, MR_ConstString s2):
** Return true iff the two Mercury strings s1 and s2 are equal,
** i.e. iff strcmp() reports no difference between them.
**
** The arguments are cast to `const char *' (not `char *') so that
** the macro never casts away constness.
*/

#define MR_string_equal(s1,s2) \
		(strcmp((const char *)(s1), (const char *)(s2)) == 0)

/*
** void MR_make_aligned_string(MR_String ptr, const char *string):
**	Set `ptr' (which must be an lvalue) to a Mercury string with the
**	same contents as the C string `string'.
**	If the resulting Mercury string is to be used by Mercury code,
**	then the string pointed to by `string' should have been either
**	statically allocated or allocated on the Mercury heap.
**
** BEWARE: this may modify `MR_hp', so it must only be called from
** places where `MR_hp' is valid.  If calling it from inside a C function,
** rather than inside Mercury code, you may need to call
** MR_{save/restore}_transient_hp().
**
** How it works: when MR_tag() of the address of `string' is zero the
** string is already aligned, so `ptr' is simply pointed at it.
** Otherwise MR_make_aligned_string_copy() is used to place a copy
** on the heap.
*/

#define MR_make_aligned_string(ptr, string)				\
	do {								\
		if (MR_tag((MR_Word) (string)) == 0) {			\
			/* Aligned already; the cast drops const, if any */ \
			(ptr) = (MR_String) (string);			\
		} else {						\
			MR_make_aligned_string_copy((ptr), (string));	\
		}							\
	} while (0)

/*
** void MR_make_aligned_string_copy(MR_String ptr, const char * string):
** Same as MR_make_aligned_string(ptr, string), except that the string
** is guaranteed to be copied. This is useful for copying C strings
** onto the Mercury heap.
**
** void MR_make_aligned_string_copy_msg(MR_String ptr, const char * string,
**	alloc_id):
** Same, but `alloc_id' is passed on to MR_offset_incr_hp_atomic_msg()
** together with the type name "string.string/0".
** MR_make_aligned_string_copy() passes NULL for `alloc_id'.
**
** `string' is evaluated exactly once: the pointer and its length are
** captured in locals before the allocation, so an argument with side
** effects cannot make the copy disagree with the size that was allocated.
** The allocation is (strlen + sizeof(MR_Word)) / sizeof(MR_Word) words,
** which always leaves room for the terminating NUL.
**
** BEWARE: this may modify `MR_hp', so it must only be called from
** places where `MR_hp' is valid.  If calling it from inside a C function,
** rather than inside Mercury code, you may need to call
** MR_{save/restore}_transient_hp().
*/

#define MR_make_aligned_string_copy(ptr, string)                              \
	MR_make_aligned_string_copy_msg((ptr), (string), NULL)

#define MR_make_aligned_string_copy_msg(ptr, string, alloc_id)                \
	do {                                                                  \
		const char	*make_aligned_string_src = (string);          \
		size_t		make_aligned_string_len =                     \
					strlen(make_aligned_string_src);      \
		MR_Word		make_aligned_string_tmp;                      \
		char		*make_aligned_string_ptr;                     \
                                                                              \
		MR_offset_incr_hp_atomic_msg(make_aligned_string_tmp, 0,      \
			(make_aligned_string_len + sizeof(MR_Word))           \
				/ sizeof(MR_Word),                            \
			(alloc_id), "string.string/0");                       \
		make_aligned_string_ptr =                                     \
			(char *) make_aligned_string_tmp;                     \
		/* length + 1 so that the terminating NUL is copied too */    \
		memcpy(make_aligned_string_ptr, make_aligned_string_src,      \
			make_aligned_string_len + 1);                         \
		(ptr) = make_aligned_string_ptr;                              \
	} while(0)

/*
** void MR_make_aligned_string_copy_saved_hp(MR_String ptr,
** 	const char * string, alloc_id):
** Same as MR_make_aligned_string_copy_msg(ptr, string, alloc_id), except
** that it allocates with MR_offset_incr_saved_hp_atomic instead of
** MR_offset_incr_hp_atomic_msg.
**
** `string' is evaluated exactly once: the pointer and its length are
** captured in locals before the allocation, so an argument with side
** effects cannot make the copy disagree with the size that was allocated.
*/

#define MR_make_aligned_string_copy_saved_hp(ptr, string, alloc_id)           \
	do {                                                                  \
		const char	*make_aligned_string_src = (string);          \
		size_t		make_aligned_string_len =                     \
					strlen(make_aligned_string_src);      \
		MR_Word		make_aligned_string_tmp;                      \
		char		*make_aligned_string_ptr;                     \
                                                                              \
		MR_offset_incr_saved_hp_atomic(make_aligned_string_tmp,	0,    \
		    (make_aligned_string_len + sizeof(MR_Word))               \
			/ sizeof(MR_Word),                                    \
		    (alloc_id), "string.string/0");                           \
		make_aligned_string_ptr =                                     \
		    (char *) make_aligned_string_tmp;                         \
		/* length + 1 so that the terminating NUL is copied too */    \
		memcpy(make_aligned_string_ptr, make_aligned_string_src,      \
		    make_aligned_string_len + 1);                             \
		(ptr) = make_aligned_string_ptr;                              \
	} while(0)

/*
** void MR_make_aligned_string_copy_saved_hp_quote(MR_String ptr,
**		const char * string, alloc_id):
** Same as MR_make_aligned_string_copy_saved_hp(ptr, string, alloc_id),
** except that it puts double quote marks at the start and end of the
** string.  The contents of `string' are copied verbatim; nothing inside
** the string is escaped.
**
** The result is built with direct stores and memcpy() rather than
** sprintf(), so this macro does not depend on <stdio.h>, and `string'
** is evaluated exactly once.  The allocation reserves room for the
** string, the two quote marks and the terminating NUL.
*/

#define MR_make_aligned_string_copy_saved_hp_quote(ptr, string, alloc_id)     \
	do {                                                                  \
		const char	*make_aligned_string_src = (string);          \
		size_t		make_aligned_string_len =                     \
					strlen(make_aligned_string_src);      \
		MR_Word		make_aligned_string_tmp;                      \
		char		*make_aligned_string_ptr;                     \
                                                                              \
		MR_offset_incr_saved_hp_atomic(make_aligned_string_tmp,	0,    \
		    (make_aligned_string_len + 2 + sizeof(MR_Word))           \
			/ sizeof(MR_Word),                                    \
		    (alloc_id), "string.string/0");                           \
		make_aligned_string_ptr =                                     \
		    (char *) make_aligned_string_tmp;                         \
		make_aligned_string_ptr[0] = '"';                             \
		memcpy(make_aligned_string_ptr + 1, make_aligned_string_src,  \
		    make_aligned_string_len);                                 \
		make_aligned_string_ptr[make_aligned_string_len + 1] = '"';   \
		make_aligned_string_ptr[make_aligned_string_len + 2] = '\0';  \
		(ptr) = make_aligned_string_ptr;                              \
	} while(0)

/*
** void MR_allocate_aligned_string_msg(MR_String ptr, size_t len,
**	alloc_id):
** Set `ptr' (an lvalue) to freshly allocated, word aligned memory that
** can hold `len' characters: (len + sizeof(MR_Word)) / sizeof(MR_Word)
** words are requested, which always leaves at least one byte beyond
** `len'.  The memory is not initialised by this macro.
** `alloc_id' is handed to MR_offset_incr_hp_atomic_msg() together with
** the type name "string.string/0", so that the allocation site can be
** recorded if memory profiling is enabled.
**
** BEWARE: this may modify `MR_hp', so it must only be called from
** places where `MR_hp' is valid.  If calling it from inside a C function,
** rather than inside Mercury code, you may need to call
** MR_{save/restore}_transient_hp().
*/

#define MR_allocate_aligned_string_msg(ptr, len, alloc_id)                    \
	do {                                                                  \
		MR_Word	aligned_string_word;                                  \
                                                                              \
		MR_offset_incr_hp_atomic_msg(aligned_string_word, 0,          \
		    ((len) + sizeof(MR_Word)) / sizeof(MR_Word),              \
		    (alloc_id), "string.string/0");                           \
		(ptr) = (char *) aligned_string_word;                         \
	} while (0)

/*
** void MR_allocate_aligned_string_saved_hp(MR_String ptr, size_t len,
**	alloc_id):
** Set `ptr' (an lvalue) to freshly allocated memory that can hold `len'
** characters, requesting (len + sizeof(MR_Word)) / sizeof(MR_Word) words
** from MR_offset_incr_saved_hp_atomic().  `alloc_id' and the type name
** "string.string/0" are passed through to that macro.
** The memory is not initialised by this macro.
*/

#define MR_allocate_aligned_string_saved_hp(ptr, len, alloc_id)               \
	do {                                                                  \
		MR_Word	aligned_string_word;                                  \
                                                                              \
		MR_offset_incr_saved_hp_atomic(aligned_string_word, 0,        \
		    ((len) + sizeof(MR_Word)) / sizeof(MR_Word),              \
		    (alloc_id), "string.string/0");                           \
		(ptr) = (char *) aligned_string_word;                         \
	} while (0)

/*
** MR_do_hash_string{,2,3}(int & hash, MR_ConstString s):
** Given a Mercury string `s', set `hash' to the hash value
** for that string.  (`hash' must be an lvalue.)
** Both arguments are evaluated more than once.
**
** This is an implementation detail used to implement MR_hash_string{,2,3}().
** It should not be used directly. Use MR_hash_string{,2,3}() instead.
**
** Note that these functions are also defined in library/string.m.
** The definition here and in string.m must be kept equivalent.
**
** MR_do_hash_string: for each byte of the string (taken as unsigned char),
** xor the hash with itself shifted left by 5, then xor in the byte.
** Finally xor in the number of bytes.
*/

#define MR_do_hash_string(hash, s)					\
	{								\
		int hash_str_len;					\
		MR_CHECK_EXPR_TYPE(hash, int);				\
		MR_CHECK_EXPR_TYPE(s, MR_ConstString);			\
		(hash) = 0;						\
		for (hash_str_len = 0;					\
			((const unsigned char *)(s))[hash_str_len] != 0; \
			hash_str_len++)					\
		{							\
			(hash) ^= ((hash) << 5);			\
			(hash) ^= ((const unsigned char *)(s))[hash_str_len]; \
		}							\
		(hash) ^= hash_str_len;					\
	}

/*
** MR_do_hash_string2(hash, s): second string hash.
** For each byte of `s' (taken as unsigned char), multiply the hash by 37
** and add the byte; finally xor in the number of bytes.
** `hash' must be an lvalue; both arguments are evaluated more than once.
*/

#define MR_do_hash_string2(hash, s)					\
	{								\
		int hash_str_len;					\
		MR_CHECK_EXPR_TYPE(hash, int);				\
		MR_CHECK_EXPR_TYPE(s, MR_ConstString);			\
		(hash) = 0;						\
		for (hash_str_len = 0;					\
			((const unsigned char *)(s))[hash_str_len] != 0; \
			hash_str_len++)					\
		{							\
			(hash) = (hash) * 37;				\
			(hash) += ((const unsigned char *)(s))[hash_str_len]; \
		}							\
		(hash) ^= hash_str_len;					\
	}

/*
** MR_do_hash_string3(hash, s): third string hash.
** For each byte of `s' (taken as unsigned char), multiply the hash by 49
** and add the byte; finally xor in the number of bytes.
** `hash' must be an lvalue; both arguments are evaluated more than once.
*/

#define MR_do_hash_string3(hash, s)					\
	{								\
		int hash_str_len;					\
		MR_CHECK_EXPR_TYPE(hash, int);				\
		MR_CHECK_EXPR_TYPE(s, MR_ConstString);			\
		(hash) = 0;						\
		for (hash_str_len = 0;					\
			((const unsigned char *)(s))[hash_str_len] != 0; \
			hash_str_len++)					\
		{							\
			(hash) = (hash) * 49;				\
			(hash) += ((const unsigned char *)(s))[hash_str_len]; \
		}							\
		(hash) ^= hash_str_len;					\
	}

/*
** MR_hash_string{,2,3}(s):
**	Given a Mercury string `s', return a hash value for that string.
**
** These are the function declarations.  When MR_GNUC is defined, this
** header also defines macros with the same three names, which are then
** used in place of calls to these functions.
*/

MR_Integer	MR_hash_string(MR_ConstString);
MR_Integer	MR_hash_string2(MR_ConstString);
MR_Integer	MR_hash_string3(MR_ConstString);

/*
** With gcc, MR_hash_string{,2,3}() are implemented inline as statement
** expressions wrapped around MR_do_hash_string{,2,3}().
**
** MR_do_hash_string{,2,3}() evaluate their string argument on every
** iteration of the hashing loop.  To keep these macros interchangeable
** with the functions of the same names, the argument `s' is therefore
** copied into a local first, so that it is evaluated exactly once.
** The explicit cast keeps the macro usable with any argument that the
** cast inside MR_do_hash_string{,2,3}() would have accepted.
*/

#if defined(MR_GNUC)
#define MR_hash_string(s)						\
	({								\
		MR_Integer	hash_string_result;			\
		MR_ConstString	hash_string_arg;			\
		MR_CHECK_EXPR_TYPE(s, MR_ConstString);			\
		hash_string_arg = (MR_ConstString) (s);			\
		MR_do_hash_string(hash_string_result, hash_string_arg);	\
		hash_string_result;					\
	})

#define MR_hash_string2(s)						\
	({								\
		MR_Integer	hash_string_result;			\
		MR_ConstString	hash_string_arg;			\
		MR_CHECK_EXPR_TYPE(s, MR_ConstString);			\
		hash_string_arg = (MR_ConstString) (s);			\
		MR_do_hash_string2(hash_string_result, hash_string_arg); \
		hash_string_result;					\
	})

#define MR_hash_string3(s)						\
	({								\
		MR_Integer	hash_string_result;			\
		MR_ConstString	hash_string_arg;			\
		MR_CHECK_EXPR_TYPE(s, MR_ConstString);			\
		hash_string_arg = (MR_ConstString) (s);			\
		MR_do_hash_string3(hash_string_result, hash_string_arg); \
		hash_string_result;					\
	})
#endif

/*
** If we are not using gcc, the actual definitions of these functions
** are in runtime/mercury_string.c; they use the macros below.
**
** Each macro expands to a complete function body: it declares a result
** variable, fills it in with the matching MR_do_hash_string{,2,3}() macro
** and returns it.  The enclosing function must name its string
** parameter `s'.
*/

#define MR_HASH_STRING_FUNC_BODY					\
	   MR_Integer hash_value;					\
	   MR_do_hash_string(hash_value, s);				\
	   return hash_value;
#define MR_HASH_STRING2_FUNC_BODY					\
	   MR_Integer hash_value;					\
	   MR_do_hash_string2(hash_value, s);				\
	   return hash_value;
#define MR_HASH_STRING3_FUNC_BODY					\
	   MR_Integer hash_value;					\
	   MR_do_hash_string3(hash_value, s);				\
	   return hash_value;

/*
** MR_strcmp(s, t):
** strcmp() for arguments that are Mercury words: both arguments are
** cast to `const char *' here, so callers need not cast them first.
** The result is that of strcmp().
*/

#define MR_strcmp(s, t) \
		(strcmp((const char *) (s), (const char *) (t)))

/*
** Return an MR_String which has been created using the format string,
** fmt, passed to sprintf; the variable arguments supply the values for
** the conversions in fmt.  If memory profiling is turned on, record the
** allocation as coming from alloc_id.  The MR_String returned has been
** allocated on the mercury heap using MR_allocate_aligned_string_msg.
** NOTE(review): the warning below speaks of the *saved* copy of `MR_hp';
** confirm against the definition which allocation macro is really used.
**
** BEWARE: this may modify the saved copy of `MR_hp', so it must only be
** called from places where the saved copy of `MR_hp' is valid.
** You will generally need to call MR_{save/restore}_transient_hp()
** before/after calling this function.
*/

MR_String MR_make_string(MR_AllocSiteInfoPtr alloc_id, const char *fmt, ...);

/*
** True if c is an ASCII code point, i.e. U+0000..U+007f.
** The argument is converted to unsigned before the comparison,
** so negative values are never reported as ASCII.
*/
#define MR_is_ascii(c)              (((unsigned) (c)) < 0x80)

/*
** True if c is a Unicode surrogate code point, i.e. U+D800..U+DFFF.
**
** This is a range test on the whole value: after conversion to unsigned,
** subtracting 0xD800 maps the surrogate range onto 0..0x7FF and makes
** every other value (including negative ones) wrap to something larger.
** Testing only bits 11..15 would wrongly accept code points beyond the
** Basic Multilingual Plane such as U+1D800.
*/
#define MR_is_surrogate(c)          ((((unsigned) (c)) - 0xD800u) < 0x800u)

/*
** UTF-8 manipulation
**
** Byte classification.  Each macro looks only at the low eight bits of
** its argument, so all three give the same answer whether the byte is
** passed as an `unsigned char' or as a plain `char' that has been
** sign-extended to a negative value.
**
** MR_utf8_is_single_byte(c): bit 7 is clear (0x00..0x7F).
** MR_utf8_is_lead_byte(c):   the byte is in the range 0xC0..0xFD.
** MR_utf8_is_trail_byte(c):  the top two bits are 10 (0x80..0xBF).
*/

#define MR_utf8_is_single_byte(c)   (((unsigned)(c) & 0x80) == 0)
#define MR_utf8_is_lead_byte(c)     ((((unsigned)(c) & 0xFF) - 0xC0) < 0x3E)
#define MR_utf8_is_trail_byte(c)    (((unsigned)(c) & 0xC0) == 0x80)

/*
** Advance `*pos' to the beginning of the next code point in `s_'.
** If `*pos' is already at the end of the string then return MR_FALSE
** without modifying `*pos'.
** (Presumably returns MR_TRUE whenever `*pos' was advanced; confirm
** against the definition.)
**
** Note that since MR_String is a typedef for `char *', the `const' in
** `const MR_String' applies to the pointer itself, not to the characters.
*/
extern MR_bool  MR_utf8_next(const MR_String s_, MR_Integer *pos);

/*
** Rewind `*pos' to the beginning of the previous code point in `s_'.
** If `*pos' is already at the beginning of the string then return MR_FALSE
** without modifying `*pos'.
** (Presumably returns MR_TRUE whenever `*pos' was moved back; confirm
** against the definition.)
*/
extern MR_bool  MR_utf8_prev(const MR_String s_, MR_Integer *pos);

/*
** Decode and return the code point beginning at `pos' in `s'.
** Return 0 if at the end of the string (i.e. the NUL terminator).
** If an illegal code sequence exists at that offset, return -2.
**
** The _mb version requires s[pos] to be the lead byte of a multibyte code
** point.
** NOTE(review): `width' is an output parameter; presumably it receives the
** number of bytes occupied by the decoded code point -- confirm against
** the definition.
*/
extern MR_int_least32_t MR_utf8_get(const MR_String s, MR_Integer pos);
extern MR_int_least32_t MR_utf8_get_mb(const MR_String s, MR_Integer pos,
    int *width);

/*
** Decode the code point beginning at `*pos' in `s', and advance `*pos'.
** The decoded code point is the return value.
** The _mb version requires s[*pos] to be the lead byte of a multibyte code
** point.
** NOTE(review): the results at the end of the string and for an illegal
** sequence are not stated here; presumably they match MR_utf8_get()
** (0 and -2 respectively) -- confirm against the definition.
*/
extern MR_int_least32_t MR_utf8_get_next(const MR_String s, MR_Integer *pos);
extern MR_int_least32_t MR_utf8_get_next_mb(const MR_String s,
    MR_Integer *pos);

/*
** Rewind `*pos' to the beginning of the previous code point in `s'
** and return that code point.
** Return -1 if `*pos' is already at the beginning of the string.
*/
extern MR_int_least32_t MR_utf8_prev_get(const MR_String s, MR_Integer *pos);

/*
** Return the number of bytes required to encode the code point `c' in UTF-8.
** NOTE(review): the result for values of `c' that are not valid code
** points is not stated here; check the definition before relying on it.
*/
extern size_t   MR_utf8_width(MR_Char c);

/*
** Encode the code point `c' into the buffer `s'.
** Return the number of bytes used.
** NOTE(review): this declaration does not show how large `s' must be or
** whether a NUL terminator is written; presumably the caller must provide
** at least MR_utf8_width(c) bytes -- confirm against the definition.
*/
extern size_t   MR_utf8_encode(char s[], MR_Char c);

/*
** Return MR_TRUE iff `s' contains a valid UTF-8 encoded string.
** (Presumably MR_FALSE is returned otherwise, and `s' is examined up to
** its NUL terminator; confirm against the definition.)
*/
extern MR_bool  MR_utf8_verify(const MR_String s);

#endif /* not MERCURY_STRING_H */