/* ** Copyright (C) 1995-2005, 2007, 2011-2012 The University of Melbourne. ** This file may only be copied under the terms of the GNU Library General ** Public License - see the file COPYING.LIB in the Mercury distribution. */ /* mercury_string.h - string handling */ #ifndef MERCURY_STRING_H #define MERCURY_STRING_H #include "mercury_heap.h" /* for MR_offset_incr_hp_atomic */ #include /* for strcmp() etc. */ #include /* ** Mercury characters (Unicode code points) are given type `MR_Char', which is ** a typedef for `MR_int_least32_t'. ** Mercury strings are stored as pointers to '\0'-terminated arrays of `char'. ** Strings are UTF-8 encoded. ** Mercury strings must not contain null characters. Unexpected null characters ** are a source of security vulnerabilities. ** ** The actual typedefs are in mercury_types.h to avoid problems with ** circular #includes. ** ** typedef MR_int_least32_t MR_Char; ** typedef MR_uint_least32_t MR_UnsignedChar; ** ** typedef char *MR_String; ** typedef const char *MR_ConstString; */ /* ** MR_string_const("...", len): ** Given a C string literal and its length, returns a Mercury string. */ #define MR_string_const(string, len) ((MR_String) string) #define MR_make_string_const(string) \ MR_string_const((string), sizeof(string) - 1) /* ** MR_bool MR_string_equal(MR_ConstString s1, MR_ConstString s2): ** Return true iff the two Mercury strings s1 and s2 are equal. */ #define MR_string_equal(s1,s2) (strcmp((char*)(s1),(char*)(s2))==0) /* ** void MR_make_aligned_string(MR_String ptr, const char *string): ** Given a C string `string', set `ptr' to be a Mercury string ** with the same contents. (`ptr' must be an lvalue.) ** If the resulting Mercury string is to be used by Mercury code, ** then the string pointed to by `string' should have been either ** statically allocated or allocated on the Mercury heap. ** ** BEWARE: this may modify `MR_hp', so it must only be called from ** places where `MR_hp' is valid. If calling it from inside a C function, ** rather than inside Mercury code, you may need to call ** MR_{save/restore}_transient_hp(). ** ** Algorithm: if the string is aligned, just set ptr equal to it. ** Otherwise, allocate space on the heap and copy the C string to ** the Mercury string. */ #define MR_make_aligned_string(ptr, string) \ do { \ if (MR_tag((MR_Word) (string)) != 0) { \ MR_make_aligned_string_copy((ptr), (string)); \ } else { \ /* The cast is there to cast away const, if needed */ \ (ptr) = (MR_String) (string); \ } \ } while(0) /* ** void MR_make_aligned_string_copy(MR_String ptr, const char * string): ** Same as MR_make_aligned_string(ptr, string), except that the string ** is guaranteed to be copied. This is useful for copying C strings ** onto the Mercury heap. ** ** BEWARE: this may modify `MR_hp', so it must only be called from ** places where `MR_hp' is valid. If calling it from inside a C function, ** rather than inside Mercury code, you may need to call ** MR_{save/restore}_transient_hp(). */ #define MR_make_aligned_string_copy(ptr, string) \ MR_make_aligned_string_copy_msg((ptr), (string), NULL) #define MR_make_aligned_string_copy_msg(ptr, string, alloc_id) \ do { \ MR_Word make_aligned_string_tmp; \ char *make_aligned_string_ptr; \ \ MR_offset_incr_hp_atomic_msg(make_aligned_string_tmp, 0, \ (strlen(string) + sizeof(MR_Word)) / sizeof(MR_Word), \ (alloc_id), "string.string/0"); \ make_aligned_string_ptr = \ (char *) make_aligned_string_tmp; \ strcpy(make_aligned_string_ptr, (string)); \ (ptr) = make_aligned_string_ptr; \ } while(0) /* ** void MR_make_aligned_string_copy_saved_hp(MR_String ptr, ** const char * string): ** Same as MR_make_aligned_string_copy(ptr, string), except that it uses ** MR_offset_incr_saved_hp_atomic instead of MR_offset_incr_hp_atomic. */ #define MR_make_aligned_string_copy_saved_hp(ptr, string, alloc_id) \ do { \ MR_Word make_aligned_string_tmp; \ char *make_aligned_string_ptr; \ \ MR_offset_incr_saved_hp_atomic(make_aligned_string_tmp, 0, \ (strlen(string) + sizeof(MR_Word)) / sizeof(MR_Word), \ (alloc_id), "string.string/0"); \ make_aligned_string_ptr = \ (char *) make_aligned_string_tmp; \ strcpy(make_aligned_string_ptr, (string)); \ (ptr) = make_aligned_string_ptr; \ } while(0) /* ** void MR_make_aligned_string_copy_saved_hp_quote(MR_String ptr, ** const char * string): ** Same as MR_make_aligned_string_copy_saved_hp(ptr, string), except that ** it puts double quote marks at the start and end of the string. */ #define MR_make_aligned_string_copy_saved_hp_quote(ptr, string, alloc_id) \ do { \ MR_Word make_aligned_string_tmp; \ char *make_aligned_string_ptr; \ \ MR_offset_incr_saved_hp_atomic(make_aligned_string_tmp, 0, \ (strlen(string) + 2 + sizeof(MR_Word)) / sizeof(MR_Word), \ (alloc_id), "string.string/0"); \ make_aligned_string_ptr = \ (char *) make_aligned_string_tmp; \ sprintf(make_aligned_string_ptr, "%c%s%c", '"', string, '"'); \ (ptr) = make_aligned_string_ptr; \ } while(0) /* ** void MR_allocate_aligned_string_msg(MR_String ptr, size_t len, ** MR_Code *proclabel): ** Allocate enough word aligned memory to hold len characters. Also ** record for memory profiling purposes the location, proclabel, of the ** allocation if profiling is enabled. ** ** BEWARE: this may modify `MR_hp', so it must only be called from ** places where `MR_hp' is valid. If calling it from inside a C function, ** rather than inside Mercury code, you may need to call ** MR_{save/restore}_transient_hp(). */ #define MR_allocate_aligned_string_msg(ptr, len, alloc_id) \ do { \ MR_Word make_aligned_string_tmp; \ char *make_aligned_string_ptr; \ \ MR_offset_incr_hp_atomic_msg(make_aligned_string_tmp, 0, \ ((len) + sizeof(MR_Word)) / sizeof(MR_Word), \ (alloc_id), "string.string/0"); \ make_aligned_string_ptr = \ (char *) make_aligned_string_tmp; \ (ptr) = make_aligned_string_ptr; \ } while(0) #define MR_allocate_aligned_string_saved_hp(ptr, len, alloc_id) \ do { \ MR_Word make_aligned_string_tmp; \ char *make_aligned_string_ptr; \ \ MR_offset_incr_saved_hp_atomic(make_aligned_string_tmp, 0, \ ((len) + sizeof(MR_Word)) / sizeof(MR_Word), \ (alloc_id), "string.string/0"); \ make_aligned_string_ptr = \ (char *) make_aligned_string_tmp; \ (ptr) = make_aligned_string_ptr; \ } while(0) /* ** MR_do_hash_string{,2,3}(int & hash, MR_Word string): ** Given a Mercury string `string', set `hash' to the hash value ** for that string. (`hash' must be an lvalue.) ** ** This is an implementation detail used to implement MR_hash_string{,2,3}(). ** It should not be used directly. Use MR_hash_string{,2,3}() instead. ** ** Note that these functions are also defined in library/string.m. ** The definition here and in string.m must be kept equivalent. */ #define MR_do_hash_string(hash, s) \ { \ int len; \ MR_CHECK_EXPR_TYPE(hash, int); \ MR_CHECK_EXPR_TYPE(s, MR_ConstString); \ len = 0; \ hash = 0; \ while (((const unsigned char *)(s))[len]) { \ hash ^= (hash << 5); \ hash ^= ((const unsigned char *)(s))[len]; \ len++; \ } \ hash ^= len; \ } #define MR_do_hash_string2(hash, s) \ { \ int len; \ MR_CHECK_EXPR_TYPE(hash, int); \ MR_CHECK_EXPR_TYPE(s, MR_ConstString); \ len = 0; \ hash = 0; \ while (((const unsigned char *)(s))[len]) { \ hash = hash * 37; \ hash += ((const unsigned char *)(s))[len]; \ len++; \ } \ hash ^= len; \ } #define MR_do_hash_string3(hash, s) \ { \ int len; \ MR_CHECK_EXPR_TYPE(hash, int); \ MR_CHECK_EXPR_TYPE(s, MR_ConstString); \ len = 0; \ hash = 0; \ while (((const unsigned char *)(s))[len]) { \ hash = hash * 49; \ hash += ((const unsigned char *)(s))[len]; \ len++; \ } \ hash ^= len; \ } /* ** MR_hash_string{,2,3}(s): ** Given a Mercury string `s', return a hash value for that string. */ MR_Integer MR_hash_string(MR_ConstString); MR_Integer MR_hash_string2(MR_ConstString); MR_Integer MR_hash_string3(MR_ConstString); #if defined(MR_GNUC) #define MR_hash_string(s) \ ({ \ MR_Integer hash_string_result; \ MR_CHECK_EXPR_TYPE(s, MR_ConstString); \ MR_do_hash_string(hash_string_result, s); \ hash_string_result; \ }) #define MR_hash_string2(s) \ ({ \ MR_Integer hash_string_result; \ MR_CHECK_EXPR_TYPE(s, MR_ConstString); \ MR_do_hash_string2(hash_string_result, s); \ hash_string_result; \ }) #define MR_hash_string3(s) \ ({ \ MR_Integer hash_string_result; \ MR_CHECK_EXPR_TYPE(s, MR_ConstString); \ MR_do_hash_string3(hash_string_result, s); \ hash_string_result; \ }) #endif /* ** If we are not using gcc, the actual definitions of these functions ** are runtime/mercury_string.c; they use the macros below. */ #define MR_HASH_STRING_FUNC_BODY \ MR_Integer hash_string_result; \ MR_do_hash_string(hash_string_result, s); \ return hash_string_result; #define MR_HASH_STRING2_FUNC_BODY \ MR_Integer hash_string_result; \ MR_do_hash_string2(hash_string_result, s); \ return hash_string_result; #define MR_HASH_STRING3_FUNC_BODY \ MR_Integer hash_string_result; \ MR_do_hash_string3(hash_string_result, s); \ return hash_string_result; /* ** A version of strcmp to which we can pass Mercury words ** without having to cast the arguments first. */ #define MR_strcmp(s, t) strcmp((const char *)(s), (const char *)(t)) /* ** Return an MR_String which has been created using the format string, ** fmt, passed to sprintf. If memory profiling is turned on, record the ** allocation as coming from proclabel. The MR_String returned has been ** allocated on the mercury heap using MR_allocate_aligned_string_msg. ** ** BEWARE: this may modify the saved copy of `MR_hp', so it must only be ** called from places where the saved copy of `MR_hp' is valid. ** You will generally need to call MR_{save/restore}_transient_hp() ** before/after calling this function. */ MR_String MR_make_string(MR_AllocSiteInfoPtr alloc_id, const char *fmt, ...); /* ** True if c is an ASCII code point, i.e. U+0000..U+007f. */ #define MR_is_ascii(c) ((unsigned)(c) <= 0x7f) /* ** True if c is a Unicode surrogate code point, i.e. U+D800..U+DFFF. */ #define MR_is_surrogate(c) (((unsigned)(c) & 0xF800) == 0xD800) /* ** UTF-8 manipulation */ #define MR_utf8_is_single_byte(c) (((unsigned)(c) & 0x80) == 0) #define MR_utf8_is_lead_byte(c) (((unsigned)(c) - 0xC0) < 0x3E) #define MR_utf8_is_trail_byte(c) (((unsigned)(c) & 0xC0) == 0x80) /* ** Advance `*pos' to the beginning of the next code point in `s'. ** If `*pos' is already at the end of the string then return MR_FALSE ** without modifying `*pos'. */ extern MR_bool MR_utf8_next(const MR_String s_, MR_Integer *pos); /* ** Rewind `*pos' to the beginning of the previous code point in `s'. ** If `*pos' is already at the beginning of the string then return MR_FALSE ** without modifying `*pos'. */ extern MR_bool MR_utf8_prev(const MR_String s_, MR_Integer *pos); /* ** Decode and return the code point beginning at `pos' in `s'. ** Return 0 if at the end of the string (i.e. the NUL terminator). ** If an illegal code sequence exists at that offset, return -2. ** ** The _mb version requires s[pos] to be the lead byte of a multibyte code ** point. */ extern MR_int_least32_t MR_utf8_get(const MR_String s, MR_Integer pos); extern MR_int_least32_t MR_utf8_get_mb(const MR_String s, MR_Integer pos, int *width); /* ** Decode the code point beginning at `pos' in `s', and advance `*pos'. ** The _mb version requires s[pos] to be the lead byte of a multibyte code ** point. */ extern MR_int_least32_t MR_utf8_get_next(const MR_String s, MR_Integer *pos); extern MR_int_least32_t MR_utf8_get_next_mb(const MR_String s, MR_Integer *pos); /* ** Rewind `*pos' to the beginning of the previous code point in `s' ** and return that code code. ** Return -1 if `*pos' is already at the beginning of the string. */ extern MR_int_least32_t MR_utf8_prev_get(const MR_String s, MR_Integer *pos); /* ** Return the number of bytes required to encode the code point `c' in UTF-8. */ extern size_t MR_utf8_width(MR_Char c); /* ** Encode the code point `c' into the buffer `s'. ** Return the number of bytes used. */ extern size_t MR_utf8_encode(char s[], MR_Char c); /* ** Return MR_TRUE iff `s' contains a valid UTF-8 encoded string. */ extern MR_bool MR_utf8_verify(const MR_String s); #endif /* not MERCURY_STRING_H */