mercury/util/mdemangle.c

/*---------------------------------------------------------------------------*/

/*
** Copyright (C) 1995-1999 The University of Melbourne.
** This file may only be copied under the terms of the GNU General
** Public License - see the file COPYING in the Mercury distribution.
*/

/*
** File: mdemangle.c
** Author: fjh
**
** A mercury symbol demangler.
** This is used to convert error messages from the linker back
** into a form that users can understand.
**
** This is implemented in C to minimize startup time and memory usage.
**
** BEWARE:
** This code is duplicated in profiler/demangle.m and profiler/mdemangle.m.
** Any changes here will need to be duplicated there and vice versa.
*/

#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include "mercury_std.h"

/* We used this for the size of fixed-length buffers in a few places <sigh> */
#define MAX_SYMBOL_LENGTH 1000

static void demangle(const char *name);
static const char *strip_module_name(char **start_ptr, char *end,
		const char *trailing_context[]);
static bool check_for_suffix(char *start, char *position, const char *suffix,
		int sizeof_suffix, int *mode_num2);
static char *fix_mangled_ascii(char *str, char **end);
static bool fix_mangled_special_case(char *str, char **end);
static bool cut_at_double_underscore(char **str, char *end);
static bool cut_trailing_integer(char *str, char **end, int *num);
static bool cut_trailing_underscore_integer(char *str, char **end, int *num);
static bool strip_prefix(char **str, const char *prefix);
static bool strip_leading_integer(char **start_ptr, int *num);

/*
** Bloody SunOS 4.x doesn't have memmove()...
** Using memcpy() may not work, but it doesn't really matter
** if the demangler doesn't work 100% correctly on SunOS 4.x.
*/
#ifndef HAVE_MEMMOVE
#define memmove memcpy
#endif

int
main(int argc, char **argv)
{
	if (argc > 1) {
		/*
		** invoke demangle() on each command line argument
		*/
		int i;
		for (i = 1; i < argc; i++) {
			demangle(argv[i]);
			putchar('\n');
		}
	} else {
		/*
		** copy stdin to stdout, calling demangle() for
		** every valid C identifier in the input
		*/
		for (;;) {
			char buf[MAX_SYMBOL_LENGTH];
			size_t len = 0;
			int c = getchar();
			while (c != EOF && (isalnum(c) || c == '_')) {
				if (len >= sizeof(buf) - 1)
					break;
				buf[len++] = c;
				c = getchar();
			}
			if (len > 0) {
				buf[len] = '\0';
				demangle(buf);
				fflush(stdout);
			}
			if (c == EOF)
				break;
			putchar(c);
		}
	}
	return 0;
}

/*
** demangle() - convert a mangled Mercury identifier into
** human-readable form and then print it to stdout
*/

static void
demangle(const char *orig_name)
{
	static const char entry[]   = "_entry_";
	static const char mercury[] = "mercury__";
	static const char func_prefix[] = "fn__"; /* added for functions */
	static const char unify[]   = "__Unify___";
	static const char compare[] = "__Compare___";
	static const char mindex[]  = "__Index___";
	/* we call it `mindex' rather than `index' to
	   avoid a naming conflict with strchr's alter ego index() */

	static const char introduced[]  = "IntroducedFrom__";
	static const char deforestation[]  = "DeforestationIn__";
	static const char accumulator[]  = "AccFrom__";
	static const char pred[]  = "pred__";
	static const char func[]  = "func__";

	static const char ua_suffix[] = "__ua"; /* added by unused_args.m */
	static const char ua_suffix2[] = "__uab"; /* added by unused_args.m */
	static const char ho_suffix[] = "__ho"; /* added by higher_order.m */

	static const char mercury_data[] = "mercury_data_";
	static const char type_ctor_layout[] = "type_ctor_layout_";
	static const char type_ctor_info[] = "type_ctor_info_";
	static const char type_ctor_functors[] = "type_ctor_functors_";
	static const char base_typeclass_info[] = "__base_typeclass_info_";
	static const char common[] = "common";
	static const char arity_string[] = "arity";
	static const char underscores_arity_string[] = "__arity";

	static const char * trailing_context_1[] = {
		introduced,
		deforestation,
		accumulator,
		NULL
	};

	static const char * trailing_context_2[] = {
		type_ctor_layout,
		type_ctor_info,
		type_ctor_functors,
		common,
		NULL
	};

	static const char * trailing_context_3[] = {
		arity_string,
		NULL
	};

	char name[MAX_SYMBOL_LENGTH];
	char *start = name;
	const char *module = "";	/* module name */
	char *end = name + strlen(orig_name);
	char *position;		/* current position in string */
	int mode_num;
	int mode_num2;
	int arity;
	const char *pred_or_func; /* either "predicate" or "function" */
	bool unused_args = FALSE; /* does this proc have any unused arguments */
	bool higher_order = FALSE; /* has this proc been specialized */
	int internal = -1;
	int lambda_line = 0;
	int lambda_seq_number = 0;
	char *lambda_pred_name = NULL;
	const char *lambda_kind = NULL;
	enum { ORDINARY, UNIFY, COMPARE, INDEX,
		LAMBDA, DEFORESTATION, ACCUMULATOR }
		category;
	enum { COMMON, INFO, LAYOUT, FUNCTORS } data_category;
	const char * class_name;
	int class_arity;
	char class_arg_buf[MAX_SYMBOL_LENGTH];
	int class_arg_num;
	const char* class_arg;

	/*
	** copy orig_name to a local buffer which we can modify,
	** making sure that we don't overflow the buffer
	*/
	if (strlen(orig_name) >= sizeof(name)) {
		goto wrong_format;
	}
	strcpy(name, orig_name);

	/*
	** skip any leading underscore inserted by the C compiler
	** (but don't skip it if it came from the `_entry_' prefix)
	*/
	if (*start == '_' && strncmp(start, entry, strlen(entry)) != 0) {
		start++;
	}

	/*
	** skip the `_entry_' prefix, if any
	*/
	strip_prefix(&start, entry);

	/*
	** strip off the `mercury__' prefix
	*/

	if (!strip_prefix(&start, mercury)) {
		goto not_plain_mercury;
	}

/*
** Code for dealing with predicate symbols.
*/

	/*
	** strip off the `fn__' prefix, if any
	*/
	if (strip_prefix(&start, func_prefix)) {
		pred_or_func = "function";
	} else {
		pred_or_func = "predicate";
	}

	/*
	** Get integer from end of string (it might be the mode number,
	** it might be the internal label number). We'll assume its mode
	** number for the moment.
	*/

	if (!cut_trailing_integer(start, &end, &mode_num)) {
		goto wrong_format;
	}

	if (end == start) goto wrong_format;

	/*
	** if we got to an `i', that means it is an internal
	** label of the form `mercury__append_3_0_i1'
	** in that case, save the internal label number and then
	** get the mode number
	*/
	if (*--end == 'i') {
		internal = mode_num;
		if (end == start || *--end != '_') goto wrong_format;

		if (!cut_trailing_underscore_integer(start, &end, &mode_num)) {
			goto wrong_format;
		}
	}

	/*
	** scan back past the arity number and then parse it
	*/

	if (!cut_trailing_underscore_integer(start, &end, &arity)) {
		goto wrong_format;
	}

	/*
	** Now start processing from the start of the string again.
	** Check whether the start of the string matches the name of
	** one of the special compiler-generated predicates; if so,
	** set the `category' to the appropriate value and then
	** skip past the prefix.
	*/

	if (strip_prefix(&start, unify)) {
		category = UNIFY;
	} else if (strip_prefix(&start, compare)) {
		category = COMPARE;
		if (mode_num != 0) goto wrong_format;
	} else if (strip_prefix(&start, mindex)) {
		category = INDEX;
		if (mode_num != 0) goto wrong_format;
	} else {
		category = ORDINARY;
	}

	/*
	** Fix any ascii codes mangled in the predicate name
	*/
	start = fix_mangled_ascii(start, &end);

	/*
	** Process the mangling introduced by unused_args.m.
	** This involves stripping off the `__ua<m>' or `__uab<m>' added to
	** the end of the predicate/function name, where m is the mode number.
	*/

	position = end;	/* save end of name */

	do {
		if (position == start) goto wrong_format;
		position--;
	} while (MR_isdigit(*position));
		/* get the mode number */

	if (check_for_suffix(start, position, ua_suffix,
			sizeof(ua_suffix), &mode_num2)) {
		unused_args = TRUE;
		end = position + 1 - (sizeof(ua_suffix) - 1);
		mode_num = mode_num2 % 10000;
	} else if (check_for_suffix(start, position, ua_suffix2,
			sizeof(ua_suffix2), &mode_num2)) {
		unused_args = TRUE;
		end = position + 1 - (sizeof(ua_suffix2) - 1);
		mode_num = mode_num2 % 10000;
	}

	/*
	** Process the mangling introduced by higher_order.m.
	** This involves stripping off the `__ho<n>' where
	** n is a unique identifier for this specialized version
	*/

	position = end;

	do {
		if (position == start) goto wrong_format;
		position--;
	} while (MR_isdigit(*position));
	if (check_for_suffix(start, position, ho_suffix,
			sizeof(ho_suffix), &mode_num2)) {
		end = position + 1 - (sizeof(ho_suffix) - 1);
		higher_order = TRUE;
	}

	/*
	** Cut off the string before the start of the arity number,
	** and the unused_args and specialization information,
	** i.e. at the end of the predicate name or type name.
	*/
	*end = '\0';

	/*
	** Make sure special predicates with unused_args
	** are reported correctly.
	*/

	if (unused_args && category != ORDINARY) {
		if (!cut_trailing_integer(start, &end, &arity)) {
			goto wrong_format;
		}
	}

	module = strip_module_name(&start, end, trailing_context_1);

	/*
	** look for "IntroducedFrom" or "DeforestationIn" or "AccFrom"
	*/
	if (category == ORDINARY) {
		if (strip_prefix(&start, introduced)) {
			category = LAMBDA;
		} else if (strip_prefix(&start, deforestation)) {
			category = DEFORESTATION;
		} else if (strip_prefix(&start, accumulator)) {
			category = ACCUMULATOR;
		}
	}

	if (category == LAMBDA || category == DEFORESTATION ||
			category == ACCUMULATOR)
	{
		if (strip_prefix(&start, pred)) {
			lambda_kind = "pred";
		} else if (strip_prefix(&start, func)) {
			lambda_kind = "func";
		} else {
			goto wrong_format;
		}
		lambda_pred_name = start;
		if (!cut_at_double_underscore(&start, end)) {
			goto wrong_format;
		}
		lambda_line = 0;
		while (start < end && MR_isdigit(*start)) {
			lambda_line = lambda_line * 10 + (*start - '0');
			start++;
		}
		if (!cut_at_double_underscore(&start, end)) {
			goto wrong_format;
		}
		lambda_seq_number = 0;
		while (start < end && MR_isdigit(*start)) {
			lambda_seq_number = lambda_seq_number * 10 +
				(*start - '0');
			start++;
		}
	}

	/*
	** Now, finally, we can print the demangled symbol name
	*/
	printf("<");
	switch(category) {
	case UNIFY:
		printf("unification predicate for type '%s:%s'/%d mode %d",
			module, start, arity, mode_num);
		break;
	case COMPARE:
		printf("compare/3 predicate for type '%s:%s'/%d",
			module, start, arity);
		break;
	case INDEX:
		printf("index/2 predicate for type '%s:%s'/%d",
			module, start, arity);
		break;
	case LAMBDA:
		printf("%s goal (#%d) from '%s' in module '%s' line %d",
			lambda_kind, lambda_seq_number,
			lambda_pred_name, module, lambda_line);
		break;
	case ACCUMULATOR:
		printf("accumulator procedure from '%s' in module '%s' line %d",
			lambda_pred_name, module, lambda_line);
		break;
	case DEFORESTATION:
		printf("deforestation procedure (#%d) from '%s' in module '%s' line %d",
			lambda_seq_number, lambda_pred_name,
			module, lambda_line);
		break;
	default:
		if (*module == '\0') {
			printf("%s '%s'/%d mode %d",
				pred_or_func, start, arity, mode_num);
		} else {
			printf("%s '%s:%s'/%d mode %d",
				pred_or_func, module, start, arity, mode_num);
		}
	}
	if (higher_order) {
		printf(" (specialized)");
	}
	if (unused_args) {
		printf(" (minus unused args)");
	}
	if (internal != -1) {
		printf(" label %d", internal);
	}
	printf(">");
	return;

/*
** Code to deal with mercury_data items.
*/

not_plain_mercury:

	if (!strip_prefix(&start, mercury_data)) {
		goto wrong_format;
	}

	if (strip_prefix(&start, base_typeclass_info)) {
		goto typeclass_info;
	}

	module = strip_module_name(&start, end, trailing_context_2);

	if (strip_prefix(&start, type_ctor_info)) {
		data_category = INFO;
		if (!cut_trailing_underscore_integer(start, &end, &arity)) {
			goto wrong_format;
		}
	} else if (strip_prefix(&start, type_ctor_layout)) {
		data_category = LAYOUT;
		if (!cut_trailing_underscore_integer(start, &end, &arity)) {
			goto wrong_format;
		}
	} else if (strip_prefix(&start, type_ctor_functors)) {
		data_category = FUNCTORS;
		if (!cut_trailing_underscore_integer(start, &end, &arity)) {
			goto wrong_format;
		}
	} else if (strip_prefix(&start, common)) {
		data_category = COMMON;
		if (!cut_trailing_underscore_integer(start, &end, &arity)) {
			goto wrong_format;
		}
	} else {
		goto wrong_format;
	}

	start = fix_mangled_ascii(start, &end);

	switch (data_category) {

	case INFO:
		if (*module == '\0') {
			printf("<type_ctor_info for type '%s'/%d>",
				start, arity);
		} else {
			printf("<type_ctor_info for type '%s:%s'/%d>",
				module, start, arity);
		}
		break;
	case LAYOUT:
		if (*module == '\0') {
			printf("<type_ctor_layout for type '%s'/%d>",
				start, arity);
		} else {
			printf("<type_ctor_layout for type '%s:%s'/%d>",
				module, start, arity);
		}
		break;
	case FUNCTORS:
		if (*module == '\0') {
			printf("<type_ctor_functors for type '%s'/%d>",
				start, arity);
		} else {
			printf("<type_ctor_functors for type '%s:%s'/%d>",
				module, start, arity);
		}
		break;
	case COMMON:
		printf("<shared constant number %d for module %s>",
			arity, module);
		break;

	default:
		goto wrong_format;

	}
	return;

typeclass_info:
	/*
	** Parse the class name and class arity, which have the following
	** layout:
	**	<module-qualified class name>__arity<arity>__
	*/
	class_name = strip_module_name(&start, end, trailing_context_3);
	/* XXX fix_mangled_ascii() */
	if (!(strip_prefix(&start, arity_string)
		&& strip_leading_integer(&start, &class_arity)
		&& strip_prefix(&start, "__")))
	{
		goto wrong_format;
	}

	/*
	** Parse the class argument types, which each have the following
	** layout:
	**	<module-qualified type name>__arity<arity>__
	**
	** We store the human-readable formatted output in
	** class_arg_buf as we go.
	*/
	fix_mangled_ascii(start, &end);
	strcpy(class_arg_buf, "");
	for (class_arg_num = 0; class_arg_num < class_arity; class_arg_num++) {
		if (class_arg_num != 0) {
			strcat(class_arg_buf, ", ");
		}
		class_arg = strip_module_name(&start, end, trailing_context_3);
		if (!(strip_prefix(&start, arity_string)
		      && strip_leading_integer(&start, &arity)
		      && strip_prefix(&start, "__")))
		{
			goto wrong_format;
		}
		sprintf(class_arg_buf + strlen(class_arg_buf),
			"%s/%d", class_arg, arity);
	}

	/*
	** now print the results
	*/
	printf("<instance declaration for %s(%s)>",
		class_name, class_arg_buf);
	return;

wrong_format:
	printf("%s", orig_name);
	return;
} /* end demangle() */

	/*
	** Remove a module name prefix.
	** Just keep munching up double-underscores until we
	** get to something that matches the specified trailing context,
	** at which point we stop, or until there are no double-underscores
	** left.
	*/
static const char *
strip_module_name(char **start_ptr, char *end, const char *trailing_context[])
{
	const char *module;		/* module name */
	char *module_end;		/* end of the module name */
	char *next_double_underscore;
	char *start;

	start = *start_ptr;

	/*
	** Strip off the module name
	*/
	module = start;
	module_end = start;
	while ((next_double_underscore = strstr(start, "__")) != NULL) {
		int len, i;

		/*
		** Check for special cases
		*/
		bool stop = FALSE;
		for (i = 0; trailing_context[i] != NULL; i++) {
			if (strncmp(start,
				trailing_context[i],
				strlen(trailing_context[i])) == 0)
			{
				stop = TRUE;
			}
		}
		if (stop) break;

		len = next_double_underscore - start;
		if (module != module_end) {
			/*
			** append a module qualifier, and
			** shift the module name into the right place
			*/
			*module_end = ':';
			module_end++;
			memmove(module_end, start, len);
		}
		module_end += len;

		start = next_double_underscore + 2;
	}
	if (module == module_end) {
		module = "";
	} else {
		*module_end = '\0';
	}

	*start_ptr = start;
	return module;
}

	/*
	** Remove the prefix from a string, if it has
	** it.
	** Returns TRUE if it has that prefix, and newstr will
	** then point to the rest of that string.
	** If the string doesn't have that prefix, newstr will
	** be unchanged, and the function will return FALSE.
	*/
static bool
strip_prefix(char **str, const char *prefix)
{
	int len;

	len = strlen(prefix);

	if (strncmp(*str, prefix, len) == 0) {
		*str += len;
		return TRUE;
	}
	return FALSE;
}

	/*
	** If the string pointed to by *start_ptr starts with
	** an integer, then advance *start_ptr past the leading integer,
	** store the value of the integer in the int pointed to by `num',
	** and return true; otherwise leave *start_ptr unchanged and
	** return false.  (The string itself is always left unchanged.)
	*/
static bool
strip_leading_integer(char **start_ptr, int *num)
{
	char *start = *start_ptr;
	char save_char;
	bool got_int;;

	while(MR_isdigit(*start)) {
		start++;
	}
	if (start == *start_ptr) return FALSE;
	save_char = *start;
	*start = '\0';
	got_int = (sscanf(*start_ptr, "%d", num) == 1);
	*start = save_char;
	if (got_int) {
		*start_ptr = start;
		return TRUE;
	} else {
		return FALSE;
	}
}

	/*
	** Remove trailing integer (at the supplied `real_end' of the
	** string), and return it in the int pointed to by `num'.   True
	** is returned if there is an integer at the end, false if not.
	** If false is returned, the string will not be cut.
	** `real_end' is updated with the new end of the string
	*/
static bool
cut_trailing_integer(char *str, char **real_end, int *num)
{
	char *end = *real_end;

	do {
		if (end == str) return FALSE;
		end--;
	} while (MR_isdigit(*end));

	if (sscanf(end + 1, "%d", num) != 1) {
		return FALSE;
	}
	*++end = '\0';
	*real_end = end;

	return TRUE;
}

	/*
	** Same as cut_trailing_integer, but move end back past
	** the underscore as well. If cut_trailing_underscore_integer
	** returns TRUE, the `real_end' will be moved back before the
	** underscore and the integer. If it returns FALSE, the
	** `real_end' is unchanged.
	*/
static bool
cut_trailing_underscore_integer(char *str, char **real_end,
	int *num)
{
	char *end = *real_end;

	if (!cut_trailing_integer(str, &end, num)) {
		return FALSE;
	}
	if (end == str || *(--end) != '_') {
		return FALSE;
	}
	*end = '\0';
	*real_end = end;
	return TRUE;
}

	/*
	** Scan for `__' and cut the string at there (replace first
	** `_' with `\0', return the part of the string after the `__').
	** Returns TRUE if `__' was found, FALSE otherwise.
	*/

static bool
cut_at_double_underscore(char **start, char *end)
{
	char *str = *start;

	while (*str != '_' || *(str + 1) != '_') {
		if (str == end) {
			return FALSE;
		}
		str++;
	}

	*str = '\0';
	*start = str + 2;
	return TRUE;
}

	/*
	** The compiler changes all names starting with `f_' so that
	** they start with `f__' instead, and uses names starting with
	** `f_' for mangled names which are either descriptions (such
	** as `f_greater_than' for `>') or sequences of decimal
	** reprententations of ASCII codes separated by underscores.
	** If the name starts with `f__', we must change it back to
	** start with `f_'.  Otherwise, if it starts with `f_' we must
	** convert the mnemonic or list of ASCII codes back into an
	** identifier.
	*/

static char *
fix_mangled_ascii(char *str, char **real_end)
{
	char *end = *real_end;

	/*
	** If it starts with `f__', replace that with `f_'.
	*/
	if (strncmp(str, "f__" , 3) == 0) {
		str++;
		*str = 'f';
		return str;
	}

	/*
	** If it starts with `f_' followed by a mnemonic description,
	** then replace that with its unmangled version
	*/
	if (strncmp(str, "f_", 2) == 0 &&
		fix_mangled_special_case(str, real_end))
	{
		return str;
	}

	/*
	** Otherwise, if it starts with `f_' we must convert the list of
	** ASCII codes back into an identifier.
	*/
	if (strncmp(str, "f_", 2) == 0) {
		char buf[MAX_SYMBOL_LENGTH];
		char *num = str + 2;
		int count = 0;
		while (num < end) {
			char *next_num = num;
			while (MR_isdigit(*next_num)) {
				next_num++;
			}
			if (*next_num != '_' && *next_num != '\0')
				break;
			*next_num = '\0';
			buf[count++] = atoi(num);
			num = next_num + 1;
		}
			/* copy anything after the mangled string */
		while (num < end) {
			buf[count++] = *num++;
		}
		buf[count] = '\0';
		strcpy(str, buf);
		*real_end = str + count;
	}
	return str;
}

static bool
fix_mangled_special_case(char *str, char **real_end)
{
	static const struct {
		const char *mangled_name;
		const char *unmangled_name;
	} translations[] = {
		/*
		** Beware: we assume that the unmangled name is always
		** shorter than the mangled name.
		*/
		{ "f_not_equal", "\\=" },
		{ "f_greater_or_equal", ">=" },
		{ "f_less_or_equal", "=<" },
		{ "f_equal", "=" },
		{ "f_less_than", "<" },
		{ "f_greater_than", ">" },
		{ "f_plus", "+" },
		{ "f_times", "*" },
		{ "f_minus", "-" },
		{ "f_slash", "/" },
		{ "f_comma", "," },
		{ "f_semicolon", ";" },
		{ "f_cut", "!" }
	};
	const int num_translations =
		sizeof(translations) / sizeof(translations[0]);

	int i;

	/*
	** check for the special cases listed in the table above.
	*/
	for (i = 0; i < num_translations; i++) {
		const char *mangled = translations[i].mangled_name;
		size_t mangled_len = strlen(mangled);
		if (strncmp(str, mangled, mangled_len) == 0) {
			const char *unmangled = translations[i].unmangled_name;
			size_t unmangled_len = strlen(unmangled);
			size_t leftover_len = strlen(str) - mangled_len;

			assert(unmangled_len <= mangled_len);

			strcpy(str, unmangled);
			memmove(str + unmangled_len, str + mangled_len,
				leftover_len + 1);

			*real_end = str + unmangled_len + leftover_len;
			return TRUE;
		}
	}
	return FALSE;
}

static bool
check_for_suffix(char *start, char *position, const char *suffix,
		int sizeof_suffix, int *mode_num2)
{
	const int suffix_len = sizeof_suffix - 1;

	return (
		position - suffix_len >= start
		&& sscanf(position + 1, "%d", mode_num2) == 1
		&& strncmp(position - suffix_len + 1, suffix, suffix_len) == 0
	);
}

/*---------------------------------------------------------------------------*/