Files
mercury/runtime/mercury_atomic_ops.c
Paul Bone a9f82d004b On some systems the CPU's time stamp counter (TSC) cannot reliably be
used.  Mercury's ThreadScope support will now use gettimeofday() by
default, but use of the TSC may be enabled.

Note that in Linux, gettimeofday() does not always make a system call.

runtime/mercury_threadscope.[ch]:
    Add support for measuring time with gettimeofday().

    Use gettimeofday() to measure time by default.

runtime/mercury_atomic_ops.[ch]:
    Add a new function MR_tsc_is_sensible().  It returns true if the TSC can
    (as far as the RTS can detect) be used.

    Fix trailing whitespace.

runtime/mercury_wrapper.c:
    Add a new runtime option --threadscope-use-tsc.
    When specified this option allows threadscope to use the CPU's TSC to
    measure time.

doc/userguide.texi:
    Document the --threadscope-use-tsc option.  This documentation is
    commented out.
2012-06-20 13:13:34 +00:00

617 lines
17 KiB
C

/*
** vim:ts=4 sw=4 expandtab
*/
/*
** Copyright (C) 2007, 2009-2011 The University of Melbourne.
** This file may only be copied under the terms of the GNU Library General
** Public License - see the file COPYING.LIB in the Mercury distribution.
*/
/*
** mercury_atomic_ops.c
*/
#include "mercury_imp.h"
#include "mercury_atomic_ops.h"
/*---------------------------------------------------------------------------*/
#if defined(MR_LL_PARALLEL_CONJ)

/*
** Definitions for the atomic functions declared `extern inline'.
**
** MR_OUTLINE_DEFN expands each (signature, body) pair below into an
** ordinary out-of-line function definition, so that exactly one linkable
** copy of each atomic operation exists even when the compiler chooses not
** to inline the `extern inline' versions from the header.  Each body is a
** MR_*_BODY macro defined in mercury_atomic_ops.h.
*/

/* Atomic compare-and-swap on a signed word at *addr. */
MR_OUTLINE_DEFN(
    MR_bool
    MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
        MR_Integer new_val)
,
    {
        MR_COMPARE_AND_SWAP_WORD_BODY;
    }
)

/* Atomic compare-and-swap on an unsigned word at *addr. */
MR_OUTLINE_DEFN(
    MR_bool
    MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
        MR_Unsigned new_val)
,
    {
        MR_COMPARE_AND_SWAP_WORD_BODY;
    }
)

/* Atomically add addend to *addr and return the updated value. */
MR_OUTLINE_DEFN(
    MR_Integer
    MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend)
,
    {
        MR_ATOMIC_ADD_AND_FETCH_INT_BODY;
    }
)

/* Unsigned variant of MR_atomic_add_and_fetch_int. */
MR_OUTLINE_DEFN(
    MR_Unsigned
    MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
,
    {
        MR_ATOMIC_ADD_AND_FETCH_UINT_BODY;
    }
)

/* Atomically add addend to *addr; no result is returned. */
MR_OUTLINE_DEFN(
    void
    MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
,
    {
        MR_ATOMIC_ADD_INT_BODY;
    }
)

/* Unsigned variant of MR_atomic_add_int. */
MR_OUTLINE_DEFN(
    void
    MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
,
    {
        MR_ATOMIC_ADD_UINT_BODY;
    }
)

/* Atomically subtract x from *addr. */
MR_OUTLINE_DEFN(
    void
    MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
,
    {
        MR_ATOMIC_SUB_INT_BODY;
    }
)

/* Atomically increment *addr. */
MR_OUTLINE_DEFN(
    void
    MR_atomic_inc_int(volatile MR_Integer *addr)
,
    {
        MR_ATOMIC_INC_INT_BODY;
    }
)

/* Unsigned variant of MR_atomic_inc_int. */
MR_OUTLINE_DEFN(
    void
    MR_atomic_inc_uint(volatile MR_Unsigned *addr)
,
    {
        MR_ATOMIC_INC_UINT_BODY;
    }
)

/* Atomically decrement *addr. */
MR_OUTLINE_DEFN(
    void
    MR_atomic_dec_int(volatile MR_Integer *addr)
,
    {
        MR_ATOMIC_DEC_INT_BODY;
    }
)

/* Atomically decrement *addr and test whether the result is zero. */
MR_OUTLINE_DEFN(
    MR_bool
    MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr)
,
    {
        MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY;
    }
)

/* Unsigned variant of MR_atomic_dec_and_is_zero_int. */
MR_OUTLINE_DEFN(
    MR_bool
    MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr)
,
    {
        MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY;
    }
)

#endif /* MR_LL_PARALLEL_CONJ */
/*---------------------------------------------------------------------------*/
#if defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)
/*
** Profiling of the parallel runtime.
*/
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
/*
** True if the RDTSCP and RDTSC instructions are available respectively.
*/
static MR_bool MR_rdtscp_is_available = MR_FALSE;
static MR_bool MR_rdtsc_is_available = MR_FALSE;
#endif
MR_uint_least64_t MR_cpu_cycles_per_sec = 0;
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
/* Set this to 1 to enable some printfs below */
#define MR_DEBUG_CPU_FEATURE_DETECTION 0
/*
** cpuid, rdtscp and rdtsc are i386/x86_64 instructions.
*/
static __inline__ void
MR_cpuid(MR_Unsigned code, MR_Unsigned sub_code,
MR_Unsigned *a, MR_Unsigned *b, MR_Unsigned *c, MR_Unsigned *d);
static __inline__ void
MR_rdtscp(MR_uint_least64_t *tsc, MR_Unsigned *processor_id);
static __inline__ void
MR_rdtsc(MR_uint_least64_t *tsc);
/*
** Return zero if parsing failed, otherwise return the number of cycles per
** second.
*/
static MR_uint_least64_t
parse_freq_from_x86_brand_string(char *string);
#endif /* MR_GNUC && (__i386__ || __x86_64__) */
/*
** Detect the CPU features needed for parallel-execution profiling:
** whether the RDTSC and RDTSCP instructions are available and, where the
** CPUID brand string includes it (Intel processors), the processor's
** clock speed.  Sets MR_rdtsc_is_available, MR_rdtscp_is_available and
** MR_cpu_cycles_per_sec as appropriate; on non-GCC or non-x86 builds
** this function does nothing.
**
** Fixes relative to the previous version: the debug fprintf calls used
** conversion specifiers that did not match their argument types (or, in
** one case, passed arguments with no specifiers at all); arguments are
** now cast to fixed types that match their specifiers.
*/
void
MR_do_cpu_feature_detection(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_Unsigned     a, b, c, d;
    MR_Unsigned     eflags, old_eflags;
    MR_Unsigned     maximum_extended_page;
    MR_Unsigned     extended_family, basic_family, family;
    MR_Unsigned     extended_model, model;

    /*
    ** Check for the CPUID instruction. CPUID is supported if we can flip bit
    ** 21 in the CPU's EFLAGS register. The assembly below is written in a
    ** subset of i386 and x86_64 assembly. To read and write EFLAGS we have
    ** to go via the C stack.
    */
    __asm__ ("pushf; pop %0"
        :"=r"(eflags));
    old_eflags = eflags;
    /* Flip bit 21 */
    eflags ^= (1 << 21);
    __asm__ ("push %0; popf; pushf; pop %0;"
        :"=r"(eflags)
        :"0"(eflags));

    /*
    ** Test to see if our change held. We don't restore eflags, a change to
    ** the ID bit has no effect.
    */
    if (eflags == old_eflags) {
#if MR_DEBUG_CPU_FEATURE_DETECTION
        /*
        ** Note: this call used to pass eflags and old_eflags even though
        ** the format string contains no conversion specifiers.
        */
        fprintf(stderr, "This CPU doesn't support the CPUID instruction.\n");
#endif
        return;
    }

    /*
    ** CPUID 0 gives the maximum basic CPUID page in EAX. Basic pages go up to
    ** but not including 0x40000000.
    */
    MR_cpuid(0, 0, &a, &b, &c, &d);
    if (a < 1) {
        return;
    }

    /* CPUID 1 gives type, family, model and stepping information in EAX. */
    MR_cpuid(1, 0, &a, &b, &c, &d);

    /* Bit 4 in EDX is high if RDTSC is available */
    if (d & (1 << 4)) {
        MR_rdtsc_is_available = MR_TRUE;
    }

    /*
    ** BTW: Intel can't count:
    **
    ** http://www.pagetable.com/?p=18
    ** http://www.codinghorror.com/blog/archives/000364.html
    **
    ** 486 (1989): family 4
    ** Pentium (1993): family 5
    ** Pentium Pro (1995): family 6, models 0 and 1
    ** Pentium 2 (1997): family 6, models 3, 5 and 6
    ** Pentium 3 (2000): family 6, models 7, 8, 10, 11
    ** Itanium (2001): family 7
    ** Pentium 4 (2000): family 15/0
    ** Itanium 2 (2002): family 15/1 and 15/2
    ** Pentium D: family 15/4
    ** Pentium M (2003): family 6, models 9 and 13
    ** Core (2006): family 6, model 14
    ** Core 2 (2006): family 6, model 15
    ** i7: family 6, model 26
    ** Atom: family 6, model 28
    **
    ** This list is incomplete, it doesn't cover AMD or any other brand of x86
    ** processor, and it probably doesn't cover all post-pentium Intel
    ** processors.
    */
    /* bits 8-11 (first bit (LSB) is bit 0) */
    basic_family = (a & 0x00000F00) >> 8;
    if (basic_family == 0x0F) {
        /* bits 20-27 */
        extended_family = (a & 0x0FF00000) >> 20;
        family = basic_family + extended_family;
    } else {
        family = basic_family;
    }

    /*
    ** We're not using the model value, but we leave the code here in case we
    ** have a reason to use it in the future.
    */
    /* bits 4-7 */
    model = (a & 0x000000F0) >> 4;
    if ((basic_family == 0x0F) || (basic_family == 0x06)) {
        /* bits 16-19 */
        extended_model = (a & 0x000F0000) >> 16;
        model += (extended_model << 4);
    }
#if MR_DEBUG_CPU_FEATURE_DETECTION
    /* MR_Unsigned's width is platform-dependent; cast before printing. */
    fprintf(stderr, "This is family %lu and model %lu\n",
        (unsigned long) family, (unsigned long) model);
#endif

    /* Now check for P3 or higher since they have the extended pages */
    if (family < 6) {
        /* This is a 486 or Pentium */
        return;
    }
    /*
    ** I could bail out here if this was a pentium 3, but there's a more
    ** reliable check for extended CPUID support below that should work on AMD
    ** chips as well, if I knew all the model numbers for all family 6
    ** processors and knew if they honoured extended CPUID.
    */

    /*
    ** Extended CPUID 0x80000000.
    **
    ** EAX contains the maximum extended CPUID node.
    */
    MR_cpuid(0x80000000, 0, &a, &b, &c, &d);
    if ((a & 0x80000000) == 0) {
        /*
        ** Extended CPUID is not supported.
        ** Note that this check is still not as reliable as I'd like. If it
        ** succeeds I'm not confident that the processor definitely implements
        ** extended CPUID.
        */
        return;
    }
    maximum_extended_page = a;
#if MR_DEBUG_CPU_FEATURE_DETECTION
    fprintf(stderr, "Maximum extended CPUID page: 0x%lx\n",
        (unsigned long) maximum_extended_page);
#endif

    /*
    ** Extended CPUID 0x80000001
    **
    ** If EDX bit 27 is set the RDTSCP instruction is available.
    */
    if (maximum_extended_page >= 0x80000001) {
        MR_cpuid(0x80000001, 0, &a, &b, &c, &d);
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "CPUID 0x80000001 EDX: 0x%lx\n", (unsigned long) d);
#endif
        if ((d & (1 << 27))) {
            /*
            ** This processor supports RDTSCP.
            */
#if MR_DEBUG_CPU_FEATURE_DETECTION
            fprintf(stderr, "RDTSCP is available\n");
#endif
            MR_rdtscp_is_available = MR_TRUE;
        }
    }

    if (maximum_extended_page >= 0x80000004) {
        /*
        ** 3 CPUID pages, 4 return registers each, containing 4 bytes each,
        ** plus a null byte. Intel say they include their own null byte, but
        ** for the cost of a single byte I feel safer using our own.
        */
#define CPUID_BRAND_STRING_SIZE (3*4*4 + 1)
        char            buff[CPUID_BRAND_STRING_SIZE];
        unsigned int    page;
        unsigned int    byte;
        unsigned int    shift;

        /*
        ** This processor supports the brand string from which we can
        ** try to extract the clock speed. This algorithm is described
        ** in the Intel Instruction Set Reference, Volume 2B, Chapter 3,
        ** Pages 207-208, In particular the flow chart in figure 3-10.
        ** This does not work on AMD processors since they don't include
        ** the clock speed in the brand string.
        */
        for (page = 0; page < 3; page++) {
            MR_cpuid(page + 0x80000002, 0, &a, &b, &c, &d);
#if MR_DEBUG_CPU_FEATURE_DETECTION
            fprintf(stderr,
                "CPUID page: 0x%.8lx, eax: 0x%.8lx, ebx: 0x%.8lx, "
                "ecx: 0x%.8lx, edx: 0x%.8lx\n",
                (unsigned long) (page + 0x80000002), (unsigned long) a,
                (unsigned long) b, (unsigned long) c, (unsigned long) d);
#endif
            /* Unpack each register into four little-endian bytes. */
            for (byte = 0; byte < 4; byte++) {
                shift = byte * 8;
                buff[page*4*4 + 0 + byte] = (char)(0xFF & (a >> shift));
                buff[page*4*4 + 4 + byte] = (char)(0xFF & (b >> shift));
                buff[page*4*4 + 8 + byte] = (char)(0xFF & (c >> shift));
                buff[page*4*4 + 12 + byte] = (char)(0xFF & (d >> shift));
            }
        }
        /* Add a null byte */
        buff[CPUID_BRAND_STRING_SIZE - 1] = 0;
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "CPUID Brand string: %s\n", buff);
#endif

        MR_cpu_cycles_per_sec = parse_freq_from_x86_brand_string(buff);
#if MR_DEBUG_CPU_FEATURE_DETECTION
        if (MR_cpu_cycles_per_sec == 0) {
            fprintf(stderr, "Failed to detect cycles per second "
                "you can probably blame AMD for this.\n");
        } else {
            fprintf(stderr, "Cycles per second: %llu\n",
                (unsigned long long) MR_cpu_cycles_per_sec);
        }
#endif
    }
#endif /* MR_GNUC && (__i386__ || __x86_64__) */
}
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))

/*
** Parse the CPU frequency out of an x86 CPUID brand string such as
** "Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz".  Returns the number of cycles
** per second, or zero if the string could not be parsed (AMD brand
** strings, for example, do not contain the clock speed).
**
** Fix: the backwards search for the space preceding the number used
** `i >= 0' as the loop condition with an unsigned i, which is always
** true; on a string containing no space, i wrapped around and
** string[i] read out of bounds.  The loop now terminates at i == 0.
*/
static MR_uint_least64_t
parse_freq_from_x86_brand_string(char *string)
{
    unsigned int    brand_string_len;
    unsigned int    i;
    double          multiplier;
    int             freq_index = -1;

    brand_string_len = strlen(string);

    /*
    ** There must be at least six characters if we can parse this: three
    ** for the '?Hz' suffix, at least one digit, a space before the number,
    ** and at least one character before that space.
    */
    if (brand_string_len <= 5) {
        return 0;
    }
    if (!((string[brand_string_len - 1] == 'z') &&
            (string[brand_string_len - 2] == 'H')))
    {
        return 0;
    }
    switch (string[brand_string_len - 3]) {
        case 'M':
            multiplier = 1000000.0;
            break;
        case 'G':
            multiplier = 1000000000.0;
            break;
        case 'T':
            /*
            ** Yes, this is defined in the specification, Intel have some
            ** strong ambitions regarding Moore's law. :-) We include it here
            ** to conform with the standard.
            */
            multiplier = 1000000000000.0;
            break;
        default:
            /* Unrecognised unit prefix. */
            return 0;
    }

    /*
    ** Search backwards for the space at the beginning of the number.
    ** i is unsigned, so the loop tests i == 0 explicitly instead of
    ** relying on i >= 0 (which would always hold).
    */
    for (i = brand_string_len - 4; ; i--) {
        if (string[i] == ' ') {
            freq_index = i + 1;
            break;
        }
        if (i == 0) {
            break;
        }
    }
    if (freq_index == -1) {
        /* We didn't find the beginning of the frequency */
        return 0;
    }

    /*
    ** If strtod fails it returns zero, so if we fail to parse a number here,
    ** we'll return zero which our caller understands as a parsing failure.
    */
    return (MR_uint_least64_t)(strtod(&string[freq_index], NULL) * multiplier);
}

#endif /* MR_GNUC && (__i386__ || __x86_64__) */
/*
** Record the current TSC reading (and, when RDTSCP is available, the
** current processor's ID) into *timer, for later use by
** MR_profiling_stop_timer.  On non-GCC or non-x86 builds this is a no-op
** and the timer is left untouched.
*/
void
MR_profiling_start_timer(MR_Timer *timer)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    /*
    ** If we don't have enough data to fill in all the fields of this
    ** structure we leave them alone; we won't check them later without
    ** checking MR_rdtsc{p}_is_available first.
    */
    if (MR_rdtscp_is_available) {
        MR_rdtscp(&(timer->MR_timer_time), &(timer->MR_timer_processor_id));
    } else if (MR_rdtsc_is_available) {
        MR_rdtsc(&(timer->MR_timer_time));
    }
#endif
}
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
/*
** Fold one measured duration (in TSC ticks) into *stats: bump the
** recorded-sample count and accumulate the sum and sum-of-squares.
** Factored out of MR_profiling_stop_timer, which previously duplicated
** this code in its RDTSCP and RDTSC branches.
*/
static void
MR_record_profiling_duration(MR_Stats *stats, MR_int_least64_t duration)
{
    MR_uint_least64_t   duration_squared;

    duration_squared = duration * duration;
    MR_atomic_inc_uint(&(stats->MR_stat_count_recorded));
#if MR_LOW_TAG_BITS >= 3
    /* The accumulators can be updated atomically on this configuration. */
    MR_atomic_add_int(&(stats->MR_stat_sum), duration);
    MR_atomic_add_uint(&(stats->MR_stat_sum_squares), duration_squared);
#else
    /* Otherwise protect the accumulators with a spin lock. */
    MR_US_SPIN_LOCK(&(stats->MR_stat_sums_lock));
    stats->MR_stat_sum += duration;
    stats->MR_stat_sum_squares += duration_squared;
    MR_US_UNLOCK(&(stats->MR_stat_sums_lock));
#endif
}
#endif /* MR_GNUC && (__i386__ || __x86_64__) */

/*
** Stop the timer started by MR_profiling_start_timer and record the
** elapsed time in *stats.  With RDTSCP we only record the sample when
** both readings came from the same processor, since TSCs on different
** processors are not necessarily comparable; otherwise the
** not-recorded count is bumped instead.
**
** Fix: the no-TSC fallback used MR_atomic_inc_int on
** MR_stat_count_recorded while every other updater of that field uses
** MR_atomic_inc_uint; it now uses the unsigned increment consistently.
*/
void
MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_Timer    now;

    if (MR_rdtscp_is_available) {
        MR_rdtscp(&(now.MR_timer_time), &(now.MR_timer_processor_id));
        if (timer->MR_timer_processor_id == now.MR_timer_processor_id) {
            MR_record_profiling_duration(stats,
                now.MR_timer_time - timer->MR_timer_time);
        } else {
            /* The two readings are not comparable. */
            MR_atomic_inc_uint(&(stats->MR_stat_count_not_recorded));
        }
    } else if (MR_rdtsc_is_available) {
        /* No processor ID available; assume the readings are comparable. */
        MR_rdtsc(&(now.MR_timer_time));
        MR_record_profiling_duration(stats,
            now.MR_timer_time - timer->MR_timer_time);
    }
#else /* not MR_GNUC && (__i386__ || __x86_64__) */
    /* No TSC support on this architecture or with this C compiler */
    MR_atomic_inc_uint(&(stats->MR_stat_count_recorded));
#endif /* not MR_GNUC && (__i386__ || __x86_64__) */
}
/*
** Report whether the TSC is usable for timing: at least one of the
** TSC-reading instructions must be available and the cycles-per-second
** calibration must have been established (nonzero).
*/
extern MR_bool
MR_tsc_is_sensible(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    if (MR_cpu_cycles_per_sec == 0) {
        /* We cannot convert cycle counts into real time. */
        return MR_FALSE;
    }
    return MR_rdtscp_is_available || MR_rdtsc_is_available;
#else
    return MR_FALSE;
#endif
}
/*
** Read the CPU's time stamp counter.  Returns zero when RDTSC is not
** available (or on architectures/compilers without TSC support at all).
*/
MR_uint_least64_t
MR_read_cpu_tsc(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_uint_least64_t   tsc = 0;

    if (MR_rdtsc_is_available) {
        MR_rdtsc(&tsc);
    }
    return tsc;
#else /* not MR_GNUC && (__i386__ || __x86_64__) */
    return 0;
#endif /* not MR_GNUC && (__i386__ || __x86_64__) */
}
/*
** It's convenient that this instruction is the same on both i386 and x86_64
*/
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
/*
** Execute the CPUID instruction: `code' is loaded into EAX and `sub_code'
** into ECX, and the resulting EAX, EBX, ECX and EDX values are stored
** through a, b, c and d respectively.
*/
static __inline__ void
MR_cpuid(MR_Unsigned code, MR_Unsigned sub_code,
    MR_Unsigned *a, MR_Unsigned *b, MR_Unsigned *c, MR_Unsigned *d)
{
#ifdef __x86_64__
    __asm__("cpuid"
        : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
        : "0"(code), "2"(sub_code));
#elif defined(__i386__)
    /*
    ** i386 is more register starved, in particular we can't use ebx in
    ** position independent code. And we can't move ebx into another
    ** general purpose register, between register pinning, PIC, the
    ** stack and frame pointers and the other registers used by CPUID
    ** there are literally no general purpose registers left on i386.
    ** So EBX is saved on the stack around CPUID and its value is
    ** returned through a memory operand instead.
    */
    __asm__("pushl %%ebx; \
cpuid; \
movl %%ebx, %1; \
popl %%ebx;"
        : "=a"(*a), "=m"(*b), "=c"(*c), "=d"(*d)
        : "0"(code), "2"(sub_code)
        : "memory");
#endif
}
/*
** Read the time stamp counter with RDTSCP, which also delivers a
** processor signature (from IA32_TSC_AUX) in ECX; that value is stored
** through processor_id.  Callers must check MR_rdtscp_is_available first.
*/
static __inline__ void
MR_rdtscp(MR_uint_least64_t *tsc, MR_Unsigned *processor_id)
{
    MR_Unsigned tsc_low;
    MR_Unsigned tsc_high;

    /*
    ** On 64bit systems the high 32 bits of RAX and RDX are 0 filled by
    ** rdtsc{p}
    */
    __asm__("rdtscp"
        : "=a"(tsc_low), "=d"(tsc_high), "=c"(*processor_id));

    /* Combine the two 32-bit halves into the 64-bit counter value. */
    *tsc = tsc_high;
    *tsc = *tsc << 32;
    *tsc |= tsc_low;
}
/*
** Read the time stamp counter with RDTSC (low half in EAX, high half in
** EDX).  Callers must check MR_rdtsc_is_available first.
*/
static __inline__ void
MR_rdtsc(MR_uint_least64_t *tsc)
{
    MR_Unsigned tsc_low;
    MR_Unsigned tsc_high;

    __asm__("rdtsc"
        : "=a"(tsc_low), "=d"(tsc_high));

    /* Combine the two 32-bit halves into the 64-bit counter value. */
    *tsc = tsc_high;
    *tsc = *tsc << 32;
    *tsc |= tsc_low;
}
#endif /* MR_GNUC && (__i386__ || __x86_64__) */
#endif /* MR_PROFILE_PARALLEL_EXECUTION_SUPPORT */