Files
mercury/runtime/mercury_atomic_ops.c
Mark Brown d465fa53cb Update the COPYING.LIB file and references to it.
Discussion of these changes can be found on the Mercury developers
mailing list archives from June 2018.

COPYING.LIB:
    Add a special linking exception to the LGPL.

*:
    Update references to COPYING.LIB.

    Clean up some minor errors that have accumulated in copyright
    messages.
2018-06-09 17:43:12 +10:00

581 lines
17 KiB
C

// vim: ts=4 sw=4 expandtab ft=c
// Copyright (C) 2007, 2009-2011 The University of Melbourne.
// Copyright (C) 2014-2016, 2018 The Mercury team.
// This file is distributed under the terms specified in COPYING.LIB.
// mercury_atomic_ops.c
#include "mercury_imp.h"
#include "mercury_atomic_ops.h"
////////////////////////////////////////////////////////////////////////////
#if defined(MR_THREAD_SAFE)
// Definitions for the atomic functions declared `extern inline'.
// Out-of-line definition of MR_compare_and_swap_int().
// The whole function body is supplied by MR_COMPARE_AND_SWAP_WORD_BODY, a
// macro defined outside this file (presumably in mercury_atomic_ops.h).
// NOTE(review): the body macro presumably refers to `addr', `old' and
// `new_val' by name, so these parameters should not be renamed -- confirm.
MR_OUTLINE_DEFN(
MR_bool
MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
MR_Integer new_val)
,
{
MR_COMPARE_AND_SWAP_WORD_BODY;
}
)
// Out-of-line definition of MR_compare_and_swap_uint().
// It shares MR_COMPARE_AND_SWAP_WORD_BODY with the signed variant; only the
// parameter types differ. The body macro is defined outside this file.
// NOTE(review): the body macro presumably refers to `addr', `old' and
// `new_val' by name, so these parameters should not be renamed -- confirm.
MR_OUTLINE_DEFN(
MR_bool
MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
MR_Unsigned new_val)
,
{
MR_COMPARE_AND_SWAP_WORD_BODY;
}
)
// Out-of-line definition of MR_atomic_add_and_fetch_int().
// The body is supplied by MR_ATOMIC_ADD_AND_FETCH_INT_BODY, which is defined
// outside this file. Going by the name, it presumably adds `addend' to
// *addr atomically and returns the new value -- confirm against the macro.
MR_OUTLINE_DEFN(
MR_Integer
MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend)
,
{
MR_ATOMIC_ADD_AND_FETCH_INT_BODY;
}
)
// Out-of-line definition of MR_atomic_add_and_fetch_uint(), the unsigned
// counterpart of MR_atomic_add_and_fetch_int().
// The body is supplied by MR_ATOMIC_ADD_AND_FETCH_UINT_BODY, which is
// defined outside this file.
MR_OUTLINE_DEFN(
MR_Unsigned
MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
,
{
MR_ATOMIC_ADD_AND_FETCH_UINT_BODY;
}
)
// Out-of-line definition of MR_atomic_add_int(). Unlike the add_and_fetch
// variants, it returns nothing.
// The body is supplied by MR_ATOMIC_ADD_INT_BODY, which is defined outside
// this file.
MR_OUTLINE_DEFN(
void
MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
,
{
MR_ATOMIC_ADD_INT_BODY;
}
)
// Out-of-line definition of MR_atomic_add_uint(), the unsigned counterpart
// of MR_atomic_add_int().
// The body is supplied by MR_ATOMIC_ADD_UINT_BODY, which is defined outside
// this file.
MR_OUTLINE_DEFN(
void
MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
,
{
MR_ATOMIC_ADD_UINT_BODY;
}
)
// Out-of-line definition of MR_atomic_sub_int().
// The body is supplied by MR_ATOMIC_SUB_INT_BODY, which is defined outside
// this file.
// Note that the second parameter is called `x' here, not `addend' as in the
// add functions. NOTE(review): the body macro presumably refers to it by
// that name -- confirm before renaming.
MR_OUTLINE_DEFN(
void
MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
,
{
MR_ATOMIC_SUB_INT_BODY;
}
)
// Out-of-line definition of MR_atomic_inc_int().
// The body is supplied by MR_ATOMIC_INC_INT_BODY, which is defined outside
// this file.
MR_OUTLINE_DEFN(
void
MR_atomic_inc_int(volatile MR_Integer *addr)
,
{
MR_ATOMIC_INC_INT_BODY;
}
)
// Out-of-line definition of MR_atomic_inc_uint(), the unsigned counterpart
// of MR_atomic_inc_int(). The profiling code later in this file uses it to
// bump the MR_Stats counters.
// The body is supplied by MR_ATOMIC_INC_UINT_BODY, which is defined outside
// this file.
MR_OUTLINE_DEFN(
void
MR_atomic_inc_uint(volatile MR_Unsigned *addr)
,
{
MR_ATOMIC_INC_UINT_BODY;
}
)
// Out-of-line definition of MR_atomic_dec_int().
// The body is supplied by MR_ATOMIC_DEC_INT_BODY, which is defined outside
// this file.
MR_OUTLINE_DEFN(
void
MR_atomic_dec_int(volatile MR_Integer *addr)
,
{
MR_ATOMIC_DEC_INT_BODY;
}
)
// Out-of-line definition of MR_atomic_dec_and_is_zero_int().
// The body is supplied by MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY, which is
// defined outside this file. Going by the name, it presumably decrements
// *addr atomically and reports whether the result is zero -- confirm
// against the macro.
MR_OUTLINE_DEFN(
MR_bool
MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr)
,
{
MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY;
}
)
// Out-of-line definition of MR_atomic_dec_and_is_zero_uint(), the unsigned
// counterpart of MR_atomic_dec_and_is_zero_int().
// The body is supplied by MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY, which is
// defined outside this file.
MR_OUTLINE_DEFN(
MR_bool
MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr)
,
{
MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY;
}
)
#endif // MR_THREAD_SAFE
////////////////////////////////////////////////////////////////////////////
#if defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)
// Profiling of the parallel runtime.
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
// MR_rdtscp_is_available is true if the RDTSCP instruction is available;
// MR_rdtsc_is_available is true if the RDTSC instruction is available.
// Both start out false and are only ever set to true, by
// MR_do_cpu_feature_detection().
static MR_bool MR_rdtscp_is_available = MR_FALSE;
static MR_bool MR_rdtsc_is_available = MR_FALSE;
#endif
// The CPU's clock speed in cycles per second, or zero if it is not known.
// MR_do_cpu_feature_detection() fills this in from the CPUID brand string
// when the processor provides one.
MR_uint_least64_t MR_cpu_cycles_per_sec = 0;
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
// Set this to 1 to make MR_do_cpu_feature_detection() print what it finds
// to stderr.
#define MR_DEBUG_CPU_FEATURE_DETECTION 0
// cpuid, rdtscp and rdtsc are i386/x86_64 instructions. These wrappers are
// defined at the end of this file.
static __inline__ void MR_cpuid(MR_Unsigned code, MR_Unsigned sub_code,
MR_Unsigned *a, MR_Unsigned *b,
MR_Unsigned *c, MR_Unsigned *d);
static __inline__ void MR_rdtscp(MR_uint_least64_t *tsc,
MR_Unsigned *processor_id);
static __inline__ void MR_rdtsc(MR_uint_least64_t *tsc);
// Return zero if parsing failed, otherwise return the number of cycles per
// second.
static MR_uint_least64_t parse_freq_from_x86_brand_string(char *string);
#endif // MR_GNUC && (__i386__ || __x86_64__)
// Probe the CPU for the features used by the parallel profiling code and
// record the results in file-level state:
//
//  - MR_rdtsc_is_available is set if CPUID page 1 reports RDTSC,
//  - MR_rdtscp_is_available is set if extended CPUID page 0x80000001
//    reports RDTSCP,
//  - MR_cpu_cycles_per_sec is set from the CPUID brand string; it is zero
//    if the clock speed could not be parsed from that string.
//
// Detection stops at the first step the processor does not support, leaving
// the remaining state untouched. This function does nothing unless it is
// compiled with GNU C for i386 or x86_64.
void
MR_do_cpu_feature_detection(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_Unsigned     a, b, c, d;
    MR_Unsigned     eflags, old_eflags;
    MR_Unsigned     maximum_extended_page;
    MR_Unsigned     extended_family, basic_family, family;
    MR_Unsigned     extended_model, model;

    // Check for the CPUID instruction. CPUID is supported if we can flip bit
    // 21 in the CPU's EFLAGS register. The assembly below is written in a
    // subset of i386 and x86_64 assembly. To read and write EFLAGS we have
    // to go via the C stack.
    __asm__ ("pushf; pop %0"
            :"=r"(eflags));
    old_eflags = eflags;
    // Flip bit 21
    eflags ^= (1 << 21);
    __asm__ ("push %0; popf; pushf; pop %0;"
            :"=r"(eflags)
            :"0"(eflags));

    // Test to see if our change held. We don't restore eflags, a change to
    // the ID bit has no effect.
    if (eflags == old_eflags) {
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "This CPU doesn't support the CPUID instruction.\n");
#endif
        return;
    }

    // CPUID 0 gives the maximum basic CPUID page in EAX. Basic pages go up to
    // but not including 0x40000000.
    MR_cpuid(0, 0, &a, &b, &c, &d);
    if (a < 1) {
        return;
    }

    // CPUID 1 gives type, family, model and stepping information in EAX.
    MR_cpuid(1, 0, &a, &b, &c, &d);

    // Bit 4 in EDX is high if RDTSC is available
    if (d & (1 << 4)) {
        MR_rdtsc_is_available = MR_TRUE;
    }

    // BTW: Intel can't count:
    //
    // http://www.pagetable.com/?p=18
    // http://www.codinghorror.com/blog/archives/000364.html
    //
    // 486 (1989): family 4
    // Pentium (1993): family 5
    // Pentium Pro (1995): family 6, models 0 and 1
    // Pentium 2 (1997): family 6, models 3, 5 and 6
    // Pentium 3 (2000): family 6, models 7, 8, 10, 11
    // Itanium (2001): family 7
    // Pentium 4 (2000): family 15/0
    // Itanium 2 (2002): family 15/1 and 15/2
    // Pentium D: family 15/4
    // Pentium M (2003): family 6, models 9 and 13
    // Core (2006): family 6, model 14
    // Core 2 (2006): family 6, model 15
    // i7: family 6, model 26
    // Atom: family 6, model 28
    //
    // This list is incomplete, it doesn't cover AMD or any other brand of x86
    // processor, and it probably doesn't cover all post-pentium Intel
    // processors.

    // bits 8-11 (first bit (LSB) is bit 0)
    basic_family = (a & 0x00000F00) >> 8;
    if (basic_family == 0x0F) {
        // bits 20-27
        extended_family = (a & 0x0FF00000) >> 20;
        family = basic_family + extended_family;
    } else {
        family = basic_family;
    }

    // I'm not using the model value but I'll leave the code here in case we
    // have a reason to use it in the future.
    // bits 4-7
    model = (a & 0x000000F0) >> 4;
    if ((basic_family == 0x0F) || (basic_family == 0x06)) {
        // bits 16-19
        extended_model = (a & 0x000F0000) >> 16;
        model += (extended_model << 4);
    }
#if MR_DEBUG_CPU_FEATURE_DETECTION
    // MR_Unsigned is word sized, so it must be cast to a type that has a
    // printf conversion. The values printed in this function all come from
    // 32-bit registers, so unsigned long is wide enough.
    fprintf(stderr, "This is family %lu and model %lu\n",
        (unsigned long) family, (unsigned long) model);
#endif

    // Now check for P3 or higher since they have the extended pages.
    if (family < 6) {
        // This is a 486 or Pentium.
        return;
    }

    // I could bail out here if this was a pentium 3, but there is a more
    // reliable check for extended CPUID support below that should work on AMD
    // chips as well, if I knew all the model numbers for all family 6
    // processors and knew if they honoured extended CPUID.

    // Extended CPUID 0x80000000.
    //
    // EAX contains the maximum extended CPUID node.
    MR_cpuid(0x80000000, 0, &a, &b, &c, &d);
    if ((a & 0x80000000) == 0) {
        // Extended CPUID is not supported.
        // Note that this check is still not as reliable as I'd like. If it
        // succeeds I'm not confident that the processor definitely implements
        // extended CPUID.
        return;
    }
    maximum_extended_page = a;
#if MR_DEBUG_CPU_FEATURE_DETECTION
    fprintf(stderr, "Maximum extended CPUID page: 0x%lx\n",
        (unsigned long) maximum_extended_page);
#endif

    // Extended CPUID 0x80000001
    //
    // If EDX bit 27 is set the RDTSCP instruction is available.
    if (maximum_extended_page >= 0x80000001) {
        MR_cpuid(0x80000001, 0, &a, &b, &c, &d);
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "CPUID 0x80000001 EDX: 0x%lx\n", (unsigned long) d);
#endif
        if ((d & (1 << 27))) {
            // This processor supports RDTSCP.
#if MR_DEBUG_CPU_FEATURE_DETECTION
            fprintf(stderr, "RDTSCP is available\n");
#endif
            MR_rdtscp_is_available = MR_TRUE;
        }
    }

    if (maximum_extended_page >= 0x80000004) {
        // 3 CPUID pages, 4 return registers each, containing 4 bytes each,
        // plus a null byte. Intel says they include their own null byte, but
        // for the cost of a single byte I feel safer using our own.
        #define CPUID_BRAND_STRING_SIZE (3*4*4 + 1)
        char            buff[CPUID_BRAND_STRING_SIZE];
        unsigned int    page;
        unsigned int    byte;
        unsigned int    shift;

        // This processor supports the brand string from which we can
        // try to extract the clock speed. This algorithm is described
        // in the Intel Instruction Set Reference, Volume 2B, Chapter 3,
        // Pages 207-208, In particular the flow chart in figure 3-10.
        // This does not work on AMD processors since they don't include
        // the clock speed in the brand string.
        for (page = 0; page < 3; page++) {
            MR_cpuid(page + 0x80000002, 0, &a, &b, &c, &d);
#if MR_DEBUG_CPU_FEATURE_DETECTION
            fprintf(stderr,
                "CPUID page: 0x%.8x, eax: 0x%.8lx, ebx: 0x%.8lx, "
                "ecx: 0x%.8lx, edx: 0x%.8lx\n",
                page + 0x80000002,
                (unsigned long) a, (unsigned long) b,
                (unsigned long) c, (unsigned long) d);
#endif
            // Each register holds four characters of the string, with the
            // first character in the least significant byte.
            for (byte = 0; byte < 4; byte++) {
                shift = byte * 8;
                buff[page*4*4 + 0 + byte] = (char)(0xFF & (a >> shift));
                buff[page*4*4 + 4 + byte] = (char)(0xFF & (b >> shift));
                buff[page*4*4 + 8 + byte] = (char)(0xFF & (c >> shift));
                buff[page*4*4 + 12 + byte] = (char)(0xFF & (d >> shift));
            }
        }
        // Add a null byte.
        buff[CPUID_BRAND_STRING_SIZE - 1] = 0;
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "CPUID Brand string: %s\n", buff);
#endif
        MR_cpu_cycles_per_sec = parse_freq_from_x86_brand_string(buff);
#if MR_DEBUG_CPU_FEATURE_DETECTION
        if (MR_cpu_cycles_per_sec == 0) {
            fprintf(stderr, "Failed to detect cycles per second "
                "you can probably blame AMD for this.\n");
        } else {
            fprintf(stderr, "Cycles per second: %llu\n",
                (unsigned long long) MR_cpu_cycles_per_sec);
        }
#endif
    }
#endif // MR_GNUC && (__i386__ || __x86_64__)
}
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
static MR_uint_least64_t
parse_freq_from_x86_brand_string(char *string)
{
unsigned int brand_string_len;
unsigned int i;
double multiplier;
int freq_index = -1;
brand_string_len = strlen(string);
// There will be at least five characters if we can parse this, three
// for the '?Hz' suffix, at least one for the units, plus a space at
// the beginning of the number.
if (!(brand_string_len > 5))
return 0;
if (!((string[brand_string_len - 1] == 'z') &&
(string[brand_string_len - 2] == 'H'))) {
return 0;
}
switch (string[brand_string_len - 3]) {
case 'M':
multiplier = 1000000.0;
break;
case 'G':
multiplier = 1000000000.0;
break;
case 'T':
// Yes, this is defined in the specification, Intel have some
// strong ambitions regarding Moore's law. :-)
// We include it here to conform with the standard.
multiplier = 1000000000000.0;
break;
default:
return 0;
}
// Search for the beginning of the digits.
for (i = brand_string_len - 4; i >= 0; i--) {
if (string[i] == ' ') {
freq_index = i+1;
break;
}
}
if (freq_index == -1) {
// We didn't find the beginning of the frequency.
return 0;
}
// If strtod fails it returns zero, so if we fail to parse a number here,
// we will return zero, which our caller understands as a parsing failure.
return (MR_uint_least64_t)(strtod(&string[freq_index], NULL) * multiplier);
}
#endif // MR_GNUC && (__i386__ || __x86_64__)
// Start timing an event: record the current timestamp counter in `timer'.
//
// With RDTSCP both the time and the processor id are stored; with only RDTSC
// just the time is stored. If neither instruction is available, or on other
// architectures and compilers, `timer' is left untouched. That is safe
// because the fields are only read after checking MR_rdtsc{p}_is_available.
void
MR_profiling_start_timer(MR_Timer *timer)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    if (MR_rdtscp_is_available) {
        MR_rdtscp(&timer->MR_timer_time, &timer->MR_timer_processor_id);
        return;
    }

    if (MR_rdtsc_is_available) {
        MR_rdtsc(&timer->MR_timer_time);
    }
#endif
}
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
// Fold one measured interval, running from timestamp `start' to timestamp
// `stop', into `stats': bump the count of recorded events and add the
// duration and its square to the running sums.
static void
MR_profiling_record_duration(MR_Stats *stats, MR_uint_least64_t start,
    MR_uint_least64_t stop)
{
    MR_int_least64_t    duration;
    MR_uint_least64_t   duration_squared;

    duration = stop - start;
    // Square in unsigned arithmetic. A signed multiplication overflows,
    // which is undefined behaviour, once the duration exceeds about 3.04e9
    // cycles; the unsigned product gives the same value whenever the signed
    // one is representable and simply wraps otherwise.
    duration_squared =
        (MR_uint_least64_t) duration * (MR_uint_least64_t) duration;

    MR_atomic_inc_uint(&(stats->MR_stat_count_recorded));
#if MR_LOW_TAG_BITS >= 3
    // NOTE(review): MR_LOW_TAG_BITS >= 3 presumably means words are 64 bits
    // wide, so the word-sized atomic operations can update the 64-bit sums
    // directly -- confirm.
    MR_atomic_add_int(&(stats->MR_stat_sum), duration);
    MR_atomic_add_uint(&(stats->MR_stat_sum_squares), duration_squared);
#else
    MR_US_SPIN_LOCK(&(stats->MR_stat_sums_lock));
    stats->MR_stat_sum += duration;
    stats->MR_stat_sum_squares += duration_squared;
    MR_US_UNLOCK(&(stats->MR_stat_sums_lock));
#endif
}
#endif // MR_GNUC && (__i386__ || __x86_64__)

// Stop timing an event that was started with MR_profiling_start_timer() on
// `timer', and account for it in `stats'.
//
// With RDTSCP the duration is recorded only if the event started and
// finished on the same processor; otherwise it is counted in
// MR_stat_count_not_recorded, since timestamps from different processors are
// not compared. With only RDTSC the duration is always recorded. If neither
// instruction is available nothing is updated.
//
// On other architectures and compilers there is no TSC, so the event is
// only counted, in MR_stat_count_recorded.
void
MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_Timer    now;

    if (MR_rdtscp_is_available) {
        MR_rdtscp(&(now.MR_timer_time), &(now.MR_timer_processor_id));
        if (timer->MR_timer_processor_id == now.MR_timer_processor_id) {
            MR_profiling_record_duration(stats, timer->MR_timer_time,
                now.MR_timer_time);
        } else {
            MR_atomic_inc_uint(&(stats->MR_stat_count_not_recorded));
        }
    } else if (MR_rdtsc_is_available) {
        MR_rdtsc(&(now.MR_timer_time));
        MR_profiling_record_duration(stats, timer->MR_timer_time,
            now.MR_timer_time);
    }
#else // not MR_GNUC && (__i386__ || __x86_64__)
    // No TSC support on this architecture or with this C compiler.
    // Use the unsigned increment, as every other update of this counter
    // does.
    MR_atomic_inc_uint(&(stats->MR_stat_count_recorded));
#endif // not MR_GNUC && (__i386__ || __x86_64__)
}
// Report whether timestamp counter readings can be turned into times:
// true only if RDTSCP or RDTSC is available and the clock speed
// (MR_cpu_cycles_per_sec) is known. Always false when there is no TSC
// support for this architecture or C compiler.
extern MR_bool
MR_tsc_is_sensible(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    const int   have_tsc = (MR_rdtscp_is_available || MR_rdtsc_is_available);
    const int   have_clock_speed = (MR_cpu_cycles_per_sec != 0);

    return (have_tsc && have_clock_speed);
#else
    return MR_FALSE;
#endif
}
// Return the current value of the CPU's timestamp counter as read by RDTSC.
// Returns zero if RDTSC is not available, or if there is no TSC support for
// this architecture or C compiler.
MR_uint_least64_t
MR_read_cpu_tsc(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_uint_least64_t   tsc = 0;

    if (MR_rdtsc_is_available) {
        MR_rdtsc(&tsc);
    }
    return tsc;
#else // not MR_GNUC && (__i386__ || __x86_64__)
    return 0;
#endif // not MR_GNUC && (__i386__ || __x86_64__)
}
// It is convenient that this instruction is the same on both i386 and x86_64.
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
// Execute the CPUID instruction with `code' in EAX and `sub_code' in ECX,
// and store the resulting EAX, EBX, ECX and EDX values in *a, *b, *c and *d
// respectively.
static __inline__ void
MR_cpuid(MR_Unsigned code, MR_Unsigned sub_code,
MR_Unsigned *a, MR_Unsigned *b, MR_Unsigned *c, MR_Unsigned *d)
{
#ifdef __x86_64__
// The "0" and "2" input constraints tie `code' and `sub_code' to the
// registers of output operands 0 (EAX) and 2 (ECX).
__asm__("cpuid"
: "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
: "0"(code), "2"(sub_code));
#elif defined(__i386__)
// i386 is more register starved, in particular we can't use ebx in
// position independent code. And we can't move ebx into another
// general purpose register, between register pinning, PIC, the
// stack and frame pointers and the other registers used by CPUID
// there are literally no general purpose registers left on i386.
//
// So ebx is saved on the stack around the instruction, and the value that
// CPUID leaves in it is copied straight to memory (operand 1, *b) before
// ebx is restored.
//
// NOTE(review): the copy to operand 1 happens after the pushl has moved
// esp and after CPUID has overwritten eax, ecx and edx. If the compiler
// addresses *b relative to esp or through one of those registers, the
// store would go to the wrong place -- confirm against the generated code.
__asm__("pushl %%ebx; \
cpuid; \
movl %%ebx, %1; \
popl %%ebx;"
: "=a"(*a), "=m"(*b), "=c"(*c), "=d"(*d)
: "0"(code), "2"(sub_code)
: "memory");
#endif
}
// Execute the RDTSCP instruction. The 64-bit timestamp counter is stored in
// *tsc, and the value the instruction leaves in ECX, which callers use to
// identify the processor the timestamp was read on, is stored in
// *processor_id.
static __inline__ void
MR_rdtscp(MR_uint_least64_t *tsc, MR_Unsigned *processor_id)
{
    MR_Unsigned tsc_low;
    MR_Unsigned tsc_high;

    // On 64bit systems the high 32 bits of RAX and RDX are 0 filled by
    // rdtsc{p}.
    //
    // The asm must be volatile: it has no inputs, so without the qualifier
    // the compiler may assume that it always produces the same outputs and
    // merge or hoist repeated reads of the counter.
    __asm__ __volatile__("rdtscp"
        : "=a"(tsc_low), "=d"(tsc_high), "=c"(*processor_id));

    // The counter comes back in two halves: EDX holds the high 32 bits and
    // EAX the low 32 bits.
    *tsc = (((MR_uint_least64_t) tsc_high) << 32) | tsc_low;
}
// Execute the RDTSC instruction and store the 64-bit timestamp counter in
// *tsc.
static __inline__ void
MR_rdtsc(MR_uint_least64_t *tsc)
{
    MR_Unsigned tsc_low;
    MR_Unsigned tsc_high;

    // The asm must be volatile: it has no inputs, so without the qualifier
    // the compiler may assume that it always produces the same outputs and
    // merge or hoist repeated reads of the counter.
    __asm__ __volatile__("rdtsc"
        : "=a"(tsc_low), "=d"(tsc_high));

    // The counter comes back in two halves: EDX holds the high 32 bits and
    // EAX the low 32 bits.
    *tsc = (((MR_uint_least64_t) tsc_high) << 32) | tsc_low;
}
#endif // MR_GNUC && (__i386__ || __x86_64__)
#endif // MR_PROFILE_PARALLEL_EXECUTION_SUPPORT