mercury/runtime/mercury_atomic_ops.c

// vim: ts=4 sw=4 expandtab ft=c

// Copyright (C) 2007, 2009-2011 The University of Melbourne.
// Copyright (C) 2014-2016, 2018 The Mercury team.
// This file is distributed under the terms specified in COPYING.LIB.

// mercury_atomic_ops.c

#include "mercury_imp.h"
#include "mercury_atomic_ops.h"

////////////////////////////////////////////////////////////////////////////

#if defined(MR_THREAD_SAFE)

// Definitions for the atomic functions declared `extern inline'.

// Out-of-line definition of MR_compare_and_swap_int(), emitted through
// MR_OUTLINE_DEFN. The body is the MR_COMPARE_AND_SWAP_WORD_BODY macro, which
// is not defined in this file and is also used by the unsigned variant.
// Presumably it compares *addr with `old' and stores `new_val' on a match,
// returning whether the swap happened -- confirm against the macro.
MR_OUTLINE_DEFN(
    MR_bool
    MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
        MR_Integer new_val)
,
    {
        MR_COMPARE_AND_SWAP_WORD_BODY;
    }
)

// Out-of-line definition of MR_compare_and_swap_uint(). It uses the same
// word-sized body macro, MR_COMPARE_AND_SWAP_WORD_BODY, as the signed
// variant; only the parameter types differ. The macro itself is defined
// outside this file.
MR_OUTLINE_DEFN(
    MR_bool
    MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
        MR_Unsigned new_val)
,
    {
        MR_COMPARE_AND_SWAP_WORD_BODY;
    }
)

// Out-of-line definition of MR_atomic_add_and_fetch_int(). The body comes
// from MR_ATOMIC_ADD_AND_FETCH_INT_BODY, defined outside this file.
// Judging by the name it adds `addend' to *addr and returns the new value --
// confirm against the macro.
MR_OUTLINE_DEFN(
    MR_Integer
    MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend)
,
    {
        MR_ATOMIC_ADD_AND_FETCH_INT_BODY;
    }
)

// Out-of-line definition of MR_atomic_add_and_fetch_uint(), the unsigned
// counterpart of MR_atomic_add_and_fetch_int(). The body comes from
// MR_ATOMIC_ADD_AND_FETCH_UINT_BODY, defined outside this file.
MR_OUTLINE_DEFN(
    MR_Unsigned
    MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
,
    {
        MR_ATOMIC_ADD_AND_FETCH_UINT_BODY;
    }
)

// Out-of-line definition of MR_atomic_add_int(), which returns nothing.
// The body comes from MR_ATOMIC_ADD_INT_BODY, defined outside this file;
// presumably it atomically adds `addend' to *addr.
MR_OUTLINE_DEFN(
    void
    MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
,
    {
        MR_ATOMIC_ADD_INT_BODY;
    }
)

// Out-of-line definition of MR_atomic_add_uint(), the unsigned counterpart
// of MR_atomic_add_int(). The body comes from MR_ATOMIC_ADD_UINT_BODY,
// defined outside this file.
MR_OUTLINE_DEFN(
    void
    MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
,
    {
        MR_ATOMIC_ADD_UINT_BODY;
    }
)

// Out-of-line definition of MR_atomic_sub_int(), which returns nothing.
// The body comes from MR_ATOMIC_SUB_INT_BODY, defined outside this file;
// presumably it atomically subtracts `x' from *addr.
MR_OUTLINE_DEFN(
    void
    MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
,
    {
        MR_ATOMIC_SUB_INT_BODY;
    }
)

// Out-of-line definition of MR_atomic_inc_int(), which returns nothing.
// The body comes from MR_ATOMIC_INC_INT_BODY, defined outside this file;
// presumably it atomically increments *addr by one.
MR_OUTLINE_DEFN(
    void
    MR_atomic_inc_int(volatile MR_Integer *addr)
,
    {
        MR_ATOMIC_INC_INT_BODY;
    }
)

// Out-of-line definition of MR_atomic_inc_uint(), the unsigned counterpart
// of MR_atomic_inc_int(). The body comes from MR_ATOMIC_INC_UINT_BODY,
// defined outside this file. The profiling code in this file uses it to
// bump the MR_Stats event counters.
MR_OUTLINE_DEFN(
    void
    MR_atomic_inc_uint(volatile MR_Unsigned *addr)
,
    {
        MR_ATOMIC_INC_UINT_BODY;
    }
)

// Out-of-line definition of MR_atomic_dec_int(), which returns nothing.
// The body comes from MR_ATOMIC_DEC_INT_BODY, defined outside this file;
// presumably it atomically decrements *addr by one.
MR_OUTLINE_DEFN(
    void
    MR_atomic_dec_int(volatile MR_Integer *addr)
,
    {
        MR_ATOMIC_DEC_INT_BODY;
    }
)

// Out-of-line definition of MR_atomic_dec_and_is_zero_int(). The body comes
// from MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY, defined outside this file.
// Judging by the name, the MR_bool result says whether the decrement brought
// *addr to zero -- confirm against the macro.
MR_OUTLINE_DEFN(
    MR_bool
    MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr)
,
    {
        MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY;
    }
)

// Out-of-line definition of MR_atomic_dec_and_is_zero_uint(), the unsigned
// counterpart of MR_atomic_dec_and_is_zero_int(). The body comes from
// MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY, defined outside this file.
MR_OUTLINE_DEFN(
    MR_bool
    MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr)
,
    {
        MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY;
    }
)

#endif // MR_THREAD_SAFE

////////////////////////////////////////////////////////////////////////////

#if defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)

// Profiling of the parallel runtime.

#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
// True if the RDTSCP and RDTSC instructions are available respectively.
// Both start out false and are only ever set to MR_TRUE by
// MR_do_cpu_feature_detection(), so the timer functions in this file read
// no time stamps until detection has been run.

static MR_bool  MR_rdtscp_is_available = MR_FALSE;
static MR_bool  MR_rdtsc_is_available = MR_FALSE;
#endif

// CPU clock frequency in cycles per second. It stays zero unless
// MR_do_cpu_feature_detection() manages to parse a frequency out of the
// CPUID brand string; MR_tsc_is_sensible() treats zero as "not known".
MR_uint_least64_t MR_cpu_cycles_per_sec = 0;

#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))

// Set this to 1 to enable the diagnostic fprintf(stderr, ...) calls in
// MR_do_cpu_feature_detection() below.
#define MR_DEBUG_CPU_FEATURE_DETECTION 0

// cpuid, rdtscp and rdtsc are i386/x86_64 instructions.

// Execute CPUID with `code' in EAX and `sub_code' in ECX, and store the
// resulting EAX, EBX, ECX and EDX values through a, b, c and d.

static __inline__ void  MR_cpuid(MR_Unsigned code, MR_Unsigned sub_code,
                            MR_Unsigned *a, MR_Unsigned *b,
                            MR_Unsigned *c, MR_Unsigned *d);

// Execute RDTSCP. The 64 bit time stamp counter is stored in *tsc and the
// value the instruction leaves in ECX is stored in *processor_id.

static __inline__ void  MR_rdtscp(MR_uint_least64_t *tsc,
                            MR_Unsigned *processor_id);

// Execute RDTSC and store the 64 bit time stamp counter in *tsc.

static __inline__ void  MR_rdtsc(MR_uint_least64_t *tsc);

// Return zero if parsing failed, otherwise return the number of cycles per
// second.

static MR_uint_least64_t parse_freq_from_x86_brand_string(char *string);

#endif // MR_GNUC && (__i386__ || __x86_64__)

// Probe the CPU for the features the parallel profiling code relies on.
//
// On GNU C for i386/x86_64 this:
//   - checks that the CPUID instruction exists,
//   - sets MR_rdtsc_is_available if CPUID page 1 reports RDTSC,
//   - sets MR_rdtscp_is_available if extended page 0x80000001 reports RDTSCP,
//   - sets MR_cpu_cycles_per_sec from the brand string on extended pages
//     0x80000002-0x80000004 (zero if it cannot be parsed).
// Detection stops early, leaving the remaining variables at their initial
// values, as soon as a required CPUID facility turns out to be missing.
//
// With any other compiler or architecture this function does nothing.

void
MR_do_cpu_feature_detection(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_Unsigned     a, b, c, d;
    MR_Unsigned     eflags, old_eflags;
    MR_Unsigned     maximum_extended_page;
    MR_Unsigned     extended_family, basic_family, family;
    MR_Unsigned     extended_model, model;

    // Check for the CPUID instruction. CPUID is supported if we can flip bit
    // 21 in the CPU's EFLAGS register. The assembly below is written in a
    // subset of i386 and x86_64 assembly. To read and write EFLAGS we have
    // to go via the C stack.

    __asm__ ("pushf; pop %0"
            :"=r"(eflags));
    old_eflags = eflags;
    // Flip bit 21
    eflags ^= (1 << 21);
    __asm__ ("push %0; popf; pushf; pop %0;"
            :"=r"(eflags)
            :"0"(eflags));

    // Test to see if our change held. We don't restore eflags, a change to
    // the ID bit has no effect.

    if (eflags == old_eflags) {
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "This CPU doesn't support the CPUID instruction.\n");
#endif
        return;
    }

    // CPUID 0 gives the maximum basic CPUID page in EAX. Basic pages go up to
    // but not including 0x40000000.

    MR_cpuid(0, 0, &a, &b, &c, &d);
    if (a < 1) {
        return;
    }

    // CPUID 1 gives type, family, model and stepping information in EAX.
    MR_cpuid(1, 0, &a, &b, &c, &d);

    // Bit 4 in EDX is high if RDTSC is available
    if (d & (1 << 4)) {
        MR_rdtsc_is_available = MR_TRUE;
    }

    // BTW: Intel can't count:
    //
    // http://www.pagetable.com/?p=18
    // http://www.codinghorror.com/blog/archives/000364.html
    //
    // 486 (1989): family 4
    // Pentium (1993): family 5
    // Pentium Pro (1995): family 6, models 0 and 1
    // Pentium 2 (1997): family 6, models 3, 5 and 6
    // Pentium 3 (2000): family 6, models 7, 8, 10, 11
    // Itanium (2001): family 7
    // Pentium 4 (2000): family 15/0
    // Itanium 2 (2002): family 15/1 and 15/2
    // Pentium D: family 15/4
    // Pentium M (2003): family 6, models 9 and 13
    // Core (2006): family 6, model 14
    // Core 2 (2006): family 6, model 15
    // i7: family 6, model 26
    // Atom: family 6, model 28
    //
    // This list is incomplete, it doesn't cover AMD or any other brand of x86
    // processor, and it probably doesn't cover all post-pentium Intel
    // processors.

    // bits 8-11 (first bit (LSB) is bit 0)
    basic_family = (a & 0x00000F00) >> 8;
    if (basic_family == 0x0F) {
        // bits 20-27
        extended_family = (a & 0x0FF00000) >> 20;
        family = basic_family + extended_family;
    } else {
        family = basic_family;
    }

    // I'm not using the model value but I'll leave the code here in case we
    // have a reason to use it in the future.

    // bits 4-7
    model = (a & 0x000000F0) >> 4;
    if ((basic_family == 0x0F) || (basic_family == 0x06)) {
        // bits 16-19
        extended_model = (a & 0x000F0000) >> 16;
        model += (extended_model << 4);
    }
#if MR_DEBUG_CPU_FEATURE_DETECTION
    // MR_Unsigned is word sized, so it is cast to a type that has a printf
    // length modifier rather than being passed to a plain %d or %x.
    fprintf(stderr, "This is family %lu and model %lu\n",
        (unsigned long) family, (unsigned long) model);
#endif

    // Now check for P3 or higher since they have the extended pages.
    if (family < 6) {
        // This is a 486 or Pentium.
        return;
    }
    // I could bail out here if this was a pentium 3, but there is a more
    // reliable check for extended CPUID support below that should work on AMD
    // chips as well, if I knew all the model numbers for all family 6
    // processors and knew if they honoured extended CPUID.

    // Extended CPUID 0x80000000.
    //
    // EAX contains the maximum extended CPUID node.

    MR_cpuid(0x80000000, 0, &a, &b, &c, &d);
    if ((a & 0x80000000) == 0) {
        // Extended CPUID is not supported.
        // Note that this check is still not as reliable as I'd like. If it
        // succeeds I'm not confident that the processor definitely implements
        // extended CPUID.

        return;
    }
    maximum_extended_page = a;
#if MR_DEBUG_CPU_FEATURE_DETECTION
    fprintf(stderr, "Maximum extended CPUID page: 0x%lx\n",
        (unsigned long) maximum_extended_page);
#endif

    // Extended CPUID 0x80000001
    //
    // If EDX bit 27 is set the RDTSCP instruction is available.

    if (maximum_extended_page >= 0x80000001) {
        MR_cpuid(0x80000001, 0, &a, &b, &c, &d);
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "CPUID 0x80000001 EDX: 0x%lx\n", (unsigned long) d);
#endif
        if ((d & (1 << 27))) {
            // This processor supports RDTSCP.

#if MR_DEBUG_CPU_FEATURE_DETECTION
            fprintf(stderr, "RDTSCP is available\n");
#endif
            MR_rdtscp_is_available = MR_TRUE;
        }
    }

    if (maximum_extended_page >= 0x80000004) {
        // 3 CPUID pages, 4 return registers each, containing 4 bytes each,
        // plus a null byte. Intel says they include their own null byte, but
        // for the cost of a single byte I feel safer using our own.

#define CPUID_BRAND_STRING_SIZE (3*4*4 + 1)
        char buff[CPUID_BRAND_STRING_SIZE];
        unsigned int page;
        unsigned int byte;
        unsigned int shift;

        // This processor supports the brand string from which we can
        // try to extract the clock speed. This algorithm is described
        // in the Intel Instruction Set Reference, Volume 2B, Chapter 3,
        // Pages 207-208, In particular the flow chart in figure 3-10.
        // This does not work on AMD processors since they don't include
        // the clock speed in the brand string.

        for (page = 0; page < 3; page++) {
            MR_cpuid(page + 0x80000002, 0, &a, &b, &c, &d);
#if MR_DEBUG_CPU_FEATURE_DETECTION
            fprintf(stderr, "CPUID page: 0x%.8x, eax: 0x%.8lx, ebx: 0x%.8lx, ecx: 0x%.8lx, edx: 0x%.8lx\n",
                page + 0x80000002, (unsigned long) a, (unsigned long) b,
                (unsigned long) c, (unsigned long) d);
#endif
            // Each register holds four characters of the string, lowest
            // byte first; the registers are laid out in EAX, EBX, ECX, EDX
            // order within the 16 bytes belonging to this page.
            for (byte = 0; byte < 4; byte++) {
                shift = byte * 8;
                buff[page*4*4 + 0 + byte] = (char)(0xFF & (a >> shift));
                buff[page*4*4 + 4 + byte] = (char)(0xFF & (b >> shift));
                buff[page*4*4 + 8 + byte] = (char)(0xFF & (c >> shift));
                buff[page*4*4 + 12 + byte] = (char)(0xFF & (d >> shift));
            }
        }
        // Add a null byte.
        buff[CPUID_BRAND_STRING_SIZE - 1] = 0;
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "CPUID Brand string: %s\n", buff);
#endif

        MR_cpu_cycles_per_sec = parse_freq_from_x86_brand_string(buff);
#if MR_DEBUG_CPU_FEATURE_DETECTION
        if (MR_cpu_cycles_per_sec == 0) {
            fprintf(stderr, "Failed to detect cycles per second "
                "you can probably blame AMD for this.\n");
        } else {
            fprintf(stderr, "Cycles per second: %llu\n",
                (unsigned long long) MR_cpu_cycles_per_sec);
        }
#endif
    }
#endif // MR_GNUC && (__i386__ || __x86_64__)
}

#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
static MR_uint_least64_t
parse_freq_from_x86_brand_string(char *string)
{
    unsigned int brand_string_len;
    unsigned int i;
    double       multiplier;
    int          freq_index = -1;

    brand_string_len = strlen(string);

    // There will be at least five characters if we can parse this, three
    // for the '?Hz' suffix, at least one for the units, plus a space at
    // the beginning of the number.

    if (!(brand_string_len > 5))
        return 0;

    if (!((string[brand_string_len - 1] == 'z') &&
          (string[brand_string_len - 2] == 'H'))) {
        return 0;
    }

    switch (string[brand_string_len - 3]) {
        case 'M':
            multiplier = 1000000.0;
            break;
        case 'G':
            multiplier = 1000000000.0;
            break;
        case 'T':
            // Yes, this is defined in the specification, Intel have some
            // strong ambitions regarding Moore's law. :-)
            // We include it here to conform with the standard.

            multiplier = 1000000000000.0;
            break;
        default:
            return 0;
    }

    // Search for the beginning of the digits.
    for (i = brand_string_len - 4; i >= 0; i--) {
        if (string[i] == ' ') {
            freq_index = i+1;
            break;
        }
    }
    if (freq_index == -1) {
        // We didn't find the beginning of the frequency.
        return 0;
    }

    // If strtod fails it returns zero, so if we fail to parse a number here,
    // we will return zero, which our caller understands as a parsing failure.

    return (MR_uint_least64_t)(strtod(&string[freq_index], NULL) * multiplier);
}
#endif // MR_GNUC && (__i386__ || __x86_64__)

// Stamp *timer with the current time stamp counter so that a later call to
// MR_profiling_stop_timer() can work out how long the interval lasted.
//
// RDTSCP is used when it is available, in which case the processor id is
// stored too; otherwise plain RDTSC fills in the time only. If neither
// instruction is available, or this is not GNU C on i386/x86_64, *timer is
// left untouched. That is safe because nothing reads those fields without
// first checking MR_rdtsc{p}_is_available.

void
MR_profiling_start_timer(MR_Timer *timer)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    if (MR_rdtscp_is_available) {
        MR_rdtscp(&timer->MR_timer_time, &timer->MR_timer_processor_id);
        return;
    }

    if (MR_rdtsc_is_available) {
        MR_rdtsc(&timer->MR_timer_time);
    }
#endif
}

#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))

// Fold one measured interval, running from time stamp `start' to time stamp
// `stop', into *stats: bump the recorded count and add the duration and its
// square to the running sums.

static void
MR_profiling_record_interval(MR_Stats *stats, MR_uint_least64_t start,
    MR_uint_least64_t stop)
{
    MR_int_least64_t    duration;
    MR_uint_least64_t   duration_squared;

    duration = stop - start;
    // Square in unsigned arithmetic. A signed multiply overflows, which is
    // undefined, once the interval exceeds roughly 3e9 cycles; the unsigned
    // product is well defined and has the same low 64 bits.
    duration_squared = (MR_uint_least64_t) duration *
        (MR_uint_least64_t) duration;

    MR_atomic_inc_uint(&(stats->MR_stat_count_recorded));
  #if MR_LOW_TAG_BITS >= 3
    MR_atomic_add_int(&(stats->MR_stat_sum), duration);
    MR_atomic_add_uint(&(stats->MR_stat_sum_squares), duration_squared);
  #else
    // The sums are updated under a spin lock instead of with the word
    // sized atomic operations in this configuration.
    MR_US_SPIN_LOCK(&(stats->MR_stat_sums_lock));
    stats->MR_stat_sum += duration;
    stats->MR_stat_sum_squares += duration_squared;
    MR_US_UNLOCK(&(stats->MR_stat_sums_lock));
  #endif
}

#endif // MR_GNUC && (__i386__ || __x86_64__)

// Finish the interval started by MR_profiling_start_timer(timer) and account
// for it in *stats.
//
// With RDTSCP the interval is only timed if the start and stop readings
// report the same processor id; otherwise the event is counted in
// MR_stat_count_not_recorded and no duration is added (presumably because
// time stamps from different processors are not comparable). With only
// RDTSC every interval is timed. If neither instruction is available
// nothing is updated. On other architectures or compilers the event is
// counted in MR_stat_count_recorded without a duration.

void
MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_Timer            now;

    if (MR_rdtscp_is_available) {
        MR_rdtscp(&(now.MR_timer_time), &(now.MR_timer_processor_id));
        if (timer->MR_timer_processor_id == now.MR_timer_processor_id) {
            MR_profiling_record_interval(stats, timer->MR_timer_time,
                now.MR_timer_time);
        } else {
            MR_atomic_inc_uint(&(stats->MR_stat_count_not_recorded));
        }
    } else if (MR_rdtsc_is_available) {
        MR_rdtsc(&(now.MR_timer_time));
        MR_profiling_record_interval(stats, timer->MR_timer_time,
            now.MR_timer_time);
    }
#else // not MR_GNUC && (__i386__ || __x86_64__)
    // No TSC support on this architecture or with this C compiler.
    // Use the unsigned increment, as every other update of this counter
    // does.
    MR_atomic_inc_uint(&(stats->MR_stat_count_recorded));
#endif // not MR_GNUC && (__i386__ || __x86_64__)
}

// Report whether time stamp counter readings can be converted into real
// time: at least one of RDTSCP and RDTSC must have been detected and
// MR_cpu_cycles_per_sec must be nonzero. Always false when not compiling
// with GNU C for i386/x86_64.

extern MR_bool
MR_tsc_is_sensible(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_bool tsc_readable;
    MR_bool frequency_known;

    tsc_readable = (MR_rdtscp_is_available || MR_rdtsc_is_available);
    frequency_known = (MR_cpu_cycles_per_sec != 0);

    return (tsc_readable && frequency_known);
#else
    return MR_FALSE;
#endif
}

// Return the current value of the CPU's time stamp counter as read by RDTSC.
// The result is zero if RDTSC has not been detected, and always zero when
// not compiling with GNU C for i386/x86_64. Only MR_rdtsc_is_available is
// consulted here; RDTSCP is never used by this function.

MR_uint_least64_t
MR_read_cpu_tsc(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_uint_least64_t   tsc = 0;

    if (MR_rdtsc_is_available) {
        MR_rdtsc(&tsc);
    }
    return tsc;
#else // not MR_GNUC && (__i386__ || __x86_64__)
    return 0;
#endif // not MR_GNUC && (__i386__ || __x86_64__)
}

// It is convenient that the cpuid instruction is the same on both i386 and
// x86_64.

#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))

// Execute the CPUID instruction with `code' in EAX and `sub_code' in ECX,
// and store the values it leaves in EAX, EBX, ECX and EDX through a, b, c
// and d respectively.
//
// The caller is responsible for knowing that CPUID exists and that the
// requested page is supported; see MR_do_cpu_feature_detection().

static __inline__ void
MR_cpuid(MR_Unsigned code, MR_Unsigned sub_code,
    MR_Unsigned *a, MR_Unsigned *b, MR_Unsigned *c, MR_Unsigned *d)
{
#ifdef __x86_64__
    // Operands 0 and 2 (EAX and ECX) double as the inputs.
    __asm__("cpuid"
        : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
        : "0"(code), "2"(sub_code));
#elif defined(__i386__)
    // i386 is more register starved, in particular we can't use ebx in
    // position independent code. And we can't move ebx into another
    // general purpose register, between register pinning, PIC, the
    // stack and frame pointers and the other registers used by CPUID
    // there are literally no general purpose registers left on i386.
    //
    // So ebx is saved on the stack around the instruction and its result
    // is written straight to memory (operand 1) before ebx is restored.
    //
    // NOTE(review): operand 1 is stored while the pushed ebx is still on
    // the stack. If the compiler ever addressed *b relative to %esp the
    // store would be off by one word -- confirm this cannot happen with the
    // compilation flags in use.

    __asm__("pushl %%ebx;    \
             cpuid;          \
             movl %%ebx, %1; \
             popl %%ebx;"
        : "=a"(*a), "=m"(*b), "=c"(*c), "=d"(*d)
        : "0"(code), "2"(sub_code)
        : "memory");
#endif
}

// Read the time stamp counter with the RDTSCP instruction.
//
// The 64 bit counter value is stored in *tsc, and the value the instruction
// leaves in ECX is stored in *processor_id. The caller must have checked
// that RDTSCP is available (see MR_rdtscp_is_available).

static __inline__ void
MR_rdtscp(MR_uint_least64_t *tsc, MR_Unsigned *processor_id)
{
    MR_Unsigned tsc_low;
    MR_Unsigned tsc_high;

    // On 64bit systems the high 32 bits of RAX and RDX are 0 filled by
    // rdtsc{p}.
    //
    // The asm must be volatile: it has outputs but no inputs, so without
    // the qualifier the compiler is entitled to assume that it always
    // produces the same result and to merge, hoist or drop reads.

    __asm__ __volatile__("rdtscp"
           : "=a"(tsc_low), "=d"(tsc_high), "=c"(*processor_id));

    // Assemble the counter from the high half (EDX) and low half (EAX).
    *tsc = tsc_high;
    *tsc = *tsc << 32;
    *tsc |= tsc_low;
}

// Read the time stamp counter with the RDTSC instruction and store the
// 64 bit value in *tsc. The caller must have checked that RDTSC is
// available (see MR_rdtsc_is_available).

static __inline__ void
MR_rdtsc(MR_uint_least64_t *tsc)
{
    MR_Unsigned tsc_low;
    MR_Unsigned tsc_high;

    // The asm must be volatile: it has outputs but no inputs, so without
    // the qualifier the compiler is entitled to assume that it always
    // produces the same result and to merge, hoist or drop reads.

    __asm__ __volatile__("rdtsc"
           : "=a"(tsc_low), "=d"(tsc_high));

    // Assemble the counter from the high half (EDX) and low half (EAX).
    *tsc = tsc_high;
    *tsc = *tsc << 32;
    *tsc |= tsc_low;
}

#endif // MR_GNUC && (__i386__ || __x86_64__)

#endif // MR_PROFILE_PARALLEL_EXECUTION_SUPPORT