// vim: ts=4 sw=4 expandtab ft=c

// Copyright (C) 2007, 2009-2011 The University of Melbourne.
// Copyright (C) 2016, 2018 The Mercury team.
// This file is distributed under the terms specified in COPYING.LIB.

// mercury_atomic_ops.h - defines atomic operations and other primitives
// used by the parallel runtime.
//
// XXX we should have a version of this module that uses C11 atomics
// where possible.

#ifndef MERCURY_ATOMIC_OPS_H
#define MERCURY_ATOMIC_OPS_H

#include "mercury_std.h"

////////////////////////////////////////////////////////////////////////////

// MR_THREADSAFE_VOLATILE expands to the `volatile' qualifier in threadsafe
// grades (MR_THREAD_SAFE defined) and to nothing in all other grades.

#ifndef MR_THREAD_SAFE
  #define MR_THREADSAFE_VOLATILE
#else
  #define MR_THREADSAFE_VOLATILE    volatile
#endif

#if defined(MR_THREAD_SAFE)

// Intel and AMD support a pause instruction that is roughly equivalent
// to a no-op. Intel recommend that it is used in spin-loops to improve
// performance. Without a pause instruction, multiple simultaneous
// read-requests will be in-flight for the synchronization variable from a
// single thread. Giving the pause instruction causes these to be executed
// in sequence, allowing the processor to handle the change in the
// synchronization variable more easily.
//
// On some chips it may cause the spin-loop to use less power.
//
// This instruction was introduced with the Pentium 4 but is backwards
// compatible. This works because the two byte instruction for PAUSE is
// equivalent to the NOP instruction prefixed by REPE. Therefore older
// processors perform a no-op.
//
// This is not really an atomic instruction but we name it MR_ATOMIC_PAUSE
// for consistency.
//
// References: Intel and AMD documentation for PAUSE, Intel optimisation guide.

#if defined(MR_DO_NOT_USE_CPU_RELAX) ||                                 \
    !( defined(MR_CLANG) || defined(MR_GNUC) ) ||                       \
    !( defined(__i386__) || defined(__x86_64__) )

    // Either the use of pause has been disabled, or we are not compiling
    // for x86/x86_64 with a compiler that accepts GCC-style inline
    // assembler: MR_ATOMIC_PAUSE expands to an empty statement.
    #define MR_ATOMIC_PAUSE                                                 \
        do {                                                                \
        } while (0)

#else

    // Emit the x86 pause instruction.
    #define MR_ATOMIC_PAUSE                                                 \
        do {                                                                \
            __asm__ __volatile__("pause");                                  \
        } while (0)

#endif

////////////////////////////////////////////////////////////////////////////

// Declarations for inline atomic operations.
//
// These operations work on machine word-sized values, this is distinct from
// C's idea of 'int' and 'unsigned int'. MR_Integer and MR_Unsigned are
// supposed to be machine word sized so these functions should only be used
// with values of these types.

// Atomic compare and swap: if the value at addr is equal to old, store
// new_val at addr and return true. Otherwise return false.
// One version for signed words and one for unsigned words.

MR_EXTERN_INLINE MR_bool    MR_compare_and_swap_int(
                                volatile MR_Integer *addr,
                                MR_Integer old, MR_Integer new_val);
MR_EXTERN_INLINE MR_bool    MR_compare_and_swap_uint(
                                volatile MR_Unsigned *addr,
                                MR_Unsigned old, MR_Unsigned new_val);

// Atomically add addend to the word at addr and return the value that the
// word holds after the addition. In other words, an atomic pre-increment
// operation by an arbitrary amount.

MR_EXTERN_INLINE MR_Integer MR_atomic_add_and_fetch_int(
                                volatile MR_Integer *addr, MR_Integer addend);
MR_EXTERN_INLINE MR_Unsigned MR_atomic_add_and_fetch_uint(
                                volatile MR_Unsigned *addr, MR_Unsigned addend);

// Atomically add the second argument to the memory pointed to by the first
// argument. Unlike the add_and_fetch operations, no result is returned.

MR_EXTERN_INLINE void       MR_atomic_add_int(volatile MR_Integer *addr,
                                MR_Integer addend);
MR_EXTERN_INLINE void       MR_atomic_add_uint(volatile MR_Unsigned *addr,
                                MR_Unsigned addend);

// Atomically subtract the second argument from the memory pointed to by the
// first argument. Only a signed version is declared.

MR_EXTERN_INLINE void       MR_atomic_sub_int(volatile MR_Integer *addr,
                                MR_Integer x);

// Atomically increment (add one to) the word pointed at by the address.

MR_EXTERN_INLINE void       MR_atomic_inc_int(volatile MR_Integer *addr);
MR_EXTERN_INLINE void       MR_atomic_inc_uint(volatile MR_Unsigned *addr);

// Atomically decrement (subtract one from) the word pointed at by the
// address. Only a signed version is declared.

MR_EXTERN_INLINE void       MR_atomic_dec_int(volatile MR_Integer *addr);

// Atomically decrement the word pointed at by the address and return true
// iff it is zero after the decrement.

MR_EXTERN_INLINE MR_bool    MR_atomic_dec_and_is_zero_int(
                                volatile MR_Integer *addr);
MR_EXTERN_INLINE MR_bool    MR_atomic_dec_and_is_zero_uint(
                                volatile MR_Unsigned *addr);

// For information about GCC's builtins for atomic operations see:
// http://gcc.gnu.org/onlinedocs/gcc-4.2.4/gcc/Atomic-Builtins.html

////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////

#if (defined(MR_CLANG) || (MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1))) && \
    !defined(MR_AVOID_COMPILER_INTRINSICS)

    // gcc 4.1 and above, and clang, have builtin atomic operations.
    // The builtin is documented to act as a full memory barrier.

    #define MR_COMPARE_AND_SWAP_WORD_BODY                                   \
        do {                                                                \
            return __sync_bool_compare_and_swap(addr, old, new_val);        \
        } while (0)

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__)

    // Handwritten fallback for when the compiler builtins are unavailable
    // or have been disabled. cmpxchg compares the accumulator (which holds
    // old) with *addr and stores new_val in *addr if they are equal; setz
    // then captures whether the exchange happened.
    //
    // The "memory" clobber makes the statement a compiler barrier, so that
    // ordinary loads and stores are not moved across it. This matches the
    // behaviour of the builtin above, which matters because compare and
    // swap is what acquires the user-space spin locks defined later in
    // this file.

    #define MR_COMPARE_AND_SWAP_WORD_BODY                                   \
        do {                                                                \
            char result;                                                    \
                                                                            \
            __asm__ __volatile__(                                           \
                "lock; cmpxchgq %4, %0; setz %1"                            \
                : "=m"(*addr), "=q"(result), "=a"(old)                      \
                : "m"(*addr), "r" (new_val), "a"(old)                       \
                : "memory", "cc"                                            \
            );                                                              \
            return (MR_bool) result;                                        \
        } while (0)

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

    // Really 486 or better.
    // This is the 32 bit version of the handwritten fallback above; see
    // the comment there for the reason for the "memory" clobber.

    #define MR_COMPARE_AND_SWAP_WORD_BODY                                   \
        do {                                                                \
            char result;                                                    \
                                                                            \
            __asm__ __volatile__(                                           \
                "lock; cmpxchgl %4, %0; setz %1"                            \
                : "=m"(*addr), "=q"(result), "=a"(old)                      \
                : "m"(*addr), "r" (new_val), "a"(old)                       \
                : "memory", "cc"                                            \
                );                                                          \
            return (MR_bool) result;                                        \
        } while (0)

#endif

// The functions are defined only if one of the branches above provided a
// body for them. The same body serves both the signed and the unsigned
// version because both operate on a machine word.

#ifdef MR_COMPARE_AND_SWAP_WORD_BODY
    MR_EXTERN_INLINE MR_bool
    MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
            MR_Integer new_val)
    {
        MR_COMPARE_AND_SWAP_WORD_BODY;
    }

    MR_EXTERN_INLINE MR_bool
    MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
            MR_Unsigned new_val)
    {
        MR_COMPARE_AND_SWAP_WORD_BODY;
    }
#endif

////////////////////////////////////////////////////////////////////////////

// Use the compiler builtin whenever it is available. The test accepts
// clang as well as gcc 4.1 and above, exactly as the test that selects the
// compare and swap builtin does; otherwise clang builds would needlessly
// use the compare and swap loop below.

#if (defined(MR_CLANG) || (MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1))) && \
    !defined(MR_AVOID_COMPILER_INTRINSICS)

    #define MR_ATOMIC_ADD_AND_FETCH_WORD_BODY                               \
        do {                                                                \
            return __sync_add_and_fetch(addr, addend);                      \
        } while (0)

    #define MR_ATOMIC_ADD_AND_FETCH_INT_BODY MR_ATOMIC_ADD_AND_FETCH_WORD_BODY
    #define MR_ATOMIC_ADD_AND_FETCH_UINT_BODY MR_ATOMIC_ADD_AND_FETCH_WORD_BODY

#elif defined(MR_COMPARE_AND_SWAP_WORD_BODY)
    // If there is no builtin for this then it can be implemented in terms
    // of compare and swap, assuming that that has been implemented in
    // assembler for this architecture.
    //
    // Each iteration reads the current value and tries to replace it with
    // that value plus addend. If another thread changed the word in the
    // meantime, the compare and swap fails and we try again.
    //
    // XXX: There is an add and exchange (xadd) instruction on x86, this is
    // better than the CAS loop below.

    #define MR_ATOMIC_ADD_AND_FETCH_INT_BODY                                \
        do {                                                                \
            MR_Integer temp;                                                \
            temp = *addr;                                                   \
            while (!MR_compare_and_swap_int(addr, temp, temp+addend)) {     \
                MR_ATOMIC_PAUSE;                                            \
                temp = *addr;                                               \
            }                                                               \
            return temp+addend;                                             \
        } while (0)

    #define MR_ATOMIC_ADD_AND_FETCH_UINT_BODY                               \
        do {                                                                \
            MR_Unsigned temp;                                               \
            temp = *addr;                                                   \
            while (!MR_compare_and_swap_uint(addr, temp, temp+addend)) {    \
                MR_ATOMIC_PAUSE;                                            \
                temp = *addr;                                               \
            }                                                               \
            return temp+addend;                                             \
        } while (0)

#endif

// Atomically add addend to *addr and return the resulting value.
// Defined only if a body was selected above.

#ifdef MR_ATOMIC_ADD_AND_FETCH_INT_BODY
    MR_EXTERN_INLINE MR_Integer
    MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend)
    {
        MR_ATOMIC_ADD_AND_FETCH_INT_BODY;
    }
#endif

#ifdef MR_ATOMIC_ADD_AND_FETCH_UINT_BODY
    MR_EXTERN_INLINE MR_Unsigned
    MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
    {
        MR_ATOMIC_ADD_AND_FETCH_UINT_BODY;
    }
#endif

////////////////////////////////////////////////////////////////////////////

// Since no result is needed, on x86 a single locked add instruction
// suffices. Both handwritten versions are disabled by
// MR_AVOID_HANDWRITTEN_ASSEMBLER, in which case we fall through to the
// version built on add_and_fetch (when that is available).

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) &&   \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    #define MR_ATOMIC_ADD_WORD_BODY                                         \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; addq %2, %0"                                         \
                : "=m"(*addr)                                               \
                : "m"(*addr), "r"(addend)                                   \
                );                                                          \
        } while (0)

    #define MR_ATOMIC_ADD_INT_BODY MR_ATOMIC_ADD_WORD_BODY
    #define MR_ATOMIC_ADD_UINT_BODY MR_ATOMIC_ADD_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) &&   \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    #define MR_ATOMIC_ADD_WORD_BODY                                         \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; addl %2, %0;"                                        \
                : "=m"(*addr)                                               \
                : "m"(*addr), "r"(addend)                                   \
                );                                                          \
        } while (0)

    #define MR_ATOMIC_ADD_INT_BODY MR_ATOMIC_ADD_WORD_BODY
    #define MR_ATOMIC_ADD_UINT_BODY MR_ATOMIC_ADD_WORD_BODY

#elif defined(MR_ATOMIC_ADD_AND_FETCH_INT_BODY)

    // Use add_and_fetch and discard its result.

    #define MR_ATOMIC_ADD_INT_BODY                                          \
        do {                                                                \
            MR_atomic_add_and_fetch_int(addr, addend);                      \
        } while (0)

    #define MR_ATOMIC_ADD_UINT_BODY                                         \
        do {                                                                \
            MR_atomic_add_and_fetch_uint(addr, addend);                     \
        } while (0)

#endif

// Atomically add addend to *addr.
// Defined only if a body was selected above.

#ifdef MR_ATOMIC_ADD_INT_BODY
    MR_EXTERN_INLINE void
    MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
    {
        MR_ATOMIC_ADD_INT_BODY;
    }
#endif

#ifdef MR_ATOMIC_ADD_UINT_BODY
    MR_EXTERN_INLINE void
    MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
    {
        MR_ATOMIC_ADD_UINT_BODY;
    }
#endif

////////////////////////////////////////////////////////////////////////////

// On x86 a single locked sub instruction suffices. Both handwritten
// versions are disabled by MR_AVOID_HANDWRITTEN_ASSEMBLER. Otherwise, and
// on other architectures, we use the compiler builtin, which is available
// with clang as well as with gcc 4.1 and above.

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) &&   \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    #define MR_ATOMIC_SUB_INT_BODY                                          \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; subq %2, %0;"                                        \
                : "=m"(*addr)                                               \
                : "m"(*addr), "r"(x)                                        \
                );                                                          \
        } while (0)

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) &&   \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    #define MR_ATOMIC_SUB_INT_BODY                                          \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; subl %2, %0;"                                        \
                : "=m"(*addr)                                               \
                : "m"(*addr), "r"(x)                                        \
                );                                                          \
        } while (0)

#elif defined(MR_CLANG) || MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

    // The builtin also returns the new value; we discard it.

    #define MR_ATOMIC_SUB_INT_BODY                                          \
        do {                                                                \
            __sync_sub_and_fetch(addr, x);                                  \
        } while (0)

#endif

// Atomically subtract x from *addr.
// Defined only if a body was selected above.

#ifdef MR_ATOMIC_SUB_INT_BODY
    MR_EXTERN_INLINE void
    MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
    {
        MR_ATOMIC_SUB_INT_BODY;
    }
#endif

////////////////////////////////////////////////////////////////////////////

#if (defined(MR_CLANG) || defined(MR_GNUC)) &&                          \
    (defined(__x86_64__) || defined(__i386__)) &&                       \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    // Handwritten x86 assembler: a single locked increment instruction,
    // in its 64 bit or 32 bit form as appropriate.

  #if defined(__x86_64__)

    #define MR_ATOMIC_INC_WORD_BODY                                         \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; incq %0;"                                            \
                : "=m"(*addr)                                               \
                : "m"(*addr)                                                \
                );                                                          \
        } while (0)

  #else

    // Really 486 or better.
    #define MR_ATOMIC_INC_WORD_BODY                                         \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; incl %0;"                                            \
                : "=m"(*addr)                                               \
                : "m"(*addr)                                                \
                );                                                          \
        } while (0)

  #endif

    // The same body serves both the signed and the unsigned version.
    #define MR_ATOMIC_INC_INT_BODY MR_ATOMIC_INC_WORD_BODY
    #define MR_ATOMIC_INC_UINT_BODY MR_ATOMIC_INC_WORD_BODY

#else

    // No handwritten assembler: increment by atomically adding one.
    //
    // We could fall back to the built-in GCC instructions but they also fetch
    // the value. I believe this is more efficient. pbone

    #define MR_ATOMIC_INC_INT_BODY      MR_atomic_add_int(addr, 1)
    #define MR_ATOMIC_INC_UINT_BODY     MR_atomic_add_uint(addr, 1)

#endif

// Atomically add one to *addr.

#if defined(MR_ATOMIC_INC_INT_BODY)
    MR_EXTERN_INLINE void
    MR_atomic_inc_int(volatile MR_Integer *addr)
    {
        MR_ATOMIC_INC_INT_BODY;
    }
#endif

#if defined(MR_ATOMIC_INC_UINT_BODY)
    MR_EXTERN_INLINE void
    MR_atomic_inc_uint(volatile MR_Unsigned *addr)
    {
        MR_ATOMIC_INC_UINT_BODY;
    }
#endif

////////////////////////////////////////////////////////////////////////////

#if (defined(MR_CLANG) || defined(MR_GNUC)) &&                          \
    (defined(__x86_64__) || defined(__i386__)) &&                       \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    // Handwritten x86 assembler: a single locked decrement instruction,
    // in its 64 bit or 32 bit form as appropriate.

  #if defined(__x86_64__)

    #define MR_ATOMIC_DEC_INT_BODY                                          \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; decq %0;"                                            \
                : "=m"(*addr)                                               \
                : "m"(*addr)                                                \
                );                                                          \
        } while (0)

  #else

    // Really 486 or better.
    #define MR_ATOMIC_DEC_INT_BODY                                          \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; decl %0;"                                            \
                : "=m"(*addr)                                               \
                : "m"(*addr)                                                \
                );                                                          \
        } while (0)

  #endif

#else

    // No handwritten assembler: decrement by atomically subtracting one.

    #define MR_ATOMIC_DEC_INT_BODY      MR_atomic_sub_int(addr, 1)

#endif

// Atomically subtract one from *addr.

#if defined(MR_ATOMIC_DEC_INT_BODY)
    MR_EXTERN_INLINE void
    MR_atomic_dec_int(volatile MR_Integer *addr)
    {
        MR_ATOMIC_DEC_INT_BODY;
    }
#endif

////////////////////////////////////////////////////////////////////////////

// Note that on x86(_64) we have to use the sub instruction rather than the
// dec instruction because we need it to set the CPU flags.

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) &&   \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

// This could be trivially implemented using the __sync_sub_and_fetch compiler
// intrinsic. However on some platforms this could use a compare and exchange
// loop. We can avoid this because we don't need to retrieve the result of the
// subtraction.
//
// The asm statement is marked volatile, like every other asm statement in
// this file, so that the compiler cannot delete or duplicate the decrement.
// The "memory" clobber makes it a compiler barrier, which matches the
// behaviour of the compiler builtin used in the last branch below.

    #define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY                             \
        do {                                                                \
            char is_zero;                                                   \
            __asm__ __volatile__(                                           \
                "lock; subq $1, %0; setz %1"                                \
                : "=m"(*addr), "=q"(is_zero)                                \
                : "m"(*addr)                                                \
                : "memory", "cc"                                            \
                );                                                          \
            return (MR_bool)is_zero;                                        \
        } while (0)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY                          \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
    #define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY                         \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) &&   \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    // The 32 bit version of the above. Like the 64 bit version, it is
    // disabled by MR_AVOID_HANDWRITTEN_ASSEMBLER.

    #define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY                              \
        do {                                                                 \
            char is_zero;                                                    \
            __asm__ __volatile__(                                            \
                "lock; subl $1, %0; setz %1"                                 \
                : "=m"(*addr), "=q"(is_zero)                                 \
                : "m"(*addr)                                                 \
                : "memory", "cc"                                             \
                );                                                           \
            return (MR_bool)is_zero;                                         \
        } while (0)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY                          \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
    #define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY                         \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#elif defined(MR_CLANG) || MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

    // Use the compiler builtin, which is available with clang as well as
    // with gcc 4.1 and above.

    #define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY                             \
        do {                                                                \
            return (__sync_sub_and_fetch(addr, 1) == 0);                    \
        } while (0)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY                          \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
    #define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY                         \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#endif

// Atomically subtract one from *addr and return true iff the result is zero.
// Defined only if a body was selected above.

#ifdef MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY
    MR_EXTERN_INLINE MR_bool
    MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr)
    {
        MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY;
    }
#endif

#ifdef MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY
    MR_EXTERN_INLINE MR_bool
    MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr)
    {
        MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY;
    }
#endif

#endif // MR_THREAD_SAFE

////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////

#ifdef MR_THREAD_SAFE

// Memory fence operations.

#if ( defined(MR_CLANG) || defined(MR_GNUC) ) &&                        \
    ( defined(__i386__) || defined(__x86_64__) ) &&                     \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    // Each fence below has a "memory" clobber. Without it the fence
    // instruction would order the memory operations of the CPU, but the
    // compiler would remain free to move ordinary (non-volatile) loads
    // and stores from one side of the fence to the other, which would
    // defeat the purpose of the fence.

    // Guarantees that any stores executed before this fence are
    // globally visible before those after this fence.

    #define MR_CPU_SFENCE                                                   \
        do {                                                                \
            __asm__ __volatile__("sfence" : : : "memory");                  \
        } while (0)

    // Guarantees that any loads executed before this fence are complete
    // before any loads after this fence.

    #define MR_CPU_LFENCE                                                   \
        do {                                                                \
            __asm__ __volatile__("lfence" : : : "memory");                  \
        } while (0)

    // A combination of the above.

    #define MR_CPU_MFENCE                                                   \
        do {                                                                \
            __asm__ __volatile__("mfence" : : : "memory");                  \
        } while (0)

#elif defined(MR_CLANG) || MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

    // Our memory fences are better than GCC's. GCC only implements a full
    // fence, so here the store and load fences are both full fences.

    #define MR_CPU_MFENCE                                                   \
        do {                                                                \
            __sync_synchronize();                                           \
        } while (0)
    #define MR_CPU_SFENCE MR_CPU_MFENCE
    #define MR_CPU_LFENCE MR_CPU_MFENCE

#else

    // Do not break this string literal; doing so produces an unreadable
    // mess with clang.
    #error "Please implement memory fence operations for this compiler/architecture"

#endif

#endif // MR_THREAD_SAFE

////////////////////////////////////////////////////////////////////////////

#ifdef MR_LL_PARALLEL_CONJ

// Roll our own cheap user-space mutual exclusion locks. Blocking without
// spinning is not supported. Storage for these locks should be volatile.
//
// I expect these to be faster than pthread mutexes when threads are pinned
// and critical sections are short.
//
// A lock is a single machine word. The lock operations below take the lock
// by changing the word from 0 to 1, and release it by storing 0.

typedef MR_Unsigned MR_Us_Lock;

// The value of a lock that is not held.

#define MR_US_LOCK_INITIAL_VALUE (0)

// In each of the macros below, x is a pointer to the lock word. It is
// parenthesized in the expansions so that any pointer-valued expression
// (e.g. `locks + i') can be passed, not just a simple variable name.

// Try to take the lock without waiting. This is an expression whose value
// is true iff the lock was acquired.

#define MR_US_TRY_LOCK(x)                                                   \
    MR_compare_and_swap_uint((x), 0, 1)

// Take the lock, spinning (with MR_ATOMIC_PAUSE between attempts) until
// the compare and swap from 0 to 1 succeeds.

#define MR_US_SPIN_LOCK(x)                                                  \
    do {                                                                    \
        while (!MR_compare_and_swap_uint((x), 0, 1)) {                      \
            MR_ATOMIC_PAUSE;                                                \
        }                                                                   \
    } while (0)

// Release the lock. The fence comes first so that the memory operations
// of the critical section are ordered before the store that releases the
// lock.

#define MR_US_UNLOCK(x)                                                     \
    do {                                                                    \
        MR_CPU_MFENCE;                                                      \
        *(x) = 0;                                                           \
    } while (0)

// Similar support for condition variables. Again, make sure that storage for
// these is declared as volatile.
//
// XXX: These are not atomic. A waiting thread will not see a change until
// sometime after the signaling thread has signaled the condition. The same
// race can occur when clearing a condition. Order of memory operations is not
// guaranteed either.
//
// A condition is a single machine word: the operations below store 1 to
// set it and 0 to clear it, and waiting spins while it is zero.

typedef MR_Unsigned MR_Us_Cond;

// In each of the macros below, x is a pointer to the condition word. It is
// parenthesized in the expansions so that any pointer-valued expression
// (e.g. `conds + i') can be passed, not just a simple variable name.

// Clear the condition: issue a full fence and then store 0.

#define MR_US_COND_CLEAR(x)                                                 \
    do {                                                                    \
        MR_CPU_MFENCE;                                                      \
        *(x) = 0;                                                           \
    } while (0)

// Set the condition: store 1, with a full fence both before and after
// the store.

#define MR_US_COND_SET(x)                                                   \
    do {                                                                    \
        MR_CPU_MFENCE;                                                      \
        *(x) = 1;                                                           \
        MR_CPU_MFENCE;                                                      \
    } while (0)

// Wait for the condition: spin (with MR_ATOMIC_PAUSE between reads) until
// the word is nonzero, and then issue a full fence.

#define MR_US_SPIN_COND(x)                                                  \
    do {                                                                    \
        while (!(*(x))) {                                                   \
            MR_ATOMIC_PAUSE;                                                \
        }                                                                   \
        MR_CPU_MFENCE;                                                      \
    } while (0)

#endif // MR_LL_PARALLEL_CONJ

// If we don't have definitions available for this compiler or architecture
// then we will get a link error in low-level .par grades. No other grades
// currently require any atomic ops.

////////////////////////////////////////////////////////////////////////////

#if defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)

// Declarations for profiling the parallel runtime.

// Statistics gathered about one kind of event.

typedef struct {
    // The total number of times this event occurred is implicitly the sum of
    // the recorded and not_recorded counts.

    volatile MR_Unsigned    MR_stat_count_recorded;
    volatile MR_Unsigned    MR_stat_count_not_recorded;

    // Atomic instructions are used to update these fields, and these fields
    // must be 64 bit to contain the valid ranges of values. However a 32 bit
    // machine cannot (usually) do atomic operations on 64 bit data. Therefore
    // if we have fewer than 64 bits we protect these two fields with a lock.
    //
    // The test below uses the number of low tag bits to decide: having at
    // least three of them implies that words are aligned on eight byte
    // boundaries, i.e. that machine words are 64 bits wide.
    //
    // The sum of squares is used to calculate variance and standard deviation.

  #if MR_LOW_TAG_BITS >= 3
    volatile MR_Integer     MR_stat_sum;
    volatile MR_Unsigned    MR_stat_sum_squares;
  #else
    MR_Us_Lock              MR_stat_sums_lock;
    MR_int_least64_t        MR_stat_sum;
    MR_uint_least64_t       MR_stat_sum_squares;
  #endif
} MR_Stats;

// The state of one timer, as passed to MR_profiling_start_timer and
// MR_profiling_stop_timer.

typedef struct {
    // NOTE(review): presumably the time at which the timer was started,
    // and the id of the processor it was started on; this header does not
    // say, so confirm against the implementation of the two functions
    // named above.
    MR_uint_least64_t   MR_timer_time;
    MR_Unsigned         MR_timer_processor_id;
} MR_Timer;

// The number of CPU clock cycles per second, i.e. a 1GHz CPU will have a
// value of 10^9. The value is zero if the clock speed is unknown.
// This value is only available after MR_do_cpu_feature_detection() has been
// called.

extern MR_uint_least64_t MR_cpu_cycles_per_sec;

// Do CPU feature detection. This is necessary for profiling parallel code
// execution and the threadscope code.
// On i386 and x86_64 machines this uses CPUID to determine if the RDTSCP
// instruction is available and not prohibited by the OS.
// This function is idempotent.

extern void     MR_do_cpu_feature_detection(void);

// Start and initialize a timer structure.

extern void     MR_profiling_start_timer(MR_Timer *timer);

// Stop the timer and update stats with the results.

extern void     MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats);

// Return true iff the TSC works and MR_cpu_cycles_per_sec is nonzero.

extern MR_bool  MR_tsc_is_sensible(void);

// Read the CPU's TSC. This is currently only implemented for i386 and x86-64
// systems. It returns 0 when support is not available.

extern MR_uint_least64_t    MR_read_cpu_tsc(void);

#endif // MR_PROFILE_PARALLEL_EXECUTION_SUPPORT

////////////////////////////////////////////////////////////////////////////

#endif // not MERCURY_ATOMIC_OPS_H