used. Mercury's ThreadScope support will now use gettimeofday() by
default, but use of the TSC may be enabled.
Note that in Linux, gettimeofday() does not always make a system call.

runtime/mercury_threadscope.[ch]:
    Add support for measuring time with gettimeofday().
    Use gettimeofday() to measure time by default.

runtime/mercury_atomic_ops.[ch]:
    Add a new function, MR_tsc_is_sensible(). It returns true if the TSC can
    (as far as the RTS can detect) be used.
    Fix trailing whitespace.

runtime/mercury_wrapper.c:
    Add a new runtime option, --threadscope-use-tsc.
    When specified, this option allows ThreadScope to use the CPU's TSC to
    measure time.

doc/userguide.texi:
    Document the --threadscope-use-tsc option. This documentation is
    commented out.
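
For reference, a minimal sketch of the gettimeofday()-based time measurement
described above (illustrative only; get_nsecs() is a hypothetical helper and
not necessarily how mercury_threadscope.c implements it):

    #include <sys/time.h>
    #include <stdint.h>

    /* Return a wall-clock timestamp in nanoseconds. */
    static uint64_t
    get_nsecs(void)
    {
        struct timeval tv;

        gettimeofday(&tv, NULL);
        return (uint64_t) tv.tv_sec * 1000000000 + (uint64_t) tv.tv_usec * 1000;
    }
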
/*
** vim:ts=4 sw=4 expandtab
*/
/*
** Copyright (C) 2007, 2009-2011 The University of Melbourne.
** This file may only be copied under the terms of the GNU Library General
** Public License - see the file COPYING.LIB in the Mercury distribution.
*/

/*
** mercury_atomic_ops.h - defines atomic operations and other primitives
** used by the parallel runtime.
*/

#ifndef MERCURY_ATOMIC_OPS_H
#define MERCURY_ATOMIC_OPS_H

#include "mercury_std.h"

/*---------------------------------------------------------------------------*/

/*
** Use this to make some storage volatile only when using a threadsafe grade.
*/
#ifdef MR_THREAD_SAFE
  #define MR_THREADSAFE_VOLATILE    volatile
#else
  #define MR_THREADSAFE_VOLATILE
#endif

#if defined(MR_THREAD_SAFE)

/*
 * Intel and AMD support a pause instruction that is roughly equivalent
 * to a no-op. Intel recommends that it be used in spin loops to improve
 * performance. Without a pause instruction, multiple simultaneous
 * read requests for the synchronization variable will be in flight from a
 * single thread. The pause instruction causes these to be executed in
 * sequence, allowing the processor to handle the change in the
 * synchronization variable more easily.
 *
 * On some chips it may cause the spin loop to use less power.
 *
 * This instruction was introduced with the Pentium 4 but is backwards
 * compatible. This works because the two-byte instruction for PAUSE is
 * equivalent to the NOP instruction prefixed by REPE; therefore older
 * processors simply perform a no-op.
 *
 * This is not really an atomic instruction, but we name it
 * MR_ATOMIC_PAUSE for consistency.
 *
 * References: Intel and AMD documentation for PAUSE, and the Intel
 * optimisation guide.
 */

#if ( defined(MR_CLANG) || defined(MR_GNUC) ) && \
    ( defined(__i386__) || defined(__x86_64__) ) && \
    !defined(MR_DO_NOT_USE_CPU_RELAX)

    #define MR_ATOMIC_PAUSE                                                 \
        do {                                                                \
            __asm__ __volatile__("pause");                                  \
        } while(0)

#else

    /* Fall back to a no-op. */
    #define MR_ATOMIC_PAUSE                                                 \
        do {                                                                \
            ;                                                               \
        } while(0)

#endif

/*---------------------------------------------------------------------------*/

/*
** Declarations for inline atomic operations.
**
** These operations work on machine word-sized values, which are distinct
** from C's idea of 'int' and 'unsigned int'. MR_Integer and MR_Unsigned are
** supposed to be machine word sized, so these functions should only be used
** with values of these types.
*/

/*
** If the value at *addr is equal to old, assign new_val to *addr and return
** true. Otherwise return false.
*/
MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
    MR_Integer new_val);
MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
    MR_Unsigned new_val);
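
/*
** Illustrative sketch only (not part of this interface): a typical
** compare-and-swap retry loop for updating a shared word. The variable and
** function names below are hypothetical.
**
**      static volatile MR_Unsigned shared_counter = 0;
**
**      static void
**      add_to_counter(MR_Unsigned amount)
**      {
**          MR_Unsigned old;
**
**          do {
**              old = shared_counter;
**          } while (!MR_compare_and_swap_uint(&shared_counter, old,
**              old + amount));
**      }
*/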

/*
** Atomically add to an integer in memory and retrieve the result. In other
** words, an atomic pre-increment operation.
*/
MR_EXTERN_INLINE MR_Integer
MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend);
MR_EXTERN_INLINE MR_Unsigned
MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend);

/*
** Atomically add the second argument to the memory pointed to by the first
** argument.
*/
MR_EXTERN_INLINE void
MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend);
MR_EXTERN_INLINE void
MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend);

/*
** Atomically subtract the second argument from the memory pointed to by the
** first argument.
*/
MR_EXTERN_INLINE void
MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x);

/*
** Increment the word pointed at by the address.
*/
MR_EXTERN_INLINE void
MR_atomic_inc_int(volatile MR_Integer *addr);
MR_EXTERN_INLINE void
MR_atomic_inc_uint(volatile MR_Unsigned *addr);

/*
** Decrement the word pointed at by the address.
*/
MR_EXTERN_INLINE void
MR_atomic_dec_int(volatile MR_Integer *addr);

/*
** Decrement the integer pointed at by the address and return true iff it is
** zero after the decrement.
*/
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr);
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr);
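
/*
** Illustrative sketch only (not part of this interface): these operations
** support a simple reference-count release pattern, where the thread that
** drops the last reference frees the resource. The names below are
** hypothetical.
**
**      static volatile MR_Unsigned ref_count = 1;
**
**      static void
**      release_ref(void)
**      {
**          if (MR_atomic_dec_and_is_zero_uint(&ref_count)) {
**              free_the_resource();
**          }
**      }
*/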

/*
** For information about GCC's builtins for atomic operations see:
** http://gcc.gnu.org/onlinedocs/gcc-4.2.4/gcc/Atomic-Builtins.html
*/

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if (defined(MR_CLANG) || (MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1))) && \
    !defined(MR_AVOID_COMPILER_INTRINSICS)

    /*
    ** gcc 4.1 and above have builtin atomic operations.
    */
    #define MR_COMPARE_AND_SWAP_WORD_BODY                                   \
        do {                                                                \
            return __sync_bool_compare_and_swap(addr, old, new_val);        \
        } while (0)

#elif defined(MR_GNUC) && defined(__x86_64__)

    #define MR_COMPARE_AND_SWAP_WORD_BODY                                   \
        do {                                                                \
            char result;                                                    \
                                                                            \
            __asm__ __volatile__(                                           \
                "lock; cmpxchgq %4, %0; setz %1"                            \
                : "=m"(*addr), "=q"(result), "=a"(old)                      \
                : "m"(*addr), "r" (new_val), "a"(old)                       \
                );                                                          \
            return (MR_bool) result;                                        \
        } while (0)

#elif defined(MR_GNUC) && defined(__i386__)

    /* Really 486 or better. */
    #define MR_COMPARE_AND_SWAP_WORD_BODY                                   \
        do {                                                                \
            char result;                                                    \
                                                                            \
            __asm__ __volatile__(                                           \
                "lock; cmpxchgl %4, %0; setz %1"                            \
                : "=m"(*addr), "=q"(result), "=a"(old)                      \
                : "m"(*addr), "r" (new_val), "a"(old)                       \
                );                                                          \
            return (MR_bool) result;                                        \
        } while (0)

#endif

#ifdef MR_COMPARE_AND_SWAP_WORD_BODY
MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
    MR_Integer new_val)
{
    MR_COMPARE_AND_SWAP_WORD_BODY;
}

MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
    MR_Unsigned new_val)
{
    MR_COMPARE_AND_SWAP_WORD_BODY;
}
#endif

/*---------------------------------------------------------------------------*/

#if (MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)) && \
    !defined(MR_AVOID_COMPILER_INTRINSICS)

    #define MR_ATOMIC_ADD_AND_FETCH_WORD_BODY                               \
        do {                                                                \
            return __sync_add_and_fetch(addr, addend);                      \
        } while (0)

    #define MR_ATOMIC_ADD_AND_FETCH_INT_BODY MR_ATOMIC_ADD_AND_FETCH_WORD_BODY
    #define MR_ATOMIC_ADD_AND_FETCH_UINT_BODY MR_ATOMIC_ADD_AND_FETCH_WORD_BODY

#elif defined(MR_COMPARE_AND_SWAP_WORD_BODY)

    /*
    ** If there is no GCC builtin for this, then it can be implemented in
    ** terms of compare and swap, assuming that compare and swap has been
    ** implemented in assembler for this architecture.
    **
    ** XXX: There is an add-and-exchange (xadd) instruction on x86; using it
    ** would be better than the CAS loop below.
    */
    #define MR_ATOMIC_ADD_AND_FETCH_INT_BODY                                \
        do {                                                                \
            MR_Integer temp;                                                \
            temp = *addr;                                                   \
            while (!MR_compare_and_swap_int(addr, temp, temp+addend)) {     \
                MR_ATOMIC_PAUSE;                                            \
                temp = *addr;                                               \
            }                                                               \
            return temp+addend;                                             \
        } while (0)

    #define MR_ATOMIC_ADD_AND_FETCH_UINT_BODY                               \
        do {                                                                \
            MR_Unsigned temp;                                               \
            temp = *addr;                                                   \
            while (!MR_compare_and_swap_uint(addr, temp, temp+addend)) {    \
                MR_ATOMIC_PAUSE;                                            \
                temp = *addr;                                               \
            }                                                               \
            return temp+addend;                                             \
        } while (0)

#endif

#ifdef MR_ATOMIC_ADD_AND_FETCH_INT_BODY
MR_EXTERN_INLINE MR_Integer
MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend)
{
    MR_ATOMIC_ADD_AND_FETCH_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_ADD_AND_FETCH_UINT_BODY
MR_EXTERN_INLINE MR_Unsigned
MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
{
    MR_ATOMIC_ADD_AND_FETCH_UINT_BODY;
}
#endif

/*---------------------------------------------------------------------------*/

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    #define MR_ATOMIC_ADD_WORD_BODY                                         \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; addq %2, %0"                                         \
                : "=m"(*addr)                                               \
                : "m"(*addr), "r"(addend)                                   \
                );                                                          \
        } while (0)

    #define MR_ATOMIC_ADD_INT_BODY MR_ATOMIC_ADD_WORD_BODY
    #define MR_ATOMIC_ADD_UINT_BODY MR_ATOMIC_ADD_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

    #define MR_ATOMIC_ADD_WORD_BODY                                         \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; addl %2, %0;"                                        \
                : "=m"(*addr)                                               \
                : "m"(*addr), "r"(addend)                                   \
                );                                                          \
        } while (0)

    #define MR_ATOMIC_ADD_INT_BODY MR_ATOMIC_ADD_WORD_BODY
    #define MR_ATOMIC_ADD_UINT_BODY MR_ATOMIC_ADD_WORD_BODY

#elif defined(MR_ATOMIC_ADD_AND_FETCH_INT_BODY)

    #define MR_ATOMIC_ADD_INT_BODY                                          \
        do {                                                                \
            MR_atomic_add_and_fetch_int(addr, addend);                      \
        } while (0)

    #define MR_ATOMIC_ADD_UINT_BODY                                         \
        do {                                                                \
            MR_atomic_add_and_fetch_uint(addr, addend);                     \
        } while (0)

#endif

#ifdef MR_ATOMIC_ADD_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
{
    MR_ATOMIC_ADD_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_ADD_UINT_BODY
MR_EXTERN_INLINE void
MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
{
    MR_ATOMIC_ADD_UINT_BODY;
}
#endif

/*---------------------------------------------------------------------------*/

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    #define MR_ATOMIC_SUB_INT_BODY                                          \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; subq %2, %0;"                                        \
                : "=m"(*addr)                                               \
                : "m"(*addr), "r"(x)                                        \
                );                                                          \
        } while (0)

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

    #define MR_ATOMIC_SUB_INT_BODY                                          \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; subl %2, %0;"                                        \
                : "=m"(*addr)                                               \
                : "m"(*addr), "r"(x)                                        \
                );                                                          \
        } while (0)

#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

    #define MR_ATOMIC_SUB_INT_BODY                                          \
        do {                                                                \
            __sync_sub_and_fetch(addr, x);                                  \
        } while (0)

#endif

#ifdef MR_ATOMIC_SUB_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
{
    MR_ATOMIC_SUB_INT_BODY;
}
#endif

/*---------------------------------------------------------------------------*/

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    #define MR_ATOMIC_INC_WORD_BODY                                         \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; incq %0;"                                            \
                : "=m"(*addr)                                               \
                : "m"(*addr)                                                \
                );                                                          \
        } while (0)

    #define MR_ATOMIC_INC_INT_BODY MR_ATOMIC_INC_WORD_BODY
    #define MR_ATOMIC_INC_UINT_BODY MR_ATOMIC_INC_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    /* Really 486 or better. */
    #define MR_ATOMIC_INC_WORD_BODY                                         \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; incl %0;"                                            \
                : "=m"(*addr)                                               \
                : "m"(*addr)                                                \
                );                                                          \
        } while (0)

    #define MR_ATOMIC_INC_INT_BODY MR_ATOMIC_INC_WORD_BODY
    #define MR_ATOMIC_INC_UINT_BODY MR_ATOMIC_INC_WORD_BODY

#else

    /*
    ** Fall back to an atomic add 1 operation.
    **
    ** We could fall back to the built-in GCC instructions but they also fetch
    ** the value. I believe this is more efficient.
    **    - pbone
    */
    #define MR_ATOMIC_INC_INT_BODY                                          \
        MR_atomic_add_int(addr, 1)
    #define MR_ATOMIC_INC_UINT_BODY                                         \
        MR_atomic_add_uint(addr, 1)

#endif

#ifdef MR_ATOMIC_INC_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_inc_int(volatile MR_Integer *addr)
{
    MR_ATOMIC_INC_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_INC_UINT_BODY
MR_EXTERN_INLINE void
MR_atomic_inc_uint(volatile MR_Unsigned *addr)
{
    MR_ATOMIC_INC_UINT_BODY;
}
#endif

/*---------------------------------------------------------------------------*/

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    #define MR_ATOMIC_DEC_INT_BODY                                          \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; decq %0;"                                            \
                : "=m"(*addr)                                               \
                : "m"(*addr)                                                \
                );                                                          \
        } while (0)

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    /* Really 486 or better. */
    #define MR_ATOMIC_DEC_INT_BODY                                          \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; decl %0;"                                            \
                : "=m"(*addr)                                               \
                : "m"(*addr)                                                \
                );                                                          \
        } while (0)

#else

    /*
    ** Fall back to an atomic subtract 1 operation.
    */
    #define MR_ATOMIC_DEC_INT_BODY                                          \
        MR_atomic_sub_int(addr, 1)

#endif

#ifdef MR_ATOMIC_DEC_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_dec_int(volatile MR_Integer *addr)
{
    MR_ATOMIC_DEC_INT_BODY;
}
#endif

/*---------------------------------------------------------------------------*/

/*
** Note that on x86(_64) we have to use the sub instruction rather than the
** dec instruction because we need it to set the CPU flags.
*/
#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    /*
    ** This could be trivially implemented using the __sync_sub_and_fetch
    ** compiler intrinsic. However, on some platforms that could use a
    ** compare-and-exchange loop, which we can avoid because we do not need
    ** to retrieve the result of the subtraction.
    */
    #define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY                             \
        do {                                                                \
            char is_zero;                                                   \
            __asm__(                                                        \
                "lock; subq $1, %0; setz %1"                                \
                : "=m"(*addr), "=q"(is_zero)                                \
                : "m"(*addr)                                                \
                );                                                          \
            return (MR_bool)is_zero;                                        \
        } while (0)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY                              \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
    #define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY                             \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY                             \
        do {                                                                \
            char is_zero;                                                   \
            __asm__(                                                        \
                "lock; subl $1, %0; setz %1"                                \
                : "=m"(*addr), "=q"(is_zero)                                \
                : "m"(*addr)                                                \
                );                                                          \
            return (MR_bool)is_zero;                                        \
        } while (0)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY                              \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
    #define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY                             \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY                             \
        do {                                                                \
            return (__sync_sub_and_fetch(addr, 1) == 0);                    \
        } while (0)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY                              \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
    #define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY                             \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#endif

#ifdef MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr)
{
    MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr)
{
    MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY;
}
#endif

#endif /* MR_THREAD_SAFE */

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#ifdef MR_THREAD_SAFE

/*
** Memory fence operations.
*/
#if ( defined(MR_CLANG) || defined(MR_GNUC) ) && \
    ( defined(__i386__) || defined(__x86_64__) ) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    /*
    ** Guarantees that any stores executed before this fence are
    ** globally visible before those after this fence.
    */
    #define MR_CPU_SFENCE                                                   \
        do {                                                                \
            __asm__ __volatile__("sfence");                                 \
        } while(0)

    /*
    ** Guarantees that any loads executed before this fence are complete
    ** before any loads after this fence.
    */
    #define MR_CPU_LFENCE                                                   \
        do {                                                                \
            __asm__ __volatile__("lfence");                                 \
        } while(0)

    /*
    ** A combination of the above.
    */
    #define MR_CPU_MFENCE                                                   \
        do {                                                                \
            __asm__ __volatile__("mfence");                                 \
        } while(0)

#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

    /*
    ** Our memory fences are better than GCC's. GCC only implements a full
    ** fence.
    */
    #define MR_CPU_MFENCE                                                   \
        do {                                                                \
            __sync_synchronize();                                           \
        } while(0)
    #define MR_CPU_SFENCE MR_CPU_MFENCE
    #define MR_CPU_LFENCE MR_CPU_MFENCE

#else

    #error "Please implement memory fence operations for this compiler/architecture"

#endif
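
/*
** Illustrative sketch only (not part of this interface): a typical use of
** the store fence is to publish data before setting a ready flag, so that a
** consumer that observes the flag also observes the data. The variable
** names below are hypothetical.
**
**      shared_payload = computed_value;
**      MR_CPU_SFENCE;      (make the payload visible before the flag)
**      payload_ready = 1;
*/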

#endif /* MR_THREAD_SAFE */

/*---------------------------------------------------------------------------*/

#ifdef MR_LL_PARALLEL_CONJ

/*
** Roll our own cheap user-space mutual exclusion locks. Blocking without
** spinning is not supported. Storage for these locks should be volatile.
**
** I expect these to be faster than pthread mutexes when threads are pinned
** and critical sections are short.
*/
typedef MR_Unsigned MR_Us_Lock;

#define MR_US_LOCK_INITIAL_VALUE (0)

#define MR_US_TRY_LOCK(x)                                                   \
    MR_compare_and_swap_uint(x, 0, 1)

#define MR_US_SPIN_LOCK(x)                                                  \
    do {                                                                    \
        while (!MR_compare_and_swap_uint(x, 0, 1)) {                        \
            MR_ATOMIC_PAUSE;                                                \
        }                                                                   \
    } while (0)

#define MR_US_UNLOCK(x)                                                     \
    do {                                                                    \
        MR_CPU_MFENCE;                                                      \
        *x = 0;                                                             \
    } while (0)
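
/*
** Illustrative sketch only (not part of this interface): guarding a short
** critical section with one of these locks. The variable names below are
** hypothetical; note that the lock storage is declared volatile, as
** recommended above.
**
**      static volatile MR_Us_Lock  queue_lock = MR_US_LOCK_INITIAL_VALUE;
**
**      MR_US_SPIN_LOCK(&queue_lock);
**      ... short critical section ...
**      MR_US_UNLOCK(&queue_lock);
*/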

/*
** Similar support for condition variables. Again, make sure that storage for
** these is declared as volatile.
**
** XXX: These are not atomic. A waiting thread will not see a change until
** sometime after the signaling thread has signaled the condition. The same
** race can occur when clearing a condition. Order of memory operations is
** not guaranteed either.
*/
typedef MR_Unsigned MR_Us_Cond;

#define MR_US_COND_CLEAR(x)                                                 \
    do {                                                                    \
        MR_CPU_MFENCE;                                                      \
        *x = 0;                                                             \
    } while (0)

#define MR_US_COND_SET(x)                                                   \
    do {                                                                    \
        MR_CPU_MFENCE;                                                      \
        *x = 1;                                                             \
        MR_CPU_MFENCE;                                                      \
    } while (0)

#define MR_US_SPIN_COND(x)                                                  \
    do {                                                                    \
        while (!(*x)) {                                                     \
            MR_ATOMIC_PAUSE;                                                \
        }                                                                   \
        MR_CPU_MFENCE;                                                      \
    } while (0)
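
/*
** Illustrative sketch only (not part of this interface): one thread sets
** the condition once some shared state is ready, while another spins until
** it observes it. The variable names below are hypothetical.
**
**      static volatile MR_Us_Cond  work_ready = 0;
**
**      producer:   MR_US_COND_SET(&work_ready);
**      consumer:   MR_US_SPIN_COND(&work_ready);
*/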

#endif /* MR_LL_PARALLEL_CONJ */

/*
** If we don't have definitions available for this compiler or architecture
** then we will get a link error in low-level .par grades. No other grades
** currently require any atomic ops.
*/

/*---------------------------------------------------------------------------*/

#if defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)

/*
** Declarations for profiling the parallel runtime.
*/

typedef struct {
    /*
    ** The total number of times this event occurred is implicitly the sum of
    ** the recorded and not_recorded counts.
    */
    volatile MR_Unsigned    MR_stat_count_recorded;
    volatile MR_Unsigned    MR_stat_count_not_recorded;

    /*
    ** Atomic instructions are used to update these fields, and these fields
    ** must be 64 bit to contain the valid ranges of values. However, a 32 bit
    ** machine cannot (usually) do atomic operations on 64 bit data. Therefore,
    ** if we have fewer than 64 bits we protect these two fields with a lock.
    **
    ** The sum of squares is used to calculate variance and standard deviation.
    */
#if MR_LOW_TAG_BITS >= 3
    volatile MR_Integer     MR_stat_sum;
    volatile MR_Unsigned    MR_stat_sum_squares;
#else
    MR_Us_Lock              MR_stat_sums_lock;
    MR_int_least64_t        MR_stat_sum;
    MR_uint_least64_t       MR_stat_sum_squares;
#endif
} MR_Stats;

typedef struct {
    MR_uint_least64_t       MR_timer_time;
    MR_Unsigned             MR_timer_processor_id;
} MR_Timer;

/*
** The number of CPU clock cycles per second, i.e. a 1GHz CPU will have a
** value of 10^9; zero if unknown.
** This value is only available after MR_do_cpu_feature_detection() has been
** called.
*/
extern MR_uint_least64_t MR_cpu_cycles_per_sec;

/*
** Do CPU feature detection. This is necessary for profiling parallel code
** execution and the threadscope code.
** On i386 and x86_64 machines this uses CPUID to determine if the RDTSCP
** instruction is available and not prohibited by the OS.
** This function is idempotent.
*/
extern void
MR_do_cpu_feature_detection(void);

/*
** Start and initialize a timer structure.
*/
extern void
MR_profiling_start_timer(MR_Timer *timer);

/*
** Stop the timer and update stats with the results.
*/
extern void
MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats);
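
/*
** Illustrative sketch only (not part of this interface): timing a region of
** code and accumulating the result into a statistics structure. The names
** below are hypothetical.
**
**      static MR_Stats     region_stats;
**
**      MR_Timer            timer;
**
**      MR_profiling_start_timer(&timer);
**      ... the work being measured ...
**      MR_profiling_stop_timer(&timer, &region_stats);
*/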

/*
** Return true if the TSC works and MR_cpu_cycles_per_sec is nonzero.
*/
extern MR_bool
MR_tsc_is_sensible(void);

/*
** Read the CPU's TSC. This is currently only implemented for i386 and x86-64
** systems. It returns 0 when support is not available.
*/
extern MR_uint_least64_t
MR_read_cpu_tsc(void);

#endif /* MR_PROFILE_PARALLEL_EXECUTION_SUPPORT */

/*---------------------------------------------------------------------------*/

#endif /* not MERCURY_ATOMIC_OPS_H */