mercury/runtime/mercury_atomic_ops.h

// vim: ts=4 sw=4 expandtab ft=c
// Copyright (C) 2007, 2009-2011 The University of Melbourne.
// Copyright (C) 2016, 2018 The Mercury team.
// This file is distributed under the terms specified in COPYING.LIB.
// mercury_atomic_ops.h - defines atomic operations and other primitives
// used by the parallel runtime.
#ifndef MERCURY_ATOMIC_OPS_H
#define MERCURY_ATOMIC_OPS_H
#include "mercury_std.h"
////////////////////////////////////////////////////////////////////////////
// Use this to make some storage volatile only when using a threadsafe grade.
#ifdef MR_THREAD_SAFE
#define MR_THREADSAFE_VOLATILE volatile
#else
#define MR_THREADSAFE_VOLATILE
#endif
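// A minimal illustrative sketch (hypothetical variable, not part of this
// header, and kept under "#if 0" so that it is never compiled): storage
// that is volatile only in threadsafe grades.
#if 0
static MR_THREADSAFE_VOLATILE MR_Unsigned example_ready_flag = 0;
#endif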
#if defined(MR_THREAD_SAFE)
// Intel and AMD support a pause instruction that is roughly equivalent
// to a no-op. Intel recommends using it in spin loops to improve
// performance. Without a pause instruction, a spinning thread keeps
// multiple simultaneous read requests for the synchronization variable
// in flight; the pause instruction causes these reads to be issued in
// sequence, allowing the processor to handle a change in the
// synchronization variable more easily.
//
// On some chips it may also cause the spin loop to use less power.
//
// This instruction was introduced with the Pentium 4 but is backwards
// compatible: the two-byte encoding of PAUSE is the NOP instruction
// prefixed by REPE, so older processors simply execute a no-op.
//
// This is not really an atomic instruction, but we name it MR_ATOMIC_PAUSE
// for consistency.
//
// References: Intel and AMD documentation for PAUSE, Intel optimisation
// guide.
#if ( defined(MR_CLANG) || defined(MR_GNUC) ) && \
( defined(__i386__) || defined(__x86_64__) ) && \
!defined(MR_DO_NOT_USE_CPU_RELAX)
#define MR_ATOMIC_PAUSE \
do { \
__asm__ __volatile__("pause"); \
} while (0)
#else
// Fall back to a no-op
#define MR_ATOMIC_PAUSE \
do { \
; \
} while (0)
#endif
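// An illustrative sketch of the intended use of MR_ATOMIC_PAUSE
// (hypothetical function, not compiled): a busy-wait loop that polls a
// flag. MR_US_SPIN_LOCK below is a real user of this pattern.
#if 0
static void
example_spin_until_set(MR_THREADSAFE_VOLATILE MR_Unsigned *flag)
{
    while (*flag == 0) {
        MR_ATOMIC_PAUSE;
    }
}
#endif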
////////////////////////////////////////////////////////////////////////////
// Declarations for inline atomic operations.
//
// These operations work on machine-word-sized values; this is distinct from
// C's idea of 'int' and 'unsigned int'. MR_Integer and MR_Unsigned are
// intended to be machine word sized, so these functions should only be used
// with values of those types.
// If the value at addr is equal to old, assign new_val to *addr and return
// true. Otherwise return false.
MR_EXTERN_INLINE MR_bool MR_compare_and_swap_int(
volatile MR_Integer *addr,
MR_Integer old, MR_Integer new_val);
MR_EXTERN_INLINE MR_bool MR_compare_and_swap_uint(
volatile MR_Unsigned *addr,
MR_Unsigned old, MR_Unsigned new_val);
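// An illustrative sketch (hypothetical helper, not compiled): using
// compare-and-swap so that exactly one thread claims a slot that starts
// out as zero.
#if 0
static MR_bool
example_claim_slot(volatile MR_Unsigned *slot, MR_Unsigned my_id)
{
    // Only the first caller to find the slot still zero succeeds.
    return MR_compare_and_swap_uint(slot, 0, my_id);
}
#endif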
// Atomically add to an integer in memory and retrieve the result; in other
// words, an atomic pre-increment operation.
MR_EXTERN_INLINE MR_Integer MR_atomic_add_and_fetch_int(
volatile MR_Integer *addr, MR_Integer addend);
MR_EXTERN_INLINE MR_Unsigned MR_atomic_add_and_fetch_uint(
volatile MR_Unsigned *addr, MR_Unsigned addend);
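// An illustrative sketch (hypothetical helper, not compiled): handing out
// unique consecutive identifiers by using the returned (post-addition)
// value.
#if 0
static MR_Unsigned
example_next_id(volatile MR_Unsigned *counter)
{
    // With *counter initially 0, the first caller gets 1, the next 2, ...
    return MR_atomic_add_and_fetch_uint(counter, 1);
}
#endif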
// Atomically add the second argument to the memory pointed to by the first
// argument.
MR_EXTERN_INLINE void MR_atomic_add_int(volatile MR_Integer *addr,
MR_Integer addend);
MR_EXTERN_INLINE void MR_atomic_add_uint(volatile MR_Unsigned *addr,
MR_Unsigned addend);
// Atomically subtract the second argument from the memory pointed to by the
// first argument.
MR_EXTERN_INLINE void MR_atomic_sub_int(volatile MR_Integer *addr,
MR_Integer x);
// Increment the word pointed at by the address.
MR_EXTERN_INLINE void MR_atomic_inc_int(volatile MR_Integer *addr);
MR_EXTERN_INLINE void MR_atomic_inc_uint(volatile MR_Unsigned *addr);
// Decrement the word pointed at by the address.
MR_EXTERN_INLINE void MR_atomic_dec_int(volatile MR_Integer *addr);
// Decrement the integer pointed at by the address and return true iff it is
// zero after the decrement.
MR_EXTERN_INLINE MR_bool MR_atomic_dec_and_is_zero_int(
volatile MR_Integer *addr);
MR_EXTERN_INLINE MR_bool MR_atomic_dec_and_is_zero_uint(
volatile MR_Unsigned *addr);
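// An illustrative sketch of reference counting (hypothetical helper, not
// compiled): only the thread that sees the count reach zero frees the
// object.
#if 0
static void
example_release(volatile MR_Integer *ref_count, void *object)
{
    if (MR_atomic_dec_and_is_zero_int(ref_count)) {
        free(object);   // would require <stdlib.h>
    }
}
#endif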
// For information about GCC's builtins for atomic operations see:
// http://gcc.gnu.org/onlinedocs/gcc-4.2.4/gcc/Atomic-Builtins.html
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
#if (defined(MR_CLANG) || (MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1))) && \
!defined(MR_AVOID_COMPILER_INTRINSICS)
// GCC 4.1 and above (and clang) provide builtin atomic operations.
#define MR_COMPARE_AND_SWAP_WORD_BODY \
do { \
return __sync_bool_compare_and_swap(addr, old, new_val); \
} while (0)
#elif defined(MR_GNUC) && defined(__x86_64__)
#define MR_COMPARE_AND_SWAP_WORD_BODY \
do { \
char result; \
\
__asm__ __volatile__( \
"lock; cmpxchgq %4, %0; setz %1" \
: "=m"(*addr), "=q"(result), "=a"(old) \
: "m"(*addr), "r" (new_val), "a"(old) \
); \
return (MR_bool) result; \
} while (0)
#elif defined(MR_GNUC) && defined(__i386__)
// Really 486 or better.
#define MR_COMPARE_AND_SWAP_WORD_BODY \
do { \
char result; \
\
__asm__ __volatile__( \
"lock; cmpxchgl %4, %0; setz %1" \
: "=m"(*addr), "=q"(result), "=a"(old) \
: "m"(*addr), "r" (new_val), "a"(old) \
); \
return (MR_bool) result; \
} while (0)
#endif
#ifdef MR_COMPARE_AND_SWAP_WORD_BODY
MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
MR_Integer new_val)
{
MR_COMPARE_AND_SWAP_WORD_BODY;
}
MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
MR_Unsigned new_val)
{
MR_COMPARE_AND_SWAP_WORD_BODY;
}
#endif
////////////////////////////////////////////////////////////////////////////
#if (MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)) && \
!defined(MR_AVOID_COMPILER_INTRINSICS)
#define MR_ATOMIC_ADD_AND_FETCH_WORD_BODY \
do { \
return __sync_add_and_fetch(addr, addend); \
} while (0)
#define MR_ATOMIC_ADD_AND_FETCH_INT_BODY MR_ATOMIC_ADD_AND_FETCH_WORD_BODY
#define MR_ATOMIC_ADD_AND_FETCH_UINT_BODY MR_ATOMIC_ADD_AND_FETCH_WORD_BODY
#elif defined(MR_COMPARE_AND_SWAP_WORD_BODY)
// If there is no GCC builtin for this, then it can be implemented in terms
// of compare-and-swap, assuming that compare-and-swap has been implemented
// in assembler for this architecture.
//
// XXX: x86 has an exchange-and-add (xadd) instruction, which would be
// better than the CAS loop below.
#define MR_ATOMIC_ADD_AND_FETCH_INT_BODY \
do { \
MR_Integer temp; \
temp = *addr; \
while (!MR_compare_and_swap_int(addr, temp, temp+addend)) { \
MR_ATOMIC_PAUSE; \
temp = *addr; \
} \
return temp+addend; \
} while (0)
#define MR_ATOMIC_ADD_AND_FETCH_UINT_BODY \
do { \
MR_Unsigned temp; \
temp = *addr; \
while (!MR_compare_and_swap_uint(addr, temp, temp+addend)) { \
MR_ATOMIC_PAUSE; \
temp = *addr; \
} \
return temp+addend; \
} while (0)
#endif
#ifdef MR_ATOMIC_ADD_AND_FETCH_INT_BODY
MR_EXTERN_INLINE MR_Integer
MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend)
{
MR_ATOMIC_ADD_AND_FETCH_INT_BODY;
}
#endif
#ifdef MR_ATOMIC_ADD_AND_FETCH_UINT_BODY
MR_EXTERN_INLINE MR_Unsigned
MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
{
MR_ATOMIC_ADD_AND_FETCH_UINT_BODY;
}
#endif
////////////////////////////////////////////////////////////////////////////
#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
!defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)
#define MR_ATOMIC_ADD_WORD_BODY \
do { \
__asm__ __volatile__( \
"lock; addq %2, %0" \
: "=m"(*addr) \
: "m"(*addr), "r"(addend) \
); \
} while (0)
#define MR_ATOMIC_ADD_INT_BODY MR_ATOMIC_ADD_WORD_BODY
#define MR_ATOMIC_ADD_UINT_BODY MR_ATOMIC_ADD_WORD_BODY
#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)
#define MR_ATOMIC_ADD_WORD_BODY \
do { \
__asm__ __volatile__( \
"lock; addl %2, %0;" \
: "=m"(*addr) \
: "m"(*addr), "r"(addend) \
); \
} while (0)
#define MR_ATOMIC_ADD_INT_BODY MR_ATOMIC_ADD_WORD_BODY
#define MR_ATOMIC_ADD_UINT_BODY MR_ATOMIC_ADD_WORD_BODY
#elif defined(MR_ATOMIC_ADD_AND_FETCH_INT_BODY)
#define MR_ATOMIC_ADD_INT_BODY \
do { \
MR_atomic_add_and_fetch_int(addr, addend); \
} while (0)
#define MR_ATOMIC_ADD_UINT_BODY \
do { \
MR_atomic_add_and_fetch_uint(addr, addend); \
} while (0)
#endif
#ifdef MR_ATOMIC_ADD_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
{
MR_ATOMIC_ADD_INT_BODY;
}
#endif
#ifdef MR_ATOMIC_ADD_UINT_BODY
MR_EXTERN_INLINE void
MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
{
MR_ATOMIC_ADD_UINT_BODY;
}
#endif
////////////////////////////////////////////////////////////////////////////
#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
!defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)
#define MR_ATOMIC_SUB_INT_BODY \
do { \
__asm__ __volatile__( \
"lock; subq %2, %0;" \
: "=m"(*addr) \
: "m"(*addr), "r"(x) \
); \
} while (0)
#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)
#define MR_ATOMIC_SUB_INT_BODY \
do { \
__asm__ __volatile__( \
"lock; subl %2, %0;" \
: "=m"(*addr) \
: "m"(*addr), "r"(x) \
); \
} while (0)
#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)
#define MR_ATOMIC_SUB_INT_BODY \
do { \
__sync_sub_and_fetch(addr, x); \
} while (0)
#endif
#ifdef MR_ATOMIC_SUB_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
{
MR_ATOMIC_SUB_INT_BODY;
}
#endif
////////////////////////////////////////////////////////////////////////////
#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
!defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)
#define MR_ATOMIC_INC_WORD_BODY \
do { \
__asm__ __volatile__( \
"lock; incq %0;" \
: "=m"(*addr) \
: "m"(*addr) \
); \
} while (0)
#define MR_ATOMIC_INC_INT_BODY MR_ATOMIC_INC_WORD_BODY
#define MR_ATOMIC_INC_UINT_BODY MR_ATOMIC_INC_WORD_BODY
#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) && \
!defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)
// Really 486 or better.
#define MR_ATOMIC_INC_WORD_BODY \
do { \
__asm__ __volatile__( \
"lock; incl %0;" \
: "=m"(*addr) \
: "m"(*addr) \
); \
} while (0)
#define MR_ATOMIC_INC_INT_BODY MR_ATOMIC_INC_WORD_BODY
#define MR_ATOMIC_INC_UINT_BODY MR_ATOMIC_INC_WORD_BODY
#else
// Fall back to an atomic add of 1.
//
// We could fall back to the GCC builtins, but they also fetch the value;
// I believe this is more efficient. -pbone
#define MR_ATOMIC_INC_INT_BODY \
MR_atomic_add_int(addr, 1)
#define MR_ATOMIC_INC_UINT_BODY \
MR_atomic_add_uint(addr, 1)
#endif
#ifdef MR_ATOMIC_INC_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_inc_int(volatile MR_Integer *addr)
{
MR_ATOMIC_INC_INT_BODY;
}
#endif
#ifdef MR_ATOMIC_INC_UINT_BODY
MR_EXTERN_INLINE void
MR_atomic_inc_uint(volatile MR_Unsigned *addr)
{
MR_ATOMIC_INC_UINT_BODY;
}
#endif
////////////////////////////////////////////////////////////////////////////
#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
!defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)
#define MR_ATOMIC_DEC_INT_BODY \
do { \
__asm__ __volatile__( \
"lock; decq %0;" \
: "=m"(*addr) \
: "m"(*addr) \
); \
} while (0)
#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) && \
!defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)
// Really 486 or better.
#define MR_ATOMIC_DEC_INT_BODY \
do { \
__asm__ __volatile__( \
"lock; decl %0;" \
: "=m"(*addr) \
: "m"(*addr) \
); \
} while (0)
#else
// Fall back to an atomic subtract 1 operation.
#define MR_ATOMIC_DEC_INT_BODY \
MR_atomic_sub_int(addr, 1)
#endif
#ifdef MR_ATOMIC_DEC_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_dec_int(volatile MR_Integer *addr)
{
MR_ATOMIC_DEC_INT_BODY;
}
#endif
////////////////////////////////////////////////////////////////////////////
// Note that on x86(_64) we have to use the sub instruction rather than the
// dec instruction because we need it to set the CPU flags.
#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
!defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)
// This could be trivially implemented using the __sync_sub_and_fetch
// compiler intrinsic. However, on some platforms that intrinsic may be
// implemented as a compare-and-exchange loop; we can avoid that here
// because we do not need the result of the subtraction, only the zero flag.
#define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY \
do { \
char is_zero; \
__asm__( \
"lock; subq $1, %0; setz %1" \
: "=m"(*addr), "=q"(is_zero) \
: "m"(*addr) \
); \
return (MR_bool)is_zero; \
} while (0)
#define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY \
MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
#define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY \
MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)
#define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY \
do { \
char is_zero; \
__asm__( \
"lock; subl $1, %0; setz %1" \
: "=m"(*addr), "=q"(is_zero) \
: "m"(*addr) \
); \
return (MR_bool)is_zero; \
} while (0)
#define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY \
MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
#define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY \
MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)
#define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY \
do { \
return (__sync_sub_and_fetch(addr, 1) == 0); \
} while (0)
#define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY \
MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
#define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY \
MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
#endif
#ifdef MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr)
{
MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY;
}
#endif
#ifdef MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr)
{
MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY;
}
#endif
#endif // MR_THREAD_SAFE
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
#ifdef MR_THREAD_SAFE
// Memory fence operations.
#if ( defined(MR_CLANG) || defined(MR_GNUC) ) && \
( defined(__i386__) || defined(__x86_64__) ) && \
!defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)
// Guarantees that any stores executed before this fence are
// globally visible before those after this fence.
#define MR_CPU_SFENCE \
do { \
__asm__ __volatile__("sfence"); \
} while (0)
// Guarantees that any loads executed before this fence are complete
// before any loads after this fence.
#define MR_CPU_LFENCE \
do { \
__asm__ __volatile__("lfence"); \
} while (0)
// A combination of the above.
#define MR_CPU_MFENCE \
do { \
__asm__ __volatile__("mfence"); \
} while (0)
#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)
// GCC's __sync_synchronize builtin provides only a full fence, so the
// store and load fences fall back to it here; the hand-written fences
// above are finer grained.
#define MR_CPU_MFENCE \
do { \
__sync_synchronize(); \
} while (0)
#define MR_CPU_SFENCE MR_CPU_MFENCE
#define MR_CPU_LFENCE MR_CPU_MFENCE
#else
#error "Please implement memory fence operations for this compiler/architecture"
#endif
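// An illustrative sketch of pairing the store and load fences
// (hypothetical producer/consumer, not compiled): the producer makes the
// payload globally visible before the flag, and the consumer orders its
// reads to match.
#if 0
static MR_Unsigned                          example_payload;
static MR_THREADSAFE_VOLATILE MR_Unsigned   example_published = 0;

static void
example_publish(MR_Unsigned value)
{
    example_payload = value;
    MR_CPU_SFENCE;              // Payload is visible before the flag.
    example_published = 1;
}

static MR_Unsigned
example_consume(void)
{
    while (example_published == 0) {
        MR_ATOMIC_PAUSE;
    }
    MR_CPU_LFENCE;              // Do not read the payload early.
    return example_payload;
}
#endif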
#endif // MR_THREAD_SAFE
////////////////////////////////////////////////////////////////////////////
#ifdef MR_LL_PARALLEL_CONJ
// Roll our own cheap user-space mutual exclusion locks. Blocking without
// spinning is not supported. Storage for these locks should be volatile.
//
// I expect these to be faster than pthread mutexes when threads are pinned
// and critical sections are short.
typedef MR_Unsigned MR_Us_Lock;
#define MR_US_LOCK_INITIAL_VALUE (0)
#define MR_US_TRY_LOCK(x) \
MR_compare_and_swap_uint(x, 0, 1)
#define MR_US_SPIN_LOCK(x) \
do { \
while (!MR_compare_and_swap_uint(x, 0, 1)) { \
MR_ATOMIC_PAUSE; \
} \
} while (0)
#define MR_US_UNLOCK(x) \
do { \
MR_CPU_MFENCE; \
*x = 0; \
} while (0)
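// An illustrative sketch of guarding a short critical section
// (hypothetical lock and counter, not compiled). As noted above, the lock
// storage is volatile.
#if 0
static MR_THREADSAFE_VOLATILE MR_Us_Lock    example_lock =
                                                MR_US_LOCK_INITIAL_VALUE;
static MR_Unsigned                          example_counter = 0;

static void
example_bump_counter(void)
{
    MR_US_SPIN_LOCK(&example_lock);
    example_counter++;
    MR_US_UNLOCK(&example_lock);
}
#endif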
// Similar support for condition variables. Again, make sure that storage for
// these is declared as volatile.
//
// XXX: These are not atomic: a waiting thread may not see a change until
// some time after the signalling thread has signalled the condition, and
// the same race can occur when clearing a condition. The order of memory
// operations is not guaranteed either.
typedef MR_Unsigned MR_Us_Cond;
#define MR_US_COND_CLEAR(x) \
do { \
MR_CPU_MFENCE; \
*x = 0; \
} while (0)
#define MR_US_COND_SET(x) \
do { \
MR_CPU_MFENCE; \
*x = 1; \
MR_CPU_MFENCE; \
} while (0)
#define MR_US_SPIN_COND(x) \
do { \
while (!(*x)) { \
MR_ATOMIC_PAUSE; \
} \
MR_CPU_MFENCE; \
} while (0)
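// An illustrative sketch (hypothetical condition, not compiled): one thread
// announces that initialisation is complete and another waits for it. The
// XXX caveats above about atomicity and ordering apply.
#if 0
static MR_THREADSAFE_VOLATILE MR_Us_Cond    example_init_done = 0;

static void
example_announce_init_done(void)
{
    MR_US_COND_SET(&example_init_done);
}

static void
example_wait_for_init(void)
{
    MR_US_SPIN_COND(&example_init_done);
}
#endif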
#endif // MR_LL_PARALLEL_CONJ
// If we don't have definitions available for this compiler or architecture
// then we will get a link error in low-level .par grades. No other grades
// currently require any atomic ops.
////////////////////////////////////////////////////////////////////////////
#if defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)
// Declarations for profiling the parallel runtime.
typedef struct {
// The total number of times this event occurred is implicitly the sum of
// the recorded and not_recorded counts.
volatile MR_Unsigned MR_stat_count_recorded;
volatile MR_Unsigned MR_stat_count_not_recorded;
// Atomic instructions are used to update these fields, and the fields must
// be 64 bits wide to contain the valid range of values. However, a 32-bit
// machine cannot (usually) perform atomic operations on 64-bit data, so if
// we have fewer than 64 bits per word we protect these two fields with a
// lock.
//
// The sum of squares is used to calculate variance and standard deviation.
#if MR_LOW_TAG_BITS >= 3
volatile MR_Integer MR_stat_sum;
volatile MR_Unsigned MR_stat_sum_squares;
#else
MR_Us_Lock MR_stat_sums_lock;
MR_int_least64_t MR_stat_sum;
MR_uint_least64_t MR_stat_sum_squares;
#endif
} MR_Stats;
typedef struct {
MR_uint_least64_t MR_timer_time;
MR_Unsigned MR_timer_processor_id;
} MR_Timer;
// The number of CPU clock cycles per second; for example, a 1GHz CPU will
// have a value of 10^9. This is zero if unknown.
// This value is only available after MR_do_cpu_feature_detection() has been
// called.
extern MR_uint_least64_t MR_cpu_cycles_per_sec;
// Perform CPU feature detection; this is necessary for profiling parallel
// code execution and for the threadscope code.
// On i386 and x86_64 machines this uses CPUID to determine if the RDTSCP
// instruction is available and not prohibited by the OS.
// This function is idempotent.
extern void MR_do_cpu_feature_detection(void);
// Start and initialize a timer structure.
extern void MR_profiling_start_timer(MR_Timer *timer);
// Stop the timer and update stats with the results.
extern void MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats);
// Returns true iff the TSC works and MR_cpu_cycles_per_sec is nonzero.
extern MR_bool MR_tsc_is_sensible(void);
// Read the CPU's TSC. This is currently only implemented for i386 and x86-64
// systems. It returns 0 when support is not available.
extern MR_uint_least64_t MR_read_cpu_tsc(void);
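// An illustrative sketch of timing a region of code into an MR_Stats
// record (hypothetical variable and workload, not compiled). Calling the
// feature detection function again here is harmless since it is
// idempotent.
#if 0
static MR_Stats example_work_stats;

static void
example_timed_work(void)
{
    MR_Timer    timer;
    MR_Unsigned i;

    MR_do_cpu_feature_detection();
    MR_profiling_start_timer(&timer);
    for (i = 0; i < 1000; i++) {
        // ... the work being measured ...
    }
    MR_profiling_stop_timer(&timer, &example_work_stats);
}
#endif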
#endif // MR_PROFILE_PARALLEL_EXECUTION_SUPPORT
////////////////////////////////////////////////////////////////////////////
#endif // not MERCURY_ATOMIC_OPS_H