// vim: ts=4 sw=4 expandtab ft=c

// Copyright (C) 2007, 2009-2011 The University of Melbourne.
// Copyright (C) 2016, 2018 The Mercury team.
// This file is distributed under the terms specified in COPYING.LIB.

// mercury_atomic_ops.h - defines atomic operations and other primitives
// used by the parallel runtime.

#ifndef MERCURY_ATOMIC_OPS_H
#define MERCURY_ATOMIC_OPS_H

#include "mercury_std.h"

////////////////////////////////////////////////////////////////////////////

// Use this to make some storage volatile only when using a threadsafe grade.

#ifdef MR_THREAD_SAFE
#define MR_THREADSAFE_VOLATILE volatile
#else
#define MR_THREADSAFE_VOLATILE
#endif

#if defined(MR_THREAD_SAFE)

// Intel and AMD support a pause instruction that is roughly equivalent
// to a no-op. Intel recommends using it in spin-loops to improve
// performance. Without a pause instruction, multiple simultaneous
// read-requests for the synchronization variable will be in-flight from
// a single thread. The pause instruction causes these to be executed in
// sequence, allowing the processor to handle the change in the
// synchronization variable more easily.
//
// On some chips it may cause the spin-loop to use less power.
//
// This instruction was introduced with the Pentium 4 but is backwards
// compatible: the two-byte encoding of PAUSE is the NOP instruction
// prefixed by REPE, so older processors simply execute a no-op.
//
// This is not really an atomic instruction but we name it MR_ATOMIC_PAUSE
// for consistency.
//
// References: Intel and AMD documentation for PAUSE, Intel optimisation
// guide.

#if ( defined(MR_CLANG) || defined(MR_GNUC) ) && \
    ( defined(__i386__) || defined(__x86_64__) ) && \
    !defined(MR_DO_NOT_USE_CPU_RELAX)

#define MR_ATOMIC_PAUSE                                                     \
    do {                                                                    \
        __asm__ __volatile__("pause");                                      \
    } while (0)

#else

// Fall back to a no-op.
#define MR_ATOMIC_PAUSE                                                     \
    do {                                                                    \
        ;                                                                   \
    } while (0)

#endif
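
// Example (illustrative only, not part of the API): a spin-wait that
// backs off with MR_ATOMIC_PAUSE while waiting for a hypothetical flag
// set by another thread.
//
//      volatile MR_Unsigned flag = 0;
//      ...
//      while (flag == 0) {
//          MR_ATOMIC_PAUSE;
//      }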

////////////////////////////////////////////////////////////////////////////

// Declarations for inline atomic operations.
//
// These operations work on machine word-sized values, which are distinct
// from C's idea of 'int' and 'unsigned int'. MR_Integer and MR_Unsigned
// are supposed to be machine word sized, so these functions should only
// be used with values of those types.

// If the value at addr is equal to old, assign new_val to addr and return
// true. Otherwise return false.

MR_EXTERN_INLINE MR_bool    MR_compare_and_swap_int(
                                volatile MR_Integer *addr,
                                MR_Integer old, MR_Integer new_val);
MR_EXTERN_INLINE MR_bool    MR_compare_and_swap_uint(
                                volatile MR_Unsigned *addr,
                                MR_Unsigned old, MR_Unsigned new_val);
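
// Example (illustrative only): atomically claim ownership of a slot,
// assuming hypothetical variables 'owner' (initially 0) and 'my_id'.
//
//      volatile MR_Unsigned owner = 0;
//      ...
//      if (MR_compare_and_swap_uint(&owner, 0, my_id)) {
//          // This thread won the race and now owns the slot.
//      }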

// Atomically add to an integer in memory and retrieve the result; in
// other words, an atomic pre-increment operation.

MR_EXTERN_INLINE MR_Integer MR_atomic_add_and_fetch_int(
                                volatile MR_Integer *addr, MR_Integer addend);
MR_EXTERN_INLINE MR_Unsigned MR_atomic_add_and_fetch_uint(
                                volatile MR_Unsigned *addr, MR_Unsigned addend);

// Atomically add the second argument to the memory pointed to by the
// first argument.

MR_EXTERN_INLINE void       MR_atomic_add_int(volatile MR_Integer *addr,
                                MR_Integer addend);
MR_EXTERN_INLINE void       MR_atomic_add_uint(volatile MR_Unsigned *addr,
                                MR_Unsigned addend);

// Atomically subtract the second argument from the memory pointed to by
// the first argument.

MR_EXTERN_INLINE void       MR_atomic_sub_int(volatile MR_Integer *addr,
                                MR_Integer x);

// Increment the word pointed at by the address.

MR_EXTERN_INLINE void       MR_atomic_inc_int(volatile MR_Integer *addr);
MR_EXTERN_INLINE void       MR_atomic_inc_uint(volatile MR_Unsigned *addr);

// Decrement the word pointed at by the address.

MR_EXTERN_INLINE void       MR_atomic_dec_int(volatile MR_Integer *addr);

// Decrement the integer pointed at by the address and return true iff it
// is zero after the decrement.

MR_EXTERN_INLINE MR_bool    MR_atomic_dec_and_is_zero_int(
                                volatile MR_Integer *addr);
MR_EXTERN_INLINE MR_bool    MR_atomic_dec_and_is_zero_uint(
                                volatile MR_Unsigned *addr);
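
// Example (illustrative only): releasing a reference-counted object,
// assuming a hypothetical 'refcount' field whose references were taken
// with MR_atomic_inc_uint.
//
//      if (MR_atomic_dec_and_is_zero_uint(&obj->refcount)) {
//          // The last reference is gone; obj may be freed here.
//      }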

// For information about GCC's builtins for atomic operations see:
// http://gcc.gnu.org/onlinedocs/gcc-4.2.4/gcc/Atomic-Builtins.html

////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////

#if (defined(MR_CLANG) || (MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1))) && \
    !defined(MR_AVOID_COMPILER_INTRINSICS)

// GCC 4.1 and above have builtin atomic operations.

#define MR_COMPARE_AND_SWAP_WORD_BODY                                       \
    do {                                                                    \
        return __sync_bool_compare_and_swap(addr, old, new_val);            \
    } while (0)

#elif defined(MR_GNUC) && defined(__x86_64__)

#define MR_COMPARE_AND_SWAP_WORD_BODY                                       \
    do {                                                                    \
        char result;                                                        \
                                                                            \
        __asm__ __volatile__(                                               \
            "lock; cmpxchgq %4, %0; setz %1"                                \
            : "=m"(*addr), "=q"(result), "=a"(old)                          \
            : "m"(*addr), "r"(new_val), "a"(old)                            \
            );                                                              \
        return (MR_bool) result;                                            \
    } while (0)

#elif defined(MR_GNUC) && defined(__i386__)

// Really 486 or better.
#define MR_COMPARE_AND_SWAP_WORD_BODY                                       \
    do {                                                                    \
        char result;                                                        \
                                                                            \
        __asm__ __volatile__(                                               \
            "lock; cmpxchgl %4, %0; setz %1"                                \
            : "=m"(*addr), "=q"(result), "=a"(old)                          \
            : "m"(*addr), "r"(new_val), "a"(old)                            \
            );                                                              \
        return (MR_bool) result;                                            \
    } while (0)

#endif

#ifdef MR_COMPARE_AND_SWAP_WORD_BODY
MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
    MR_Integer new_val)
{
    MR_COMPARE_AND_SWAP_WORD_BODY;
}

MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
    MR_Unsigned new_val)
{
    MR_COMPARE_AND_SWAP_WORD_BODY;
}
#endif

////////////////////////////////////////////////////////////////////////////

#if (MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)) && \
    !defined(MR_AVOID_COMPILER_INTRINSICS)

#define MR_ATOMIC_ADD_AND_FETCH_WORD_BODY                                   \
    do {                                                                    \
        return __sync_add_and_fetch(addr, addend);                          \
    } while (0)

#define MR_ATOMIC_ADD_AND_FETCH_INT_BODY MR_ATOMIC_ADD_AND_FETCH_WORD_BODY
#define MR_ATOMIC_ADD_AND_FETCH_UINT_BODY MR_ATOMIC_ADD_AND_FETCH_WORD_BODY

#elif defined(MR_COMPARE_AND_SWAP_WORD_BODY)
// If there is no GCC builtin for this, then it can be implemented in
// terms of compare and swap, assuming that that has been implemented in
// assembler for this architecture.
//
// XXX: There is an exchange-and-add (xadd) instruction on x86; it would
// be better than the CAS loop below.

#define MR_ATOMIC_ADD_AND_FETCH_INT_BODY                                    \
    do {                                                                    \
        MR_Integer temp;                                                    \
        temp = *addr;                                                       \
        while (!MR_compare_and_swap_int(addr, temp, temp + addend)) {       \
            MR_ATOMIC_PAUSE;                                                \
            temp = *addr;                                                   \
        }                                                                   \
        return temp + addend;                                               \
    } while (0)

#define MR_ATOMIC_ADD_AND_FETCH_UINT_BODY                                   \
    do {                                                                    \
        MR_Unsigned temp;                                                   \
        temp = *addr;                                                       \
        while (!MR_compare_and_swap_uint(addr, temp, temp + addend)) {      \
            MR_ATOMIC_PAUSE;                                                \
            temp = *addr;                                                   \
        }                                                                   \
        return temp + addend;                                               \
    } while (0)

#endif

#ifdef MR_ATOMIC_ADD_AND_FETCH_INT_BODY
MR_EXTERN_INLINE MR_Integer
MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend)
{
    MR_ATOMIC_ADD_AND_FETCH_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_ADD_AND_FETCH_UINT_BODY
MR_EXTERN_INLINE MR_Unsigned
MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
{
    MR_ATOMIC_ADD_AND_FETCH_UINT_BODY;
}
#endif
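
// Example (illustrative only): drawing a unique id from a shared counter,
// assuming a hypothetical counter 'next_id'.
//
//      static volatile MR_Unsigned next_id = 0;
//      ...
//      MR_Unsigned my_id = MR_atomic_add_and_fetch_uint(&next_id, 1);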

////////////////////////////////////////////////////////////////////////////

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

#define MR_ATOMIC_ADD_WORD_BODY                                             \
    do {                                                                    \
        __asm__ __volatile__(                                               \
            "lock; addq %2, %0"                                             \
            : "=m"(*addr)                                                   \
            : "m"(*addr), "r"(addend)                                       \
            );                                                              \
    } while (0)

#define MR_ATOMIC_ADD_INT_BODY MR_ATOMIC_ADD_WORD_BODY
#define MR_ATOMIC_ADD_UINT_BODY MR_ATOMIC_ADD_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

#define MR_ATOMIC_ADD_WORD_BODY                                             \
    do {                                                                    \
        __asm__ __volatile__(                                               \
            "lock; addl %2, %0;"                                            \
            : "=m"(*addr)                                                   \
            : "m"(*addr), "r"(addend)                                       \
            );                                                              \
    } while (0)

#define MR_ATOMIC_ADD_INT_BODY MR_ATOMIC_ADD_WORD_BODY
#define MR_ATOMIC_ADD_UINT_BODY MR_ATOMIC_ADD_WORD_BODY

#elif defined(MR_ATOMIC_ADD_AND_FETCH_INT_BODY)

#define MR_ATOMIC_ADD_INT_BODY                                              \
    do {                                                                    \
        MR_atomic_add_and_fetch_int(addr, addend);                          \
    } while (0)

#define MR_ATOMIC_ADD_UINT_BODY                                             \
    do {                                                                    \
        MR_atomic_add_and_fetch_uint(addr, addend);                         \
    } while (0)

#endif

#ifdef MR_ATOMIC_ADD_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
{
    MR_ATOMIC_ADD_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_ADD_UINT_BODY
MR_EXTERN_INLINE void
MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
{
    MR_ATOMIC_ADD_UINT_BODY;
}
#endif
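
// Example (illustrative only): accumulating into a shared statistics
// counter, assuming a hypothetical counter 'bytes_allocated'.
//
//      static volatile MR_Unsigned bytes_allocated = 0;
//      ...
//      MR_atomic_add_uint(&bytes_allocated, size);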

////////////////////////////////////////////////////////////////////////////

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

#define MR_ATOMIC_SUB_INT_BODY                                              \
    do {                                                                    \
        __asm__ __volatile__(                                               \
            "lock; subq %2, %0;"                                            \
            : "=m"(*addr)                                                   \
            : "m"(*addr), "r"(x)                                            \
            );                                                              \
    } while (0)

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

#define MR_ATOMIC_SUB_INT_BODY                                              \
    do {                                                                    \
        __asm__ __volatile__(                                               \
            "lock; subl %2, %0;"                                            \
            : "=m"(*addr)                                                   \
            : "m"(*addr), "r"(x)                                            \
            );                                                              \
    } while (0)

#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

#define MR_ATOMIC_SUB_INT_BODY                                              \
    do {                                                                    \
        __sync_sub_and_fetch(addr, x);                                      \
    } while (0)

#endif

#ifdef MR_ATOMIC_SUB_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
{
    MR_ATOMIC_SUB_INT_BODY;
}
#endif

////////////////////////////////////////////////////////////////////////////

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

#define MR_ATOMIC_INC_WORD_BODY                                             \
    do {                                                                    \
        __asm__ __volatile__(                                               \
            "lock; incq %0;"                                                \
            : "=m"(*addr)                                                   \
            : "m"(*addr)                                                    \
            );                                                              \
    } while (0)

#define MR_ATOMIC_INC_INT_BODY MR_ATOMIC_INC_WORD_BODY
#define MR_ATOMIC_INC_UINT_BODY MR_ATOMIC_INC_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

// Really 486 or better.
#define MR_ATOMIC_INC_WORD_BODY                                             \
    do {                                                                    \
        __asm__ __volatile__(                                               \
            "lock; incl %0;"                                                \
            : "=m"(*addr)                                                   \
            : "m"(*addr)                                                    \
            );                                                              \
    } while (0)

#define MR_ATOMIC_INC_INT_BODY MR_ATOMIC_INC_WORD_BODY
#define MR_ATOMIC_INC_UINT_BODY MR_ATOMIC_INC_WORD_BODY

#else

// Fall back to an atomic add-1 operation.
//
// We could fall back to the GCC builtins, but they also fetch the value;
// I believe this is more efficient. -pbone

#define MR_ATOMIC_INC_INT_BODY                                              \
    MR_atomic_add_int(addr, 1)
#define MR_ATOMIC_INC_UINT_BODY                                             \
    MR_atomic_add_uint(addr, 1)

#endif

#ifdef MR_ATOMIC_INC_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_inc_int(volatile MR_Integer *addr)
{
    MR_ATOMIC_INC_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_INC_UINT_BODY
MR_EXTERN_INLINE void
MR_atomic_inc_uint(volatile MR_Unsigned *addr)
{
    MR_ATOMIC_INC_UINT_BODY;
}
#endif

////////////////////////////////////////////////////////////////////////////

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

#define MR_ATOMIC_DEC_INT_BODY                                              \
    do {                                                                    \
        __asm__ __volatile__(                                               \
            "lock; decq %0;"                                                \
            : "=m"(*addr)                                                   \
            : "m"(*addr)                                                    \
            );                                                              \
    } while (0)

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

// Really 486 or better.
#define MR_ATOMIC_DEC_INT_BODY                                              \
    do {                                                                    \
        __asm__ __volatile__(                                               \
            "lock; decl %0;"                                                \
            : "=m"(*addr)                                                   \
            : "m"(*addr)                                                    \
            );                                                              \
    } while (0)

#else

// Fall back to an atomic subtract-1 operation.

#define MR_ATOMIC_DEC_INT_BODY                                              \
    MR_atomic_sub_int(addr, 1)

#endif

#ifdef MR_ATOMIC_DEC_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_dec_int(volatile MR_Integer *addr)
{
    MR_ATOMIC_DEC_INT_BODY;
}
#endif

////////////////////////////////////////////////////////////////////////////

// Note that on x86(_64) we have to use the sub instruction rather than
// the dec instruction because we need it to set the CPU flags.

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

// This could be trivially implemented using the __sync_sub_and_fetch
// compiler intrinsic; however, on some platforms that could use a compare
// and exchange loop. We can avoid this because we do not need to retrieve
// the result of the subtraction.

#define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY                                 \
    do {                                                                    \
        char is_zero;                                                       \
        __asm__(                                                            \
            "lock; subq $1, %0; setz %1"                                    \
            : "=m"(*addr), "=q"(is_zero)                                    \
            : "m"(*addr)                                                    \
            );                                                              \
        return (MR_bool) is_zero;                                           \
    } while (0)

#define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY                                  \
    MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
#define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY                                 \
    MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

#define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY                                 \
    do {                                                                    \
        char is_zero;                                                       \
        __asm__(                                                            \
            "lock; subl $1, %0; setz %1"                                    \
            : "=m"(*addr), "=q"(is_zero)                                    \
            : "m"(*addr)                                                    \
            );                                                              \
        return (MR_bool) is_zero;                                           \
    } while (0)

#define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY                                  \
    MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
#define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY                                 \
    MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

#define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY                                 \
    do {                                                                    \
        return (__sync_sub_and_fetch(addr, 1) == 0);                        \
    } while (0)

#define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY                                  \
    MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
#define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY                                 \
    MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#endif

#ifdef MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr)
{
    MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr)
{
    MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY;
}
#endif

#endif // MR_THREAD_SAFE

////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////

#ifdef MR_THREAD_SAFE

// Memory fence operations.

#if ( defined(MR_CLANG) || defined(MR_GNUC) ) && \
    ( defined(__i386__) || defined(__x86_64__) ) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

// Guarantees that any stores executed before this fence are globally
// visible before those after this fence.

#define MR_CPU_SFENCE                                                       \
    do {                                                                    \
        __asm__ __volatile__("sfence");                                     \
    } while (0)

// Guarantees that any loads executed before this fence are complete
// before any loads after this fence.

#define MR_CPU_LFENCE                                                       \
    do {                                                                    \
        __asm__ __volatile__("lfence");                                     \
    } while (0)

// A combination of the above.

#define MR_CPU_MFENCE                                                       \
    do {                                                                    \
        __asm__ __volatile__("mfence");                                     \
    } while (0)

#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

// Our memory fences are better than GCC's: GCC only implements a full
// fence.

#define MR_CPU_MFENCE                                                       \
    do {                                                                    \
        __sync_synchronize();                                               \
    } while (0)
#define MR_CPU_SFENCE MR_CPU_MFENCE
#define MR_CPU_LFENCE MR_CPU_MFENCE

#else

#error "Please implement memory fence operations for this compiler/architecture"

#endif
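
// Example (illustrative only): publishing a value to another thread,
// assuming hypothetical volatile shared variables 'data' and 'ready'.
//
//      // Producer: make the store to data globally visible before
//      // the store to ready.
//      data = 42;
//      MR_CPU_SFENCE;
//      ready = 1;
//
//      // Consumer: wait for ready, then fence before reading data.
//      while (ready == 0) {
//          MR_ATOMIC_PAUSE;
//      }
//      MR_CPU_LFENCE;
//      ... use data ...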

#endif // MR_THREAD_SAFE

////////////////////////////////////////////////////////////////////////////

#ifdef MR_LL_PARALLEL_CONJ

// Roll our own cheap user-space mutual exclusion locks. Blocking without
// spinning is not supported. Storage for these locks should be volatile.
//
// I expect these to be faster than pthread mutexes when threads are
// pinned and critical sections are short.

typedef MR_Unsigned MR_Us_Lock;

#define MR_US_LOCK_INITIAL_VALUE (0)

#define MR_US_TRY_LOCK(x)                                                   \
    MR_compare_and_swap_uint(x, 0, 1)

#define MR_US_SPIN_LOCK(x)                                                  \
    do {                                                                    \
        while (!MR_compare_and_swap_uint(x, 0, 1)) {                        \
            MR_ATOMIC_PAUSE;                                                \
        }                                                                   \
    } while (0)

#define MR_US_UNLOCK(x)                                                     \
    do {                                                                    \
        MR_CPU_MFENCE;                                                      \
        *x = 0;                                                             \
    } while (0)
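
// Example (illustrative only): guarding a short critical section with a
// user-space spin lock; 'counter' is a hypothetical shared variable.
//
//      static volatile MR_Us_Lock lock = MR_US_LOCK_INITIAL_VALUE;
//      ...
//      MR_US_SPIN_LOCK(&lock);
//      counter++;              // short critical section
//      MR_US_UNLOCK(&lock);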

// Similar support for condition variables. Again, make sure that storage
// for these is declared as volatile.
//
// XXX: These are not atomic. A waiting thread will not see a change until
// some time after the signaling thread has signaled the condition. The
// same race can occur when clearing a condition. Order of memory
// operations is not guaranteed either.

typedef MR_Unsigned MR_Us_Cond;

#define MR_US_COND_CLEAR(x)                                                 \
    do {                                                                    \
        MR_CPU_MFENCE;                                                      \
        *x = 0;                                                             \
    } while (0)

#define MR_US_COND_SET(x)                                                   \
    do {                                                                    \
        MR_CPU_MFENCE;                                                      \
        *x = 1;                                                             \
        MR_CPU_MFENCE;                                                      \
    } while (0)

#define MR_US_SPIN_COND(x)                                                  \
    do {                                                                    \
        while (!(*x)) {                                                     \
            MR_ATOMIC_PAUSE;                                                \
        }                                                                   \
        MR_CPU_MFENCE;                                                      \
    } while (0)
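
// Example (illustrative only): one thread announcing that initialisation
// is complete while others wait; 'init_done' is a hypothetical variable.
//
//      static volatile MR_Us_Cond init_done = 0;
//
//      // Initialising thread:
//      MR_US_COND_SET(&init_done);
//
//      // Waiting threads:
//      MR_US_SPIN_COND(&init_done);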

#endif // MR_LL_PARALLEL_CONJ

// If we don't have definitions available for this compiler or
// architecture, then we will get a link error in low-level .par grades.
// No other grades currently require any atomic ops.

////////////////////////////////////////////////////////////////////////////

#if defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)

// Declarations for profiling the parallel runtime.

typedef struct {
    // The total number of times this event occurred is implicitly the
    // sum of the recorded and not_recorded counts.

    volatile MR_Unsigned    MR_stat_count_recorded;
    volatile MR_Unsigned    MR_stat_count_not_recorded;

    // Atomic instructions are used to update these fields, and these
    // fields must be 64 bits wide to contain the valid ranges of values.
    // However, a 32 bit machine cannot (usually) do atomic operations on
    // 64 bit data, so if we have fewer than 64 bits we protect these two
    // fields with a lock.
    //
    // The sum of squares is used to calculate variance and standard
    // deviation.

#if MR_LOW_TAG_BITS >= 3
    volatile MR_Integer     MR_stat_sum;
    volatile MR_Unsigned    MR_stat_sum_squares;
#else
    MR_Us_Lock              MR_stat_sums_lock;
    MR_int_least64_t        MR_stat_sum;
    MR_uint_least64_t       MR_stat_sum_squares;
#endif
} MR_Stats;

typedef struct {
    MR_uint_least64_t       MR_timer_time;
    MR_Unsigned             MR_timer_processor_id;
} MR_Timer;

// The number of CPU clock cycles per second; i.e. a 1GHz CPU will have a
// value of 10^9. Zero if unknown.
// This value is only available after MR_do_cpu_feature_detection() has
// been called.

extern MR_uint_least64_t MR_cpu_cycles_per_sec;

// Do CPU feature detection; this is necessary for profiling parallel code
// execution and the threadscope code.
// On i386 and x86_64 machines this uses CPUID to determine if the RDTSCP
// instruction is available and not prohibited by the OS.
// This function is idempotent.

extern void MR_do_cpu_feature_detection(void);

// Start and initialize a timer structure.

extern void MR_profiling_start_timer(MR_Timer *timer);

// Stop the timer and update stats with the results.

extern void MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats);
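
// Example (illustrative only): timing a region of the runtime, assuming
// a hypothetical MR_Stats variable 'my_stats' that is zero-initialised.
//
//      MR_Timer timer;
//
//      MR_do_cpu_feature_detection();  // idempotent; safe to repeat
//      MR_profiling_start_timer(&timer);
//      ... code being measured ...
//      MR_profiling_stop_timer(&timer, &my_stats);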

// Return true iff the TSC works and MR_cpu_cycles_per_sec is nonzero.

extern MR_bool MR_tsc_is_sensible(void);

// Read the CPU's TSC. This is currently only implemented for i386 and
// x86-64 systems. It returns 0 when support is not available.

extern MR_uint_least64_t MR_read_cpu_tsc(void);

#endif // MR_PROFILE_PARALLEL_EXECUTION_SUPPORT

////////////////////////////////////////////////////////////////////////////

#endif // not MERCURY_ATOMIC_OPS_H