used. Mercury's ThreadScope support will now use gettimeofday() by
default, but use of the TSC may be enabled.
Note that in Linux, gettimeofday() does not always make a system call.

runtime/mercury_threadscope.[ch]:
    Add support for measuring time with gettimeofday().
    Use gettimeofday() to measure time by default.

runtime/mercury_atomic_ops.[ch]:
    Add a new function, MR_tsc_is_sensible(). It returns true if the TSC can
    (as far as the RTS can detect) be used.
    Fix trailing whitespace.

runtime/mercury_wrapper.c:
    Add a new runtime option, --threadscope-use-tsc.
    When specified, this option allows ThreadScope to use the CPU's TSC to
    measure time.

doc/userguide.texi:
    Document the --threadscope-use-tsc option. This documentation is
    commented out.
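
For reference, a minimal sketch of the gettimeofday()-based time measurement
described above (illustrative only; get_nsecs() is a hypothetical helper and
not necessarily how mercury_threadscope.c implements it):

    #include <sys/time.h>
    #include <stdint.h>

    /* Return a wall-clock timestamp in nanoseconds. */
    static uint64_t
    get_nsecs(void)
    {
        struct timeval tv;

        gettimeofday(&tv, NULL);
        return (uint64_t) tv.tv_sec * 1000000000 + (uint64_t) tv.tv_usec * 1000;
    }
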
/*
** vim:ts=4 sw=4 expandtab
*/
/*
** Copyright (C) 2007, 2009-2011 The University of Melbourne.
** This file may only be copied under the terms of the GNU Library General
** Public License - see the file COPYING.LIB in the Mercury distribution.
*/

/*
** mercury_atomic_ops.h - defines atomic operations and other primitives
** used by the parallel runtime.
*/

#ifndef MERCURY_ATOMIC_OPS_H
#define MERCURY_ATOMIC_OPS_H

#include "mercury_std.h"

/*---------------------------------------------------------------------------*/

/*
** Use this to make some storage volatile only when using a threadsafe grade.
*/
#ifdef MR_THREAD_SAFE
  #define MR_THREADSAFE_VOLATILE    volatile
#else
  #define MR_THREADSAFE_VOLATILE
#endif

#if defined(MR_THREAD_SAFE)

/*
 * Intel and AMD support a pause instruction that is roughly equivalent
 * to a no-op. Intel recommends that it be used in spin loops to improve
 * performance. Without a pause instruction, multiple simultaneous
 * read requests for the synchronization variable will be in flight from a
 * single thread. The pause instruction causes these to be executed in
 * sequence, allowing the processor to handle the change in the
 * synchronization variable more easily.
 *
 * On some chips it may cause the spin loop to use less power.
 *
 * This instruction was introduced with the Pentium 4 but is backwards
 * compatible. This works because the two-byte instruction for PAUSE is
 * equivalent to the NOP instruction prefixed by REPE; therefore older
 * processors simply perform a no-op.
 *
 * This is not really an atomic instruction, but we name it
 * MR_ATOMIC_PAUSE for consistency.
 *
 * References: Intel and AMD documentation for PAUSE, and the Intel
 * optimisation guide.
 */

#if ( defined(MR_CLANG) || defined(MR_GNUC) ) && \
    ( defined(__i386__) || defined(__x86_64__) ) && \
    !defined(MR_DO_NOT_USE_CPU_RELAX)

    #define MR_ATOMIC_PAUSE                                                 \
        do {                                                                \
            __asm__ __volatile__("pause");                                  \
        } while(0)

#else

    /* Fall back to a no-op. */
    #define MR_ATOMIC_PAUSE                                                 \
        do {                                                                \
            ;                                                               \
        } while(0)

#endif

/*---------------------------------------------------------------------------*/

/*
** Declarations for inline atomic operations.
**
** These operations work on machine word-sized values, which are distinct
** from C's idea of 'int' and 'unsigned int'. MR_Integer and MR_Unsigned are
** supposed to be machine word sized, so these functions should only be used
** with values of these types.
*/

/*
** If the value at *addr is equal to old, assign new_val to *addr and return
** true. Otherwise return false.
*/
MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
    MR_Integer new_val);
MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
    MR_Unsigned new_val);
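
/*
** Illustrative sketch only (not part of this interface): a typical
** compare-and-swap retry loop for updating a shared word. The variable and
** function names below are hypothetical.
**
**      static volatile MR_Unsigned shared_counter = 0;
**
**      static void
**      add_to_counter(MR_Unsigned amount)
**      {
**          MR_Unsigned old;
**
**          do {
**              old = shared_counter;
**          } while (!MR_compare_and_swap_uint(&shared_counter, old,
**              old + amount));
**      }
*/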

/*
** Atomically add to an integer in memory and retrieve the result. In other
** words, an atomic pre-increment operation.
*/
MR_EXTERN_INLINE MR_Integer
MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend);
MR_EXTERN_INLINE MR_Unsigned
MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend);

/*
** Atomically add the second argument to the memory pointed to by the first
** argument.
*/
MR_EXTERN_INLINE void
MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend);
MR_EXTERN_INLINE void
MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend);

/*
** Atomically subtract the second argument from the memory pointed to by the
** first argument.
*/
MR_EXTERN_INLINE void
MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x);

/*
** Increment the word pointed at by the address.
*/
MR_EXTERN_INLINE void
MR_atomic_inc_int(volatile MR_Integer *addr);
MR_EXTERN_INLINE void
MR_atomic_inc_uint(volatile MR_Unsigned *addr);

/*
** Decrement the word pointed at by the address.
*/
MR_EXTERN_INLINE void
MR_atomic_dec_int(volatile MR_Integer *addr);

/*
** Decrement the integer pointed at by the address and return true iff it is
** zero after the decrement.
*/
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr);
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr);
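
/*
** Illustrative sketch only (not part of this interface): these operations
** support a simple reference-count release pattern, where the thread that
** drops the last reference frees the resource. The names below are
** hypothetical.
**
**      static volatile MR_Unsigned ref_count = 1;
**
**      static void
**      release_ref(void)
**      {
**          if (MR_atomic_dec_and_is_zero_uint(&ref_count)) {
**              free_the_resource();
**          }
**      }
*/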

/*
** For information about GCC's builtins for atomic operations see:
** http://gcc.gnu.org/onlinedocs/gcc-4.2.4/gcc/Atomic-Builtins.html
*/

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if (defined(MR_CLANG) || (MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1))) && \
    !defined(MR_AVOID_COMPILER_INTRINSICS)

    /*
    ** gcc 4.1 and above have builtin atomic operations.
    */
    #define MR_COMPARE_AND_SWAP_WORD_BODY                                   \
        do {                                                                \
            return __sync_bool_compare_and_swap(addr, old, new_val);        \
        } while (0)

#elif defined(MR_GNUC) && defined(__x86_64__)

    #define MR_COMPARE_AND_SWAP_WORD_BODY                                   \
        do {                                                                \
            char result;                                                    \
                                                                            \
            __asm__ __volatile__(                                           \
                "lock; cmpxchgq %4, %0; setz %1"                            \
                : "=m"(*addr), "=q"(result), "=a"(old)                      \
                : "m"(*addr), "r" (new_val), "a"(old)                       \
                );                                                          \
            return (MR_bool) result;                                        \
        } while (0)

#elif defined(MR_GNUC) && defined(__i386__)

    /* Really 486 or better. */
    #define MR_COMPARE_AND_SWAP_WORD_BODY                                   \
        do {                                                                \
            char result;                                                    \
                                                                            \
            __asm__ __volatile__(                                           \
                "lock; cmpxchgl %4, %0; setz %1"                            \
                : "=m"(*addr), "=q"(result), "=a"(old)                      \
                : "m"(*addr), "r" (new_val), "a"(old)                       \
                );                                                          \
            return (MR_bool) result;                                        \
        } while (0)

#endif

#ifdef MR_COMPARE_AND_SWAP_WORD_BODY
MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
    MR_Integer new_val)
{
    MR_COMPARE_AND_SWAP_WORD_BODY;
}

MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
    MR_Unsigned new_val)
{
    MR_COMPARE_AND_SWAP_WORD_BODY;
}
#endif

/*---------------------------------------------------------------------------*/

#if (MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)) && \
    !defined(MR_AVOID_COMPILER_INTRINSICS)

    #define MR_ATOMIC_ADD_AND_FETCH_WORD_BODY                               \
        do {                                                                \
            return __sync_add_and_fetch(addr, addend);                      \
        } while (0)

    #define MR_ATOMIC_ADD_AND_FETCH_INT_BODY MR_ATOMIC_ADD_AND_FETCH_WORD_BODY
    #define MR_ATOMIC_ADD_AND_FETCH_UINT_BODY MR_ATOMIC_ADD_AND_FETCH_WORD_BODY

#elif defined(MR_COMPARE_AND_SWAP_WORD_BODY)

    /*
    ** If there is no GCC builtin for this, then it can be implemented in
    ** terms of compare and swap, assuming that compare and swap has been
    ** implemented in assembler for this architecture.
    **
    ** XXX: There is an add-and-exchange (xadd) instruction on x86; using it
    ** would be better than the CAS loop below.
    */
    #define MR_ATOMIC_ADD_AND_FETCH_INT_BODY                                \
        do {                                                                \
            MR_Integer temp;                                                \
            temp = *addr;                                                   \
            while (!MR_compare_and_swap_int(addr, temp, temp+addend)) {     \
                MR_ATOMIC_PAUSE;                                            \
                temp = *addr;                                               \
            }                                                               \
            return temp+addend;                                             \
        } while (0)

    #define MR_ATOMIC_ADD_AND_FETCH_UINT_BODY                               \
        do {                                                                \
            MR_Unsigned temp;                                               \
            temp = *addr;                                                   \
            while (!MR_compare_and_swap_uint(addr, temp, temp+addend)) {    \
                MR_ATOMIC_PAUSE;                                            \
                temp = *addr;                                               \
            }                                                               \
            return temp+addend;                                             \
        } while (0)

#endif

#ifdef MR_ATOMIC_ADD_AND_FETCH_INT_BODY
MR_EXTERN_INLINE MR_Integer
MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend)
{
    MR_ATOMIC_ADD_AND_FETCH_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_ADD_AND_FETCH_UINT_BODY
MR_EXTERN_INLINE MR_Unsigned
MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
{
    MR_ATOMIC_ADD_AND_FETCH_UINT_BODY;
}
#endif

/*---------------------------------------------------------------------------*/

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    #define MR_ATOMIC_ADD_WORD_BODY                                         \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; addq %2, %0"                                         \
                : "=m"(*addr)                                               \
                : "m"(*addr), "r"(addend)                                   \
                );                                                          \
        } while (0)

    #define MR_ATOMIC_ADD_INT_BODY MR_ATOMIC_ADD_WORD_BODY
    #define MR_ATOMIC_ADD_UINT_BODY MR_ATOMIC_ADD_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

    #define MR_ATOMIC_ADD_WORD_BODY                                         \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; addl %2, %0;"                                        \
                : "=m"(*addr)                                               \
                : "m"(*addr), "r"(addend)                                   \
                );                                                          \
        } while (0)

    #define MR_ATOMIC_ADD_INT_BODY MR_ATOMIC_ADD_WORD_BODY
    #define MR_ATOMIC_ADD_UINT_BODY MR_ATOMIC_ADD_WORD_BODY

#elif defined(MR_ATOMIC_ADD_AND_FETCH_INT_BODY)

    #define MR_ATOMIC_ADD_INT_BODY                                          \
        do {                                                                \
            MR_atomic_add_and_fetch_int(addr, addend);                      \
        } while (0)

    #define MR_ATOMIC_ADD_UINT_BODY                                         \
        do {                                                                \
            MR_atomic_add_and_fetch_uint(addr, addend);                     \
        } while (0)

#endif

#ifdef MR_ATOMIC_ADD_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
{
    MR_ATOMIC_ADD_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_ADD_UINT_BODY
MR_EXTERN_INLINE void
MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
{
    MR_ATOMIC_ADD_UINT_BODY;
}
#endif

/*---------------------------------------------------------------------------*/

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    #define MR_ATOMIC_SUB_INT_BODY                                          \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; subq %2, %0;"                                        \
                : "=m"(*addr)                                               \
                : "m"(*addr), "r"(x)                                        \
                );                                                          \
        } while (0)

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

    #define MR_ATOMIC_SUB_INT_BODY                                          \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; subl %2, %0;"                                        \
                : "=m"(*addr)                                               \
                : "m"(*addr), "r"(x)                                        \
                );                                                          \
        } while (0)

#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

    #define MR_ATOMIC_SUB_INT_BODY                                          \
        do {                                                                \
            __sync_sub_and_fetch(addr, x);                                  \
        } while (0)

#endif

#ifdef MR_ATOMIC_SUB_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
{
    MR_ATOMIC_SUB_INT_BODY;
}
#endif

/*---------------------------------------------------------------------------*/

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    #define MR_ATOMIC_INC_WORD_BODY                                         \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; incq %0;"                                            \
                : "=m"(*addr)                                               \
                : "m"(*addr)                                                \
                );                                                          \
        } while (0)

    #define MR_ATOMIC_INC_INT_BODY MR_ATOMIC_INC_WORD_BODY
    #define MR_ATOMIC_INC_UINT_BODY MR_ATOMIC_INC_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    /* Really 486 or better. */
    #define MR_ATOMIC_INC_WORD_BODY                                         \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; incl %0;"                                            \
                : "=m"(*addr)                                               \
                : "m"(*addr)                                                \
                );                                                          \
        } while (0)

    #define MR_ATOMIC_INC_INT_BODY MR_ATOMIC_INC_WORD_BODY
    #define MR_ATOMIC_INC_UINT_BODY MR_ATOMIC_INC_WORD_BODY

#else

    /*
    ** Fall back to an atomic add 1 operation.
    **
    ** We could fall back to the built-in GCC instructions but they also fetch
    ** the value. I believe this is more efficient.
    **    - pbone
    */
    #define MR_ATOMIC_INC_INT_BODY                                          \
        MR_atomic_add_int(addr, 1)
    #define MR_ATOMIC_INC_UINT_BODY                                         \
        MR_atomic_add_uint(addr, 1)

#endif

#ifdef MR_ATOMIC_INC_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_inc_int(volatile MR_Integer *addr)
{
    MR_ATOMIC_INC_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_INC_UINT_BODY
MR_EXTERN_INLINE void
MR_atomic_inc_uint(volatile MR_Unsigned *addr)
{
    MR_ATOMIC_INC_UINT_BODY;
}
#endif

/*---------------------------------------------------------------------------*/

#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    #define MR_ATOMIC_DEC_INT_BODY                                          \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; decq %0;"                                            \
                : "=m"(*addr)                                               \
                : "m"(*addr)                                                \
                );                                                          \
        } while (0)

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    /* Really 486 or better. */
    #define MR_ATOMIC_DEC_INT_BODY                                          \
        do {                                                                \
            __asm__ __volatile__(                                           \
                "lock; decl %0;"                                            \
                : "=m"(*addr)                                               \
                : "m"(*addr)                                                \
                );                                                          \
        } while (0)

#else

    /*
    ** Fall back to an atomic subtract 1 operation.
    */
    #define MR_ATOMIC_DEC_INT_BODY                                          \
        MR_atomic_sub_int(addr, 1)

#endif

#ifdef MR_ATOMIC_DEC_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_dec_int(volatile MR_Integer *addr)
{
    MR_ATOMIC_DEC_INT_BODY;
}
#endif

/*---------------------------------------------------------------------------*/

/*
** Note that on x86(_64) we have to use the sub instruction rather than the
** dec instruction because we need it to set the CPU flags.
*/
#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    /*
    ** This could be trivially implemented using the __sync_sub_and_fetch
    ** compiler intrinsic. However, on some platforms that could use a
    ** compare-and-exchange loop, which we can avoid because we do not need
    ** to retrieve the result of the subtraction.
    */
    #define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY                             \
        do {                                                                \
            char is_zero;                                                   \
            __asm__(                                                        \
                "lock; subq $1, %0; setz %1"                                \
                : "=m"(*addr), "=q"(is_zero)                                \
                : "m"(*addr)                                                \
                );                                                          \
            return (MR_bool)is_zero;                                        \
        } while (0)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY                              \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
    #define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY                             \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY                             \
        do {                                                                \
            char is_zero;                                                   \
            __asm__(                                                        \
                "lock; subl $1, %0; setz %1"                                \
                : "=m"(*addr), "=q"(is_zero)                                \
                : "m"(*addr)                                                \
                );                                                          \
            return (MR_bool)is_zero;                                        \
        } while (0)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY                              \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
    #define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY                             \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY                             \
        do {                                                                \
            return (__sync_sub_and_fetch(addr, 1) == 0);                    \
        } while (0)

    #define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY                              \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
    #define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY                             \
        MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#endif

#ifdef MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr)
{
    MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr)
{
    MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY;
}
#endif

#endif /* MR_THREAD_SAFE */

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#ifdef MR_THREAD_SAFE

/*
** Memory fence operations.
*/
#if ( defined(MR_CLANG) || defined(MR_GNUC) ) && \
    ( defined(__i386__) || defined(__x86_64__) ) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

    /*
    ** Guarantees that any stores executed before this fence are
    ** globally visible before those after this fence.
    */
    #define MR_CPU_SFENCE                                                   \
        do {                                                                \
            __asm__ __volatile__("sfence");                                 \
        } while(0)

    /*
    ** Guarantees that any loads executed before this fence are complete
    ** before any loads after this fence.
    */
    #define MR_CPU_LFENCE                                                   \
        do {                                                                \
            __asm__ __volatile__("lfence");                                 \
        } while(0)

    /*
    ** A combination of the above.
    */
    #define MR_CPU_MFENCE                                                   \
        do {                                                                \
            __asm__ __volatile__("mfence");                                 \
        } while(0)

#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

    /*
    ** Our memory fences are better than GCC's. GCC only implements a full
    ** fence.
    */
    #define MR_CPU_MFENCE                                                   \
        do {                                                                \
            __sync_synchronize();                                           \
        } while(0)
    #define MR_CPU_SFENCE MR_CPU_MFENCE
    #define MR_CPU_LFENCE MR_CPU_MFENCE

#else

    #error "Please implement memory fence operations for this compiler/architecture"

#endif
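
/*
** Illustrative sketch only (not part of this interface): a typical use of
** the store fence is to publish data before setting a ready flag, so that a
** consumer that observes the flag also observes the data. The variable
** names below are hypothetical.
**
**      shared_payload = computed_value;
**      MR_CPU_SFENCE;      (make the payload visible before the flag)
**      payload_ready = 1;
*/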

#endif /* MR_THREAD_SAFE */

/*---------------------------------------------------------------------------*/

#ifdef MR_LL_PARALLEL_CONJ

/*
** Roll our own cheap user-space mutual exclusion locks. Blocking without
** spinning is not supported. Storage for these locks should be volatile.
**
** I expect these to be faster than pthread mutexes when threads are pinned
** and critical sections are short.
*/
typedef MR_Unsigned MR_Us_Lock;

#define MR_US_LOCK_INITIAL_VALUE (0)

#define MR_US_TRY_LOCK(x)                                                   \
    MR_compare_and_swap_uint(x, 0, 1)

#define MR_US_SPIN_LOCK(x)                                                  \
    do {                                                                    \
        while (!MR_compare_and_swap_uint(x, 0, 1)) {                        \
            MR_ATOMIC_PAUSE;                                                \
        }                                                                   \
    } while (0)

#define MR_US_UNLOCK(x)                                                     \
    do {                                                                    \
        MR_CPU_MFENCE;                                                      \
        *x = 0;                                                             \
    } while (0)
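
/*
** Illustrative sketch only (not part of this interface): guarding a short
** critical section with one of these locks. The variable names below are
** hypothetical; note that the lock storage is declared volatile, as
** recommended above.
**
**      static volatile MR_Us_Lock  queue_lock = MR_US_LOCK_INITIAL_VALUE;
**
**      MR_US_SPIN_LOCK(&queue_lock);
**      ... short critical section ...
**      MR_US_UNLOCK(&queue_lock);
*/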

/*
** Similar support for condition variables. Again, make sure that storage for
** these is declared as volatile.
**
** XXX: These are not atomic. A waiting thread will not see a change until
** sometime after the signaling thread has signaled the condition. The same
** race can occur when clearing a condition. Order of memory operations is
** not guaranteed either.
*/
typedef MR_Unsigned MR_Us_Cond;

#define MR_US_COND_CLEAR(x)                                                 \
    do {                                                                    \
        MR_CPU_MFENCE;                                                      \
        *x = 0;                                                             \
    } while (0)

#define MR_US_COND_SET(x)                                                   \
    do {                                                                    \
        MR_CPU_MFENCE;                                                      \
        *x = 1;                                                             \
        MR_CPU_MFENCE;                                                      \
    } while (0)

#define MR_US_SPIN_COND(x)                                                  \
    do {                                                                    \
        while (!(*x)) {                                                     \
            MR_ATOMIC_PAUSE;                                                \
        }                                                                   \
        MR_CPU_MFENCE;                                                      \
    } while (0)
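
/*
** Illustrative sketch only (not part of this interface): one thread sets
** the condition once some shared state is ready, while another spins until
** it observes it. The variable names below are hypothetical.
**
**      static volatile MR_Us_Cond  work_ready = 0;
**
**      producer:   MR_US_COND_SET(&work_ready);
**      consumer:   MR_US_SPIN_COND(&work_ready);
*/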

#endif /* MR_LL_PARALLEL_CONJ */

/*
** If we don't have definitions available for this compiler or architecture
** then we will get a link error in low-level .par grades. No other grades
** currently require any atomic ops.
*/

/*---------------------------------------------------------------------------*/

#if defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)

/*
** Declarations for profiling the parallel runtime.
*/

typedef struct {
    /*
    ** The total number of times this event occurred is implicitly the sum of
    ** the recorded and not_recorded counts.
    */
    volatile MR_Unsigned    MR_stat_count_recorded;
    volatile MR_Unsigned    MR_stat_count_not_recorded;

    /*
    ** Atomic instructions are used to update these fields, and these fields
    ** must be 64 bit to contain the valid ranges of values. However, a 32 bit
    ** machine cannot (usually) do atomic operations on 64 bit data. Therefore,
    ** if we have fewer than 64 bits we protect these two fields with a lock.
    **
    ** The sum of squares is used to calculate variance and standard deviation.
    */
#if MR_LOW_TAG_BITS >= 3
    volatile MR_Integer     MR_stat_sum;
    volatile MR_Unsigned    MR_stat_sum_squares;
#else
    MR_Us_Lock              MR_stat_sums_lock;
    MR_int_least64_t        MR_stat_sum;
    MR_uint_least64_t       MR_stat_sum_squares;
#endif
} MR_Stats;

typedef struct {
    MR_uint_least64_t       MR_timer_time;
    MR_Unsigned             MR_timer_processor_id;
} MR_Timer;

/*
** The number of CPU clock cycles per second, i.e. a 1GHz CPU will have a
** value of 10^9; zero if unknown.
** This value is only available after MR_do_cpu_feature_detection() has been
** called.
*/
extern MR_uint_least64_t MR_cpu_cycles_per_sec;

/*
** Do CPU feature detection. This is necessary for profiling parallel code
** execution and the threadscope code.
** On i386 and x86_64 machines this uses CPUID to determine if the RDTSCP
** instruction is available and not prohibited by the OS.
** This function is idempotent.
*/
extern void
MR_do_cpu_feature_detection(void);

/*
** Start and initialize a timer structure.
*/
extern void
MR_profiling_start_timer(MR_Timer *timer);

/*
** Stop the timer and update stats with the results.
*/
extern void
MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats);
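
/*
** Illustrative sketch only (not part of this interface): timing a region of
** code and accumulating the result into a statistics structure. The names
** below are hypothetical.
**
**      static MR_Stats     region_stats;
**
**      MR_Timer            timer;
**
**      MR_profiling_start_timer(&timer);
**      ... the work being measured ...
**      MR_profiling_stop_timer(&timer, &region_stats);
*/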

/*
** Return true if the TSC works and MR_cpu_cycles_per_sec is nonzero.
*/
extern MR_bool
MR_tsc_is_sensible(void);

/*
** Read the CPU's TSC. This is currently only implemented for i386 and x86-64
** systems. It returns 0 when support is not available.
*/
extern MR_uint_least64_t
MR_read_cpu_tsc(void);

#endif /* MR_PROFILE_PARALLEL_EXECUTION_SUPPORT */

/*---------------------------------------------------------------------------*/

#endif /* not MERCURY_ATOMIC_OPS_H */