mercury/runtime/mercury_atomic_ops.h
Paul Bone a9f82d004b On some systems the CPU's time stamp counter (TSC) cannot reliably be
used.  Mercury's ThreadScope support will now use gettimeofday() by
default, but use of the TSC may be enabled.

Note that in Linux, gettimeofday() does not always make a system call.

runtime/mercury_threadscope.[ch]:
    Add support for measuring time with gettimeofday().

    Use gettimeofday() to measure time by default.

runtime/mercury_atomic_ops.[ch]:
    Add a new function, MR_tsc_is_sensible(). It returns true if the TSC can
    (as far as the RTS can detect) be used.

    Fix trailing whitespace.

runtime/mercury_wrapper.c:
    Add a new runtime option --threadscope-use-tsc.
    When specified, this option allows ThreadScope to use the CPU's TSC to
    measure time.

doc/userguide.texi:
    Document the --threadscope-use-tsc option.  This documentation is
    commented out.
2012-06-20 13:13:34 +00:00

/*
** vim:ts=4 sw=4 expandtab
*/
/*
** Copyright (C) 2007, 2009-2011 The University of Melbourne.
** This file may only be copied under the terms of the GNU Library General
** Public License - see the file COPYING.LIB in the Mercury distribution.
*/
/*
** mercury_atomic_ops.h - defines atomic operations and other primitives
** used by the parallel runtime.
*/
#ifndef MERCURY_ATOMIC_OPS_H
#define MERCURY_ATOMIC_OPS_H
#include "mercury_std.h"
/*---------------------------------------------------------------------------*/
/*
** Use this to make some storage volatile only when using a threadsafe grade.
*/
#ifdef MR_THREAD_SAFE
#define MR_THREADSAFE_VOLATILE volatile
#else
#define MR_THREADSAFE_VOLATILE
#endif
#if defined(MR_THREAD_SAFE)
/*
** Intel and AMD support a pause instruction that is roughly equivalent
** to a no-op. Intel recommends using it in spin loops to improve
** performance. Without a pause instruction, multiple simultaneous
** read requests for the synchronization variable will be in flight
** from a single thread. The pause instruction causes these to be executed
** in sequence, allowing the processor to handle the change in the
** synchronization variable more easily.
**
** On some chips it may cause the spin loop to use less power.
**
** This instruction was introduced with the Pentium 4 but is backwards
** compatible. This works because the two-byte encoding of PAUSE is
** equivalent to the NOP instruction prefixed by REPE; therefore older
** processors simply execute a no-op.
**
** This is not really an atomic instruction, but we name it
** MR_ATOMIC_PAUSE for consistency.
**
** References: Intel and AMD documentation for PAUSE, Intel optimisation
** guide.
*/
#if ( defined(MR_CLANG) || defined(MR_GNUC) ) && \
    ( defined(__i386__) || defined(__x86_64__) ) && \
    !defined(MR_DO_NOT_USE_CPU_RELAX)

#define MR_ATOMIC_PAUSE \
    do { \
        __asm__ __volatile__("pause"); \
    } while (0)

#else

/* Fall back to a no-op. */
#define MR_ATOMIC_PAUSE \
    do { \
        ; \
    } while (0)

#endif
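
/*
** An illustrative sketch (not part of the original header) of the intended
** use: a busy-wait loop that issues MR_ATOMIC_PAUSE on each iteration, as
** recommended above. The 'ready' flag is a hypothetical example variable.
**
**      volatile MR_Unsigned ready = 0;
**
**      while (!ready) {
**          MR_ATOMIC_PAUSE;
**      }
*/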
/*---------------------------------------------------------------------------*/
/*
** Declarations for inline atomic operations.
**
** These operations work on machine-word-sized values, which are distinct
** from C's 'int' and 'unsigned int'. MR_Integer and MR_Unsigned are
** intended to be machine word sized, so these functions should only be
** used with values of those types.
*/
/*
** If the value at addr is equal to old, assign new_val to addr and return
** true. Otherwise return false.
*/
MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
    MR_Integer new_val);
MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
    MR_Unsigned new_val);
/*
** Atomically add to an integer in memory and retrieve the result; in other
** words, an atomic pre-increment operation.
*/
MR_EXTERN_INLINE MR_Integer
MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend);
MR_EXTERN_INLINE MR_Unsigned
MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend);
/*
** Atomically add the second argument to the memory pointed to by the first
** argument.
*/
MR_EXTERN_INLINE void
MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend);
MR_EXTERN_INLINE void
MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend);
/*
** Atomically subtract the second argument from the memory pointed to by the
** first argument.
*/
MR_EXTERN_INLINE void
MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x);
/*
** Increment the word pointed at by the address.
*/
MR_EXTERN_INLINE void
MR_atomic_inc_int(volatile MR_Integer *addr);
MR_EXTERN_INLINE void
MR_atomic_inc_uint(volatile MR_Unsigned *addr);
/*
** Decrement the word pointed at by the address.
*/
MR_EXTERN_INLINE void
MR_atomic_dec_int(volatile MR_Integer *addr);
/*
** Decrement the integer pointed at by the address and return true iff it is
** zero after the decrement.
*/
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr);
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr);
/*
** For information about GCC's builtins for atomic operations see:
** http://gcc.gnu.org/onlinedocs/gcc-4.2.4/gcc/Atomic-Builtins.html
*/
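
/*
** An illustrative sketch of how the compare-and-swap operations above are
** typically used: retry in a loop until the swap succeeds or the update
** becomes unnecessary. The 'record_max' function and its arguments are
** hypothetical, not part of this interface.
**
**      static void
**      record_max(volatile MR_Unsigned *max, MR_Unsigned sample)
**      {
**          MR_Unsigned old = *max;
**
**          while (sample > old) {
**              if (MR_compare_and_swap_uint(max, old, sample)) {
**                  break;
**              }
**              old = *max;
**          }
**      }
*/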
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
#if (defined(MR_CLANG) || (MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1))) && \
    !defined(MR_AVOID_COMPILER_INTRINSICS)

/*
** gcc 4.1 and above have builtin atomic operations.
*/
#define MR_COMPARE_AND_SWAP_WORD_BODY \
    do { \
        return __sync_bool_compare_and_swap(addr, old, new_val); \
    } while (0)

#elif defined(MR_GNUC) && defined(__x86_64__)

#define MR_COMPARE_AND_SWAP_WORD_BODY \
    do { \
        char result; \
        \
        __asm__ __volatile__( \
            "lock; cmpxchgq %4, %0; setz %1" \
            : "=m"(*addr), "=q"(result), "=a"(old) \
            : "m"(*addr), "r"(new_val), "a"(old) \
            ); \
        return (MR_bool) result; \
    } while (0)

#elif defined(MR_GNUC) && defined(__i386__)

/* Really 486 or better. */
#define MR_COMPARE_AND_SWAP_WORD_BODY \
    do { \
        char result; \
        \
        __asm__ __volatile__( \
            "lock; cmpxchgl %4, %0; setz %1" \
            : "=m"(*addr), "=q"(result), "=a"(old) \
            : "m"(*addr), "r"(new_val), "a"(old) \
            ); \
        return (MR_bool) result; \
    } while (0)

#endif
#ifdef MR_COMPARE_AND_SWAP_WORD_BODY

MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
    MR_Integer new_val)
{
    MR_COMPARE_AND_SWAP_WORD_BODY;
}

MR_EXTERN_INLINE MR_bool
MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
    MR_Unsigned new_val)
{
    MR_COMPARE_AND_SWAP_WORD_BODY;
}

#endif
/*---------------------------------------------------------------------------*/
#if (MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)) && \
    !defined(MR_AVOID_COMPILER_INTRINSICS)

#define MR_ATOMIC_ADD_AND_FETCH_WORD_BODY \
    do { \
        return __sync_add_and_fetch(addr, addend); \
    } while (0)

#define MR_ATOMIC_ADD_AND_FETCH_INT_BODY MR_ATOMIC_ADD_AND_FETCH_WORD_BODY
#define MR_ATOMIC_ADD_AND_FETCH_UINT_BODY MR_ATOMIC_ADD_AND_FETCH_WORD_BODY

#elif defined(MR_COMPARE_AND_SWAP_WORD_BODY)

/*
** If there is no GCC builtin for this, then it can be implemented in terms
** of compare and swap, assuming that that has been implemented in
** assembler for this architecture.
**
** XXX: x86 has an exchange-and-add (xadd) instruction, which would be
** better than the CAS loops below.
*/
#define MR_ATOMIC_ADD_AND_FETCH_INT_BODY \
    do { \
        MR_Integer temp; \
        temp = *addr; \
        while (!MR_compare_and_swap_int(addr, temp, temp + addend)) { \
            MR_ATOMIC_PAUSE; \
            temp = *addr; \
        } \
        return temp + addend; \
    } while (0)

#define MR_ATOMIC_ADD_AND_FETCH_UINT_BODY \
    do { \
        MR_Unsigned temp; \
        temp = *addr; \
        while (!MR_compare_and_swap_uint(addr, temp, temp + addend)) { \
            MR_ATOMIC_PAUSE; \
            temp = *addr; \
        } \
        return temp + addend; \
    } while (0)

#endif
#ifdef MR_ATOMIC_ADD_AND_FETCH_INT_BODY
MR_EXTERN_INLINE MR_Integer
MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend)
{
    MR_ATOMIC_ADD_AND_FETCH_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_ADD_AND_FETCH_UINT_BODY
MR_EXTERN_INLINE MR_Unsigned
MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
{
    MR_ATOMIC_ADD_AND_FETCH_UINT_BODY;
}
#endif
/*---------------------------------------------------------------------------*/
#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

#define MR_ATOMIC_ADD_WORD_BODY \
    do { \
        __asm__ __volatile__( \
            "lock; addq %2, %0" \
            : "=m"(*addr) \
            : "m"(*addr), "r"(addend) \
            ); \
    } while (0)

#define MR_ATOMIC_ADD_INT_BODY MR_ATOMIC_ADD_WORD_BODY
#define MR_ATOMIC_ADD_UINT_BODY MR_ATOMIC_ADD_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

#define MR_ATOMIC_ADD_WORD_BODY \
    do { \
        __asm__ __volatile__( \
            "lock; addl %2, %0;" \
            : "=m"(*addr) \
            : "m"(*addr), "r"(addend) \
            ); \
    } while (0)

#define MR_ATOMIC_ADD_INT_BODY MR_ATOMIC_ADD_WORD_BODY
#define MR_ATOMIC_ADD_UINT_BODY MR_ATOMIC_ADD_WORD_BODY

#elif defined(MR_ATOMIC_ADD_AND_FETCH_INT_BODY)

#define MR_ATOMIC_ADD_INT_BODY \
    do { \
        MR_atomic_add_and_fetch_int(addr, addend); \
    } while (0)

#define MR_ATOMIC_ADD_UINT_BODY \
    do { \
        MR_atomic_add_and_fetch_uint(addr, addend); \
    } while (0)

#endif
#ifdef MR_ATOMIC_ADD_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
{
    MR_ATOMIC_ADD_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_ADD_UINT_BODY
MR_EXTERN_INLINE void
MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
{
    MR_ATOMIC_ADD_UINT_BODY;
}
#endif
/*---------------------------------------------------------------------------*/
#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

#define MR_ATOMIC_SUB_INT_BODY \
    do { \
        __asm__ __volatile__( \
            "lock; subq %2, %0;" \
            : "=m"(*addr) \
            : "m"(*addr), "r"(x) \
            ); \
    } while (0)

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

#define MR_ATOMIC_SUB_INT_BODY \
    do { \
        __asm__ __volatile__( \
            "lock; subl %2, %0;" \
            : "=m"(*addr) \
            : "m"(*addr), "r"(x) \
            ); \
    } while (0)

#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

#define MR_ATOMIC_SUB_INT_BODY \
    do { \
        __sync_sub_and_fetch(addr, x); \
    } while (0)

#endif
#ifdef MR_ATOMIC_SUB_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
{
    MR_ATOMIC_SUB_INT_BODY;
}
#endif
/*---------------------------------------------------------------------------*/
#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

#define MR_ATOMIC_INC_WORD_BODY \
    do { \
        __asm__ __volatile__( \
            "lock; incq %0;" \
            : "=m"(*addr) \
            : "m"(*addr) \
            ); \
    } while (0)

#define MR_ATOMIC_INC_INT_BODY MR_ATOMIC_INC_WORD_BODY
#define MR_ATOMIC_INC_UINT_BODY MR_ATOMIC_INC_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

/* Really 486 or better. */
#define MR_ATOMIC_INC_WORD_BODY \
    do { \
        __asm__ __volatile__( \
            "lock; incl %0;" \
            : "=m"(*addr) \
            : "m"(*addr) \
            ); \
    } while (0)

#define MR_ATOMIC_INC_INT_BODY MR_ATOMIC_INC_WORD_BODY
#define MR_ATOMIC_INC_UINT_BODY MR_ATOMIC_INC_WORD_BODY

#else

/*
** Fall back to an atomic add 1 operation.
**
** We could fall back to the built-in GCC instructions but they also fetch
** the value. I believe this is more efficient.
**  - pbone
*/
#define MR_ATOMIC_INC_INT_BODY \
    MR_atomic_add_int(addr, 1)
#define MR_ATOMIC_INC_UINT_BODY \
    MR_atomic_add_uint(addr, 1)

#endif
#ifdef MR_ATOMIC_INC_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_inc_int(volatile MR_Integer *addr)
{
    MR_ATOMIC_INC_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_INC_UINT_BODY
MR_EXTERN_INLINE void
MR_atomic_inc_uint(volatile MR_Unsigned *addr)
{
    MR_ATOMIC_INC_UINT_BODY;
}
#endif
/*---------------------------------------------------------------------------*/
#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

#define MR_ATOMIC_DEC_INT_BODY \
    do { \
        __asm__ __volatile__( \
            "lock; decq %0;" \
            : "=m"(*addr) \
            : "m"(*addr) \
            ); \
    } while (0)

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

/* Really 486 or better. */
#define MR_ATOMIC_DEC_INT_BODY \
    do { \
        __asm__ __volatile__( \
            "lock; decl %0;" \
            : "=m"(*addr) \
            : "m"(*addr) \
            ); \
    } while (0)

#else

/*
** Fall back to an atomic subtract 1 operation.
*/
#define MR_ATOMIC_DEC_INT_BODY \
    MR_atomic_sub_int(addr, 1)

#endif
#ifdef MR_ATOMIC_DEC_INT_BODY
MR_EXTERN_INLINE void
MR_atomic_dec_int(volatile MR_Integer *addr)
{
    MR_ATOMIC_DEC_INT_BODY;
}
#endif
/*---------------------------------------------------------------------------*/
/*
** Note that on x86(_64) we have to use the sub instruction rather than the
** dec instruction because we need it to set the CPU flags.
*/
#if (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__x86_64__) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

/*
** This could be trivially implemented using the __sync_sub_and_fetch
** compiler intrinsic. However, on some platforms that could use a compare
** and exchange loop. We can avoid this because we don't need to retrieve
** the result of the subtraction.
*/
#define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY \
    do { \
        char is_zero; \
        __asm__( \
            "lock; subq $1, %0; setz %1" \
            : "=m"(*addr), "=q"(is_zero) \
            : "m"(*addr) \
            ); \
        return (MR_bool) is_zero; \
    } while (0)

#define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY \
    MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
#define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY \
    MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#elif (defined(MR_CLANG) || defined(MR_GNUC)) && defined(__i386__)

#define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY \
    do { \
        char is_zero; \
        __asm__( \
            "lock; subl $1, %0; setz %1" \
            : "=m"(*addr), "=q"(is_zero) \
            : "m"(*addr) \
            ); \
        return (MR_bool) is_zero; \
    } while (0)

#define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY \
    MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
#define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY \
    MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

#define MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY \
    do { \
        return (__sync_sub_and_fetch(addr, 1) == 0); \
    } while (0)

#define MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY \
    MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY
#define MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY \
    MR_ATOMIC_DEC_AND_IS_ZERO_WORD_BODY

#endif
#ifdef MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr)
{
    MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY;
}
#endif

#ifdef MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY
MR_EXTERN_INLINE MR_bool
MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr)
{
    MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY;
}
#endif
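
/*
** An illustrative sketch: the classic use for the decrement-and-test
** operations above is reference counting, where only the thread that drops
** the count to zero may free the object. The 'RefCounted' type and
** 'release' function are hypothetical, and free() stands in for whatever
** allocator is in use.
**
**      typedef struct {
**          volatile MR_Unsigned    rc_refcount;
**          void                    *rc_payload;
**      } RefCounted;
**
**      static void
**      release(RefCounted *obj)
**      {
**          if (MR_atomic_dec_and_is_zero_uint(&obj->rc_refcount)) {
**              free(obj);
**          }
**      }
*/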
#endif /* MR_THREAD_SAFE */
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
#ifdef MR_THREAD_SAFE
/*
** Memory fence operations.
*/
#if ( defined(MR_CLANG) || defined(MR_GNUC) ) && \
    ( defined(__i386__) || defined(__x86_64__) ) && \
    !defined(MR_AVOID_HANDWRITTEN_ASSEMBLER)

/*
** Guarantees that any stores executed before this fence are
** globally visible before those after this fence.
*/
#define MR_CPU_SFENCE \
    do { \
        __asm__ __volatile__("sfence"); \
    } while (0)

/*
** Guarantees that any loads executed before this fence are complete
** before any loads after this fence.
*/
#define MR_CPU_LFENCE \
    do { \
        __asm__ __volatile__("lfence"); \
    } while (0)

/*
** A combination of the above.
*/
#define MR_CPU_MFENCE \
    do { \
        __asm__ __volatile__("mfence"); \
    } while (0)
#elif MR_GNUC > 4 || (MR_GNUC == 4 && __GNUC_MINOR__ >= 1)

/*
** Our handwritten memory fences (above) are finer-grained than GCC's;
** GCC only implements a full fence.
*/
#define MR_CPU_MFENCE \
    do { \
        __sync_synchronize(); \
    } while (0)
#define MR_CPU_SFENCE MR_CPU_MFENCE
#define MR_CPU_LFENCE MR_CPU_MFENCE

#else

#error "Please implement memory fence operations for this compiler/architecture"

#endif
#endif /* MR_THREAD_SAFE */
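
/*
** An illustrative sketch of the usual publish/consume pattern for these
** fences: the producer orders its data stores before the flag store with
** MR_CPU_SFENCE, and the consumer orders the flag load before its data
** loads with MR_CPU_LFENCE. The 'data' and 'published' variables and the
** functions are hypothetical; the variables must be volatile.
**
**  Producer:
**      data = compute_result();
**      MR_CPU_SFENCE;
**      published = 1;
**
**  Consumer:
**      while (!published) {
**          MR_ATOMIC_PAUSE;
**      }
**      MR_CPU_LFENCE;
**      consume(data);
*/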
/*---------------------------------------------------------------------------*/
#ifdef MR_LL_PARALLEL_CONJ
/*
** Roll our own cheap user-space mutual exclusion locks. Blocking without
** spinning is not supported. Storage for these locks should be volatile.
**
** I expect these to be faster than pthread mutexes when threads are pinned
** and critical sections are short.
*/
typedef MR_Unsigned MR_Us_Lock;

#define MR_US_LOCK_INITIAL_VALUE (0)

#define MR_US_TRY_LOCK(x) \
    MR_compare_and_swap_uint(x, 0, 1)

#define MR_US_SPIN_LOCK(x) \
    do { \
        while (!MR_compare_and_swap_uint(x, 0, 1)) { \
            MR_ATOMIC_PAUSE; \
        } \
    } while (0)

#define MR_US_UNLOCK(x) \
    do { \
        MR_CPU_MFENCE; \
        *x = 0; \
    } while (0)
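
/*
** A minimal usage sketch for the locks above; 'queue_lock' and the critical
** section are hypothetical. Note the volatile storage, as required by the
** comment above.
**
**      static MR_THREADSAFE_VOLATILE MR_Us_Lock queue_lock =
**          MR_US_LOCK_INITIAL_VALUE;
**
**      MR_US_SPIN_LOCK(&queue_lock);
**      update_shared_queue();
**      MR_US_UNLOCK(&queue_lock);
*/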
/*
** Similar support for condition variables. Again, make sure that storage for
** these is declared as volatile.
**
** XXX: These are not atomic. A waiting thread will not see a change until
** some time after the signalling thread has signalled the condition. The
** same race can occur when clearing a condition. The order of memory
** operations is not guaranteed either.
*/
typedef MR_Unsigned MR_Us_Cond;

#define MR_US_COND_CLEAR(x) \
    do { \
        MR_CPU_MFENCE; \
        *x = 0; \
    } while (0)

#define MR_US_COND_SET(x) \
    do { \
        MR_CPU_MFENCE; \
        *x = 1; \
        MR_CPU_MFENCE; \
    } while (0)

#define MR_US_SPIN_COND(x) \
    do { \
        while (!(*x)) { \
            MR_ATOMIC_PAUSE; \
        } \
        MR_CPU_MFENCE; \
    } while (0)
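
/*
** A usage sketch for the condition variables above: one thread sets the
** condition once initialisation is complete, and other threads spin until
** they see it. The 'init_done' variable and the functions are hypothetical.
**
**      static MR_THREADSAFE_VOLATILE MR_Us_Cond init_done = 0;
**
**  Initialising thread:
**      set_up_shared_state();
**      MR_US_COND_SET(&init_done);
**
**  Waiting threads:
**      MR_US_SPIN_COND(&init_done);
**      use_shared_state();
*/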
#endif /* MR_LL_PARALLEL_CONJ */
/*
** If we don't have definitions available for this compiler or architecture
** then we will get a link error in low-level .par grades. No other grades
** currently require any atomic ops.
*/
/*---------------------------------------------------------------------------*/
#if defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)
/*
** Declarations for profiling the parallel runtime.
*/
typedef struct {
    /*
    ** The total number of times this event occurred is implicitly the sum
    ** of the recorded and not_recorded counts.
    */
    volatile MR_Unsigned    MR_stat_count_recorded;
    volatile MR_Unsigned    MR_stat_count_not_recorded;

    /*
    ** Atomic instructions are used to update these fields, and the fields
    ** must be 64 bits wide to contain their valid ranges of values.
    ** However, a 32 bit machine cannot (usually) do atomic operations on
    ** 64 bit data. Therefore, if words are narrower than 64 bits, we
    ** protect these two fields with a lock.
    **
    ** The sum of squares is used to calculate the variance and standard
    ** deviation.
    */
#if MR_LOW_TAG_BITS >= 3
    volatile MR_Integer     MR_stat_sum;
    volatile MR_Unsigned    MR_stat_sum_squares;
#else
    MR_Us_Lock              MR_stat_sums_lock;
    MR_int_least64_t        MR_stat_sum;
    MR_uint_least64_t       MR_stat_sum_squares;
#endif
} MR_Stats;
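
/*
** An illustrative sketch (not part of the original header) of how the sums
** above yield the mean, variance and standard deviation, using the identity
** Var(X) = E(X^2) - E(X)^2. This hypothetical reader assumes a pointer
** 'stats' to an MR_Stats, requires <math.h> for sqrt, and ignores the
** locking needed on 32 bit machines.
**
**      MR_Unsigned n = stats->MR_stat_count_recorded;
**      double mean = (double) stats->MR_stat_sum / n;
**      double variance =
**          (double) stats->MR_stat_sum_squares / n - mean * mean;
**      double std_dev = sqrt(variance);
*/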
typedef struct {
    MR_uint_least64_t       MR_timer_time;
    MR_Unsigned             MR_timer_processor_id;
} MR_Timer;
/*
** The number of CPU clock cycles per second; e.g. a 1GHz CPU will have a
** value of 10^9. This is zero if unknown.
** This value is only available after MR_do_cpu_feature_detection() has
** been called.
*/
extern MR_uint_least64_t MR_cpu_cycles_per_sec;
/*
** Do CPU feature detection. This is necessary for profiling parallel code
** execution and for the ThreadScope code.
** On i386 and x86_64 machines this uses CPUID to determine whether the
** RDTSCP instruction is available and not prohibited by the OS.
** This function is idempotent.
*/
extern void
MR_do_cpu_feature_detection(void);
/*
** Start and initialize a timer structure.
*/
extern void
MR_profiling_start_timer(MR_Timer *timer);
/*
** Stop the timer and update stats with the results.
*/
extern void
MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats);
/*
** Return true iff the TSC works and MR_cpu_cycles_per_sec is nonzero.
*/
extern MR_bool
MR_tsc_is_sensible(void);
/*
** Read the CPU's TSC. This is currently only implemented for i386 and
** x86_64 systems. It returns 0 when support is not available.
*/
extern MR_uint_least64_t
MR_read_cpu_tsc(void);
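
/*
** A hypothetical sketch of how these functions fit together: detect CPU
** features once, then, if the TSC is usable, measure an interval in cycles
** and convert it to seconds. 'do_work' is a placeholder, and printf
** requires <stdio.h>.
**
**      MR_uint_least64_t start, stop;
**
**      MR_do_cpu_feature_detection();
**      if (MR_tsc_is_sensible()) {
**          start = MR_read_cpu_tsc();
**          do_work();
**          stop = MR_read_cpu_tsc();
**          printf("%f seconds\n",
**              (double) (stop - start) / MR_cpu_cycles_per_sec);
**      }
*/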
#endif /* MR_PROFILE_PARALLEL_EXECUTION_SUPPORT */
/*---------------------------------------------------------------------------*/
#endif /* not MERCURY_ATOMIC_OPS_H */