used. Mercury's ThreadScope support will now use gettimeofday() by
default, but use of the TSC may be enabled (a general sketch of
gettimeofday()-based timestamping follows this log).
Note that on Linux, gettimeofday() does not always make a system call.

runtime/mercury_threadscope.[ch]:
    Add support for measuring time with gettimeofday().
    Use gettimeofday() to measure time by default.

runtime/mercury_atomic_ops.[ch]:
    Add a new function, MR_tsc_is_sensible(). It returns true if the TSC can
    (as far as the RTS can detect) be used.
    Fix trailing whitespace.

runtime/mercury_wrapper.c:
    Add a new runtime option, --threadscope-use-tsc.
    When specified, this option allows ThreadScope to use the CPU's TSC to
    measure time.

doc/userguide.texi:
    Document the --threadscope-use-tsc option. This documentation is
    commented out.
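
The gettimeofday() changes themselves live in runtime/mercury_threadscope.c,
which is not shown here; the file below only adds the TSC-side helpers. As a
rough, illustrative sketch (not the actual Mercury code; the helper name is
made up), gettimeofday()-based timestamping looks like this:

    #include <sys/time.h>
    #include <stdint.h>

    /*
    ** Illustrative only: return a timestamp in nanoseconds using
    ** gettimeofday(). gettimeofday() has microsecond resolution, so the low
    ** decimal digits are always zero; on Linux it is usually serviced from
    ** the vDSO, which avoids a real system call.
    */
    static uint64_t
    example_timestamp_ns(void)
    {
        struct timeval tv;

        gettimeofday(&tv, NULL);
        return (uint64_t) tv.tv_sec * 1000000000 + (uint64_t) tv.tv_usec * 1000;
    }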
/*
** vim:ts=4 sw=4 expandtab
*/
/*
** Copyright (C) 2007, 2009-2011 The University of Melbourne.
** This file may only be copied under the terms of the GNU Library General
** Public License - see the file COPYING.LIB in the Mercury distribution.
*/

/*
** mercury_atomic_ops.c
*/

#include "mercury_imp.h"
#include "mercury_atomic_ops.h"

/*---------------------------------------------------------------------------*/

#if defined(MR_LL_PARALLEL_CONJ)

/*
** Definitions for the atomic functions declared `extern inline'.
*/

MR_OUTLINE_DEFN(
    MR_bool
    MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
        MR_Integer new_val)
,
    {
        MR_COMPARE_AND_SWAP_WORD_BODY;
    }
)

MR_OUTLINE_DEFN(
    MR_bool
    MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
        MR_Unsigned new_val)
,
    {
        MR_COMPARE_AND_SWAP_WORD_BODY;
    }
)

MR_OUTLINE_DEFN(
    MR_Integer
    MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend)
,
    {
        MR_ATOMIC_ADD_AND_FETCH_INT_BODY;
    }
)

MR_OUTLINE_DEFN(
    MR_Unsigned
    MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
,
    {
        MR_ATOMIC_ADD_AND_FETCH_UINT_BODY;
    }
)

MR_OUTLINE_DEFN(
    void
    MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
,
    {
        MR_ATOMIC_ADD_INT_BODY;
    }
)

MR_OUTLINE_DEFN(
    void
    MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
,
    {
        MR_ATOMIC_ADD_UINT_BODY;
    }
)

MR_OUTLINE_DEFN(
    void
    MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
,
    {
        MR_ATOMIC_SUB_INT_BODY;
    }
)

MR_OUTLINE_DEFN(
    void
    MR_atomic_inc_int(volatile MR_Integer *addr)
,
    {
        MR_ATOMIC_INC_INT_BODY;
    }
)

MR_OUTLINE_DEFN(
    void
    MR_atomic_inc_uint(volatile MR_Unsigned *addr)
,
    {
        MR_ATOMIC_INC_UINT_BODY;
    }
)

MR_OUTLINE_DEFN(
    void
    MR_atomic_dec_int(volatile MR_Integer *addr)
,
    {
        MR_ATOMIC_DEC_INT_BODY;
    }
)

MR_OUTLINE_DEFN(
    MR_bool
    MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr)
,
    {
        MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY;
    }
)

MR_OUTLINE_DEFN(
    MR_bool
    MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr)
,
    {
        MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY;
    }
)

#endif /* MR_LL_PARALLEL_CONJ */

/*---------------------------------------------------------------------------*/

#if defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)

/*
** Profiling of the parallel runtime.
*/

#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
/*
** True if the RDTSCP and RDTSC instructions are available respectively.
*/
static MR_bool MR_rdtscp_is_available = MR_FALSE;
static MR_bool MR_rdtsc_is_available = MR_FALSE;
#endif

MR_uint_least64_t MR_cpu_cycles_per_sec = 0;

#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))

/* Set this to 1 to enable some printfs below */
#define MR_DEBUG_CPU_FEATURE_DETECTION 0

/*
** cpuid, rdtscp and rdtsc are i386/x86_64 instructions.
*/
static __inline__ void
MR_cpuid(MR_Unsigned code, MR_Unsigned sub_code,
    MR_Unsigned *a, MR_Unsigned *b, MR_Unsigned *c, MR_Unsigned *d);

static __inline__ void
MR_rdtscp(MR_uint_least64_t *tsc, MR_Unsigned *processor_id);

static __inline__ void
MR_rdtsc(MR_uint_least64_t *tsc);

/*
** Return zero if parsing failed, otherwise return the number of cycles per
** second.
*/
static MR_uint_least64_t
parse_freq_from_x86_brand_string(char *string);

#endif /* MR_GNUC && (__i386__ || __x86_64__) */

void
MR_do_cpu_feature_detection(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_Unsigned a, b, c, d;
    MR_Unsigned eflags, old_eflags;
    MR_Unsigned maximum_extended_page;
    MR_Unsigned extended_family, basic_family, family;
    MR_Unsigned extended_model, model;

    /*
    ** Check for the CPUID instruction. CPUID is supported if we can flip bit
    ** 21 in the CPU's EFLAGS register. The assembly below is written in a
    ** subset of i386 and x86_64 assembly. To read and write EFLAGS we have
    ** to go via the C stack.
    */
    __asm__ ("pushf; pop %0"
        :"=r"(eflags));
    old_eflags = eflags;
    /* Flip bit 21 */
    eflags ^= (1 << 21);
    __asm__ ("push %0; popf; pushf; pop %0;"
        :"=r"(eflags)
        :"0"(eflags));

    /*
    ** Test to see if our change held. We don't restore eflags; a change to
    ** the ID bit has no effect.
    */
    if (eflags == old_eflags) {
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "This CPU doesn't support the CPUID instruction.\n");
#endif
        return;
    }

    /*
    ** CPUID 0 gives the maximum basic CPUID page in EAX. Basic pages go up to
    ** but not including 0x40000000.
    */
    MR_cpuid(0, 0, &a, &b, &c, &d);
    if (a < 1) {
        return;
    }

    /* CPUID 1 gives type, family, model and stepping information in EAX. */
    MR_cpuid(1, 0, &a, &b, &c, &d);

    /* Bit 4 in EDX is high if RDTSC is available */
    if (d & (1 << 4)) {
        MR_rdtsc_is_available = MR_TRUE;
    }

    /*
    ** BTW: Intel can't count:
    **
    ** http://www.pagetable.com/?p=18
    ** http://www.codinghorror.com/blog/archives/000364.html
    **
    ** 486 (1989): family 4
    ** Pentium (1993): family 5
    ** Pentium Pro (1995): family 6, models 0 and 1
    ** Pentium 2 (1997): family 6, models 3, 5 and 6
    ** Pentium 3 (2000): family 6, models 7, 8, 10, 11
    ** Itanium (2001): family 7
    ** Pentium 4 (2000): family 15/0
    ** Itanium 2 (2002): family 15/1 and 15/2
    ** Pentium D: family 15/4
    ** Pentium M (2003): family 6, models 9 and 13
    ** Core (2006): family 6, model 14
    ** Core 2 (2006): family 6, model 15
    ** i7: family 6, model 26
    ** Atom: family 6, model 28
    **
    ** This list is incomplete; it doesn't cover AMD or any other brand of x86
    ** processor, and it probably doesn't cover all post-Pentium Intel
    ** processors.
    */

    /* bits 8-11 (first bit (LSB) is bit 0) */
    basic_family = (a & 0x00000F00) >> 8;
    if (basic_family == 0x0F) {
        /* bits 20-27 */
        extended_family = (a & 0x0FF00000) >> 20;
        family = basic_family + extended_family;
    } else {
        family = basic_family;
    }
    /*
    ** I'm not using the model value, but I'll leave the code here in case we
    ** have a reason to use it in the future.
    */
    /* bits 4-7 */
    model = (a & 0x000000F0) >> 4;
    if ((basic_family == 0x0F) || (basic_family == 0x06)) {
        /* bits 16-19 */
        extended_model = (a & 0x000F0000) >> 16;
        model += (extended_model << 4);
    }
#if MR_DEBUG_CPU_FEATURE_DETECTION
    fprintf(stderr, "This is family %d and model %d\n", family, model);
#endif

    /* Now check for P3 or higher since they have the extended pages */
    if (family < 6) {
        /* This is a 486 or Pentium */
        return;
    }
    /*
    ** I could bail out here if this was a pentium 3, but there's a more
    ** reliable check for extended CPUID support below that should work on AMD
    ** chips as well, if I knew all the model numbers for all family 6
    ** processors and knew if they honoured extended CPUID.
    */

    /*
    ** Extended CPUID 0x80000000.
    **
    ** EAX contains the maximum extended CPUID node.
    */
    MR_cpuid(0x80000000, 0, &a, &b, &c, &d);
    if ((a & 0x80000000) == 0) {
        /*
        ** Extended CPUID is not supported.
        ** Note that this check is still not as reliable as I'd like. If it
        ** succeeds I'm not confident that the processor definitely implements
        ** extended CPUID.
        */
        return;
    }
    maximum_extended_page = a;
#if MR_DEBUG_CPU_FEATURE_DETECTION
    fprintf(stderr, "Maximum extended CPUID page: 0x%x\n", maximum_extended_page);
#endif

    /*
    ** Extended CPUID 0x80000001
    **
    ** If EDX bit 27 is set the RDTSCP instruction is available.
    */
    if (maximum_extended_page >= 0x80000001) {
        MR_cpuid(0x80000001, 0, &a, &b, &c, &d);
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "CPUID 0x80000001 EDX: 0x%x\n", d);
#endif
        if ((d & (1 << 27))) {
            /*
            ** This processor supports RDTSCP.
            */
#if MR_DEBUG_CPU_FEATURE_DETECTION
            fprintf(stderr, "RDTSCP is available\n");
#endif
            MR_rdtscp_is_available = MR_TRUE;
        }
    }

    if (maximum_extended_page >= 0x80000004) {
        /*
        ** 3 CPUID pages, 4 return registers each, containing 4 bytes each,
        ** plus a null byte. Intel say they include their own null byte, but
        ** for the cost of a single byte I feel safer using our own.
        */
#define CPUID_BRAND_STRING_SIZE (3*4*4 + 1)
        char buff[CPUID_BRAND_STRING_SIZE];
        unsigned int page;
        unsigned int byte;
        unsigned int shift;

        /*
        ** This processor supports the brand string, from which we can
        ** try to extract the clock speed. This algorithm is described
        ** in the Intel Instruction Set Reference, Volume 2B, Chapter 3,
        ** pages 207-208, in particular the flow chart in Figure 3-10.
        ** This does not work on AMD processors since they don't include
        ** the clock speed in the brand string.
        */
        for (page = 0; page < 3; page++) {
            MR_cpuid(page + 0x80000002, 0, &a, &b, &c, &d);
#if MR_DEBUG_CPU_FEATURE_DETECTION
            fprintf(stderr, "CPUID page: 0x%.8x, eax: 0x%.8x, ebx: 0x%.8x, ecx: 0x%.8x, edx: 0x%.8x\n",
                page + 0x80000002, a, b, c, d);
#endif
            for (byte = 0; byte < 4; byte++) {
                shift = byte * 8;
                buff[page*4*4 + 0 + byte] = (char)(0xFF & (a >> shift));
                buff[page*4*4 + 4 + byte] = (char)(0xFF & (b >> shift));
                buff[page*4*4 + 8 + byte] = (char)(0xFF & (c >> shift));
                buff[page*4*4 + 12 + byte] = (char)(0xFF & (d >> shift));
            }
        }
        /* Add a null byte */
        buff[CPUID_BRAND_STRING_SIZE - 1] = 0;
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "CPUID Brand string: %s\n", buff);
#endif

        MR_cpu_cycles_per_sec = parse_freq_from_x86_brand_string(buff);
#if MR_DEBUG_CPU_FEATURE_DETECTION
        if (MR_cpu_cycles_per_sec == 0) {
            fprintf(stderr, "Failed to detect cycles per second, "
                "you can probably blame AMD for this.\n");
        } else {
            fprintf(stderr, "Cycles per second: %ld\n", MR_cpu_cycles_per_sec);
        }
#endif
    }
#endif /* MR_GNUC && (__i386__ || __x86_64__) */
}

#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
static MR_uint_least64_t
parse_freq_from_x86_brand_string(char *string)
{
    unsigned int brand_string_len;
    /* i must be signed: the search loop below counts down and tests i >= 0 */
    int i;
    double multiplier;
    int freq_index = -1;

    brand_string_len = strlen(string);

    /*
    ** There will be at least five characters if we can parse this, three
    ** for the '?Hz' suffix, at least one for the units, plus a space at
    ** the beginning of the number.
    */
    if (!(brand_string_len > 5))
        return 0;

    if (!((string[brand_string_len - 1] == 'z') &&
        (string[brand_string_len - 2] == 'H'))) {
        return 0;
    }

    switch (string[brand_string_len - 3]) {
        case 'M':
            multiplier = 1000000.0;
            break;
        case 'G':
            multiplier = 1000000000.0;
            break;
        case 'T':
            /*
            ** Yes, this is defined in the specification; Intel have some
            ** strong ambitions regarding Moore's law. :-) We include it here
            ** to conform with the standard.
            */
            multiplier = 1000000000000.0;
            break;
        default:
            return 0;
    }

    /* Search for the beginning of the digits. */
    for (i = brand_string_len - 4; i >= 0; i--) {
        if (string[i] == ' ') {
            freq_index = i + 1;
            break;
        }
    }
    if (freq_index == -1) {
        /* We didn't find the beginning of the frequency */
        return 0;
    }

    /*
    ** If strtod fails it returns zero, so if we fail to parse a number here,
    ** we'll return zero which our caller understands as a parsing failure.
    */
    return (MR_uint_least64_t)(strtod(&string[freq_index], NULL) * multiplier);
}
#endif /* MR_GNUC && (__i386__ || __x86_64__) */

void
MR_profiling_start_timer(MR_Timer *timer)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    /*
    ** If we don't have enough data to fill in all the fields of this
    ** structure, we leave them alone; we won't check them later without
    ** first checking MR_rdtsc{p}_is_available.
    */
    if (MR_rdtscp_is_available) {
        MR_rdtscp(&(timer->MR_timer_time), &(timer->MR_timer_processor_id));
    } else if (MR_rdtsc_is_available) {
        MR_rdtsc(&(timer->MR_timer_time));
    }
#endif
}

void
MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_Timer now;
    MR_int_least64_t duration;
    MR_uint_least64_t duration_squared;

    if (MR_rdtscp_is_available) {
        MR_rdtscp(&(now.MR_timer_time), &(now.MR_timer_processor_id));
        if (timer->MR_timer_processor_id == now.MR_timer_processor_id) {
            duration = now.MR_timer_time - timer->MR_timer_time;
            duration_squared = duration * duration;
            MR_atomic_inc_uint(&(stats->MR_stat_count_recorded));
#if MR_LOW_TAG_BITS >= 3
            MR_atomic_add_int(&(stats->MR_stat_sum), duration);
            MR_atomic_add_uint(&(stats->MR_stat_sum_squares), duration_squared);
#else
            MR_US_SPIN_LOCK(&(stats->MR_stat_sums_lock));
            stats->MR_stat_sum += duration;
            stats->MR_stat_sum_squares += duration_squared;
            MR_US_UNLOCK(&(stats->MR_stat_sums_lock));
#endif
        } else {
            MR_atomic_inc_uint(&(stats->MR_stat_count_not_recorded));
        }
    } else if (MR_rdtsc_is_available) {
        MR_rdtsc(&(now.MR_timer_time));
        duration = now.MR_timer_time - timer->MR_timer_time;
        duration_squared = duration * duration;
        MR_atomic_inc_uint(&(stats->MR_stat_count_recorded));
#if MR_LOW_TAG_BITS >= 3
        MR_atomic_add_int(&(stats->MR_stat_sum), duration);
        MR_atomic_add_uint(&(stats->MR_stat_sum_squares), duration_squared);
#else
        MR_US_SPIN_LOCK(&(stats->MR_stat_sums_lock));
        stats->MR_stat_sum += duration;
        stats->MR_stat_sum_squares += duration_squared;
        MR_US_UNLOCK(&(stats->MR_stat_sums_lock));
#endif
    }
#else /* not MR_GNUC && (__i386__ || __x86_64__) */
    /* No TSC support on this architecture or with this C compiler */
    MR_atomic_inc_int(&(stats->MR_stat_count_recorded));
#endif /* not MR_GNUC && (__i386__ || __x86_64__) */
}

/*
** Return MR_TRUE if the TSC works and MR_cpu_cycles_per_sec is nonzero.
*/
extern MR_bool
MR_tsc_is_sensible(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    return ((MR_rdtscp_is_available || MR_rdtsc_is_available) &&
        (MR_cpu_cycles_per_sec != 0));
#else
    return MR_FALSE;
#endif
}

MR_uint_least64_t
MR_read_cpu_tsc(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_uint_least64_t tsc;

    if (MR_rdtsc_is_available) {
        MR_rdtsc(&tsc);
    } else {
        tsc = 0;
    }
    return tsc;
#else /* not MR_GNUC && (__i386__ || __x86_64__) */
    return 0;
#endif /* not MR_GNUC && (__i386__ || __x86_64__) */
}

/*
** It's convenient that this instruction is the same on both i386 and x86_64.
*/
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))

static __inline__ void
MR_cpuid(MR_Unsigned code, MR_Unsigned sub_code,
    MR_Unsigned *a, MR_Unsigned *b, MR_Unsigned *c, MR_Unsigned *d)
{
#ifdef __x86_64__
    __asm__("cpuid"
        : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
        : "0"(code), "2"(sub_code));
#elif defined(__i386__)
    /*
    ** i386 is more register starved; in particular, we can't use ebx in
    ** position independent code, and we can't move ebx into another
    ** general purpose register: between register pinning, PIC, the
    ** stack and frame pointers, and the other registers used by CPUID,
    ** there are literally no general purpose registers left on i386.
    */
__asm__("pushl %%ebx; \
|
|
cpuid; \
|
|
movl %%ebx, %1; \
|
|
popl %%ebx;"
|
|
: "=a"(*a), "=m"(*b), "=c"(*c), "=d"(*d)
|
|
: "0"(code), "2"(sub_code)
|
|
: "memory");
|
|
#endif
|
|
}
|
|
|
|
static __inline__ void
|
|
MR_rdtscp(MR_uint_least64_t *tsc, MR_Unsigned *processor_id)
|
|
{
|
|
MR_Unsigned tsc_low;
|
|
MR_Unsigned tsc_high;
|
|
|
|
    /*
    ** On 64-bit systems the high 32 bits of RAX and RDX are zero-filled by
    ** rdtsc{p}.
    */
__asm__("rdtscp"
|
|
: "=a"(tsc_low), "=d"(tsc_high), "=c"(*processor_id));
|
|
|
|
*tsc = tsc_high;
|
|
*tsc = *tsc << 32;
|
|
*tsc |= tsc_low;
|
|
}
|
|
|
|
static __inline__ void
|
|
MR_rdtsc(MR_uint_least64_t *tsc)
|
|
{
|
|
MR_Unsigned tsc_low;
|
|
MR_Unsigned tsc_high;
|
|
|
|
__asm__("rdtsc"
|
|
: "=a"(tsc_low), "=d"(tsc_high));
|
|
|
|
*tsc = tsc_high;
|
|
*tsc = *tsc << 32;
|
|
*tsc |= tsc_low;
|
|
}
|
|
|
|
#endif /* MR_GNUC && (__i386__ || __x86_64__) */
|
|
|
|
#endif /* MR_PROFILE_PARALLEL_EXECUTION_SUPPORT */
|
|
|
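
Taken together, the feature-detection and TSC functions above could be
combined along these lines (an illustrative sketch only, not part of
mercury_atomic_ops.c; the wrapper name is made up, and it assumes a build
with MR_PROFILE_PARALLEL_EXECUTION_SUPPORT enabled):

    /*
    ** Illustrative only: time a piece of work in seconds using the TSC.
    ** MR_do_cpu_feature_detection() must run first; MR_tsc_is_sensible()
    ** is false when RDTSC is unavailable or when the clock frequency could
    ** not be parsed from the brand string (e.g. on AMD CPUs), in which case
    ** a caller would fall back to gettimeofday().
    */
    static double
    example_time_in_seconds(void (*work)(void))
    {
        MR_uint_least64_t start;
        MR_uint_least64_t stop;

        MR_do_cpu_feature_detection();
        if (!MR_tsc_is_sensible()) {
            return -1.0;
        }
        start = MR_read_cpu_tsc();
        work();
        stop = MR_read_cpu_tsc();
        return (double) (stop - start) / (double) MR_cpu_cycles_per_sec;
    }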