/*
** vim:ts=4 sw=4 expandtab
*/
/*
** Copyright (C) 2007, 2009-2011 The University of Melbourne.
** This file may only be copied under the terms of the GNU Library General
** Public License - see the file COPYING.LIB in the Mercury distribution.
*/

/*
** mercury_atomic_ops.c
**
** Out-of-line definitions of the atomic operations, plus (when
** MR_PROFILE_PARALLEL_EXECUTION_SUPPORT is defined) x86 CPU feature
** detection and time stamp counter (TSC) based timers used for profiling
** the parallel runtime.
*/

#include "mercury_imp.h"
#include "mercury_atomic_ops.h"

/*---------------------------------------------------------------------------*/

#if defined(MR_LL_PARALLEL_CONJ)

/*
** Definitions for the atomic functions declared `extern inline'.
** Each body is supplied by the correspondingly named *_BODY macro.
*/

MR_OUTLINE_DEFN(
    MR_bool
    MR_compare_and_swap_int(volatile MR_Integer *addr, MR_Integer old,
        MR_Integer new_val)
,
    {
        MR_COMPARE_AND_SWAP_WORD_BODY;
    }
)

MR_OUTLINE_DEFN(
    MR_bool
    MR_compare_and_swap_uint(volatile MR_Unsigned *addr, MR_Unsigned old,
        MR_Unsigned new_val)
,
    {
        MR_COMPARE_AND_SWAP_WORD_BODY;
    }
)

MR_OUTLINE_DEFN(
    MR_Integer
    MR_atomic_add_and_fetch_int(volatile MR_Integer *addr, MR_Integer addend)
,
    {
        MR_ATOMIC_ADD_AND_FETCH_INT_BODY;
    }
)

MR_OUTLINE_DEFN(
    MR_Unsigned
    MR_atomic_add_and_fetch_uint(volatile MR_Unsigned *addr,
        MR_Unsigned addend)
,
    {
        MR_ATOMIC_ADD_AND_FETCH_UINT_BODY;
    }
)

MR_OUTLINE_DEFN(
    void
    MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
,
    {
        MR_ATOMIC_ADD_INT_BODY;
    }
)

MR_OUTLINE_DEFN(
    void
    MR_atomic_add_uint(volatile MR_Unsigned *addr, MR_Unsigned addend)
,
    {
        MR_ATOMIC_ADD_UINT_BODY;
    }
)

MR_OUTLINE_DEFN(
    void
    MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
,
    {
        MR_ATOMIC_SUB_INT_BODY;
    }
)

MR_OUTLINE_DEFN(
    void
    MR_atomic_inc_int(volatile MR_Integer *addr)
,
    {
        MR_ATOMIC_INC_INT_BODY;
    }
)

MR_OUTLINE_DEFN(
    void
    MR_atomic_inc_uint(volatile MR_Unsigned *addr)
,
    {
        MR_ATOMIC_INC_UINT_BODY;
    }
)

MR_OUTLINE_DEFN(
    void
    MR_atomic_dec_int(volatile MR_Integer *addr)
,
    {
        MR_ATOMIC_DEC_INT_BODY;
    }
)

MR_OUTLINE_DEFN(
    MR_bool
    MR_atomic_dec_and_is_zero_int(volatile MR_Integer *addr)
,
    {
        MR_ATOMIC_DEC_AND_IS_ZERO_INT_BODY;
    }
)

MR_OUTLINE_DEFN(
    MR_bool
    MR_atomic_dec_and_is_zero_uint(volatile MR_Unsigned *addr)
,
    {
        MR_ATOMIC_DEC_AND_IS_ZERO_UINT_BODY;
    }
)

#endif /* MR_LL_PARALLEL_CONJ */

/*---------------------------------------------------------------------------*/

#if defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)

/*
** Profiling of the parallel runtime.
*/

#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
/*
** True if the RDTSCP and RDTSC instructions are available respectively.
** Both are set (at most once) by MR_do_cpu_feature_detection().
*/
static MR_bool  MR_rdtscp_is_available = MR_FALSE;
static MR_bool  MR_rdtsc_is_available = MR_FALSE;
#endif

/*
** The detected clock speed in cycles per second, or zero if it could not
** be determined.
*/
MR_uint_least64_t MR_cpu_cycles_per_sec = 0;

#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))

/* Set this to 1 to enable some printfs below */
#define MR_DEBUG_CPU_FEATURE_DETECTION 0

/*
** cpuid, rdtscp and rdtsc are i386/x86_64 instructions.
*/
static __inline__ void
MR_cpuid(MR_Unsigned code, MR_Unsigned sub_code,
    MR_Unsigned *a, MR_Unsigned *b, MR_Unsigned *c, MR_Unsigned *d);

static __inline__ void
MR_rdtscp(MR_uint_least64_t *tsc, MR_Unsigned *processor_id);

static __inline__ void
MR_rdtsc(MR_uint_least64_t *tsc);

/*
** Return zero if parsing failed, otherwise return the number of cycles per
** second.
*/
static MR_uint_least64_t
parse_freq_from_x86_brand_string(const char *string);

#endif /* MR_GNUC && (__i386__ || __x86_64__) */

/*
** Detect which timing-related CPU features are available.
**
** On x86/x86_64 with GCC this sets MR_rdtsc_is_available,
** MR_rdtscp_is_available and MR_cpu_cycles_per_sec. Any of them may be left
** at its initial value if the corresponding feature cannot be detected.
** On other platforms this function does nothing.
*/
void
MR_do_cpu_feature_detection(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_Unsigned     a, b, c, d;
    MR_Unsigned     eflags, old_eflags;
    MR_Unsigned     maximum_extended_page;
    MR_Unsigned     extended_family, basic_family, family;
    MR_Unsigned     extended_model, model;

    /*
    ** Check for the CPUID instruction. CPUID is supported if we can flip bit
    ** 21 in the CPU's EFLAGS register. The assembly below is written in a
    ** subset of i386 and x86_64 assembly. To read and write EFLAGS we have
    ** to go via the C stack.
    */
    __asm__ ("pushf; pop %0"
            :"=r"(eflags));
    old_eflags = eflags;
    /* Flip bit 21 */
    eflags ^= (1 << 21);
    __asm__ ("push %0; popf; pushf; pop %0;"
            :"=r"(eflags)
            :"0"(eflags));

    /*
    ** Test to see if our change held. We don't restore eflags, a change to
    ** the ID bit has no effect.
    */
    if (eflags == old_eflags) {
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr,
            "This CPU doesn't support the CPUID instruction.\n");
#endif
        return;
    }

    /*
    ** CPUID 0 gives the maximum basic CPUID page in EAX. Basic pages go up to
    ** but not including 0x40000000.
    */
    MR_cpuid(0, 0, &a, &b, &c, &d);
    if (a < 1) {
        return;
    }

    /* CPUID 1 gives type, family, model and stepping information in EAX. */
    MR_cpuid(1, 0, &a, &b, &c, &d);

    /* Bit 4 in EDX is high if RDTSC is available */
    if (d & (1 << 4)) {
        MR_rdtsc_is_available = MR_TRUE;
    }

    /*
    ** BTW: Intel can't count:
    **
    ** http://www.pagetable.com/?p=18
    ** http://www.codinghorror.com/blog/archives/000364.html
    **
    ** 486 (1989): family 4
    ** Pentium (1993): family 5
    ** Pentium Pro (1995): family 6, models 0 and 1
    ** Pentium 2 (1997): family 6, models 3, 5 and 6
    ** Pentium 3 (2000): family 6, models 7, 8, 10, 11
    ** Itanium (2001): family 7
    ** Pentium 4 (2000): family 15/0
    ** Itanium 2 (2002): family 15/1 and 15/2
    ** Pentium D: family 15/4
    ** Pentium M (2003): family 6, models 9 and 13
    ** Core (2006): family 6, model 14
    ** Core 2 (2006): family 6, model 15
    ** i7: family 6, model 26
    ** Atom: family 6, model 28
    **
    ** This list is incomplete, it doesn't cover AMD or any other brand of x86
    ** processor, and it probably doesn't cover all post-pentium Intel
    ** processors.
    */

    /* bits 8-11 (first bit (LSB) is bit 0) */
    basic_family = (a & 0x00000F00) >> 8;
    if (basic_family == 0x0F) {
        /* bits 20-27 */
        extended_family = (a & 0x0FF00000) >> 20;
        family = basic_family + extended_family;
    } else {
        family = basic_family;
    }

    /*
    ** I'm not using the model value but I'll leave the code here in case we
    ** have a reason to use it in the future.
    */
    /* bits 4-7 */
    model = (a & 0x000000F0) >> 4;
    if ((basic_family == 0x0F) || (basic_family == 0x06)) {
        /* bits 16-19 */
        extended_model = (a & 0x000F0000) >> 16;
        model += (extended_model << 4);
    }
#if MR_DEBUG_CPU_FEATURE_DETECTION
    /* MR_Unsigned may be wider than int, so convert to match %d. */
    fprintf(stderr, "This is family %d and model %d\n",
        (int) family, (int) model);
#endif

    /* Now check for P3 or higher since they have the extended pages */
    if (family < 6) {
        /* This is a 486 or Pentium */
        return;
    }
    /*
    ** I could bail out here if this was a pentium 3, but there's a more
    ** reliable check for extended CPUID support below that should work on AMD
    ** chips as well, if I knew all the model numbers for all family 6
    ** processors and knew if they honoured extended CPUID.
    */

    /*
    ** Extended CPUID 0x80000000.
    **
    ** EAX contains the maximum extended CPUID node.
    */
    MR_cpuid(0x80000000, 0, &a, &b, &c, &d);
    if ((a & 0x80000000) == 0) {
        /*
        ** Extended CPUID is not supported.
        ** Note that this check is still not as reliable as I'd like. If it
        ** succeeds I'm not confident that the processor definitely implements
        ** extended CPUID.
        */
        return;
    }
    maximum_extended_page = a;
#if MR_DEBUG_CPU_FEATURE_DETECTION
    fprintf(stderr, "Maximum extended CPUID page: 0x%x\n",
        (unsigned int) maximum_extended_page);
#endif

    /*
    ** Extended CPUID 0x80000001
    **
    ** If EDX bit 27 is set the RDTSCP instruction is available.
    */
    if (maximum_extended_page >= 0x80000001) {
        MR_cpuid(0x80000001, 0, &a, &b, &c, &d);
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "CPUID 0x80000001 EDX: 0x%x\n", (unsigned int) d);
#endif
        if ((d & (1 << 27))) {
            /*
            ** This processor supports RDTSCP.
            */
#if MR_DEBUG_CPU_FEATURE_DETECTION
            fprintf(stderr, "RDTSCP is available\n");
#endif
            MR_rdtscp_is_available = MR_TRUE;
        }
    }

    if (maximum_extended_page >= 0x80000004) {
        /*
        ** 3 CPUID pages, 4 return registers each, containing 4 bytes each,
        ** plus a null byte. Intel say they include their own null byte, but
        ** for the cost of a single byte I feel safer using our own.
        */
        #define CPUID_BRAND_STRING_SIZE (3*4*4 + 1)
        char            buff[CPUID_BRAND_STRING_SIZE];
        unsigned int    page;
        unsigned int    byte;
        unsigned int    shift;

        /*
        ** This processor supports the brand string from which we can
        ** try to extract the clock speed. This algorithm is described
        ** in the Intel Instruction Set Reference, Volume 2B, Chapter 3,
        ** Pages 207-208, In particular the flow chart in figure 3-10.
        ** This does not work on AMD processors since they don't include
        ** the clock speed in the brand string.
        */
        for (page = 0; page < 3; page++) {
            MR_cpuid(page + 0x80000002, 0, &a, &b, &c, &d);
#if MR_DEBUG_CPU_FEATURE_DETECTION
            fprintf(stderr,
                "CPUID page: 0x%.8x, eax: 0x%.8x, ebx: 0x%.8x, ecx: 0x%.8x, edx: 0x%.8x\n",
                page + 0x80000002, (unsigned int) a, (unsigned int) b,
                (unsigned int) c, (unsigned int) d);
#endif
            /*
            ** Each register holds four characters, least significant
            ** byte first; the registers are laid out in the order
            ** EAX, EBX, ECX, EDX within each 16 byte page.
            */
            for (byte = 0; byte < 4; byte++) {
                shift = byte * 8;
                buff[page*4*4 + 0 + byte] = (char)(0xFF & (a >> shift));
                buff[page*4*4 + 4 + byte] = (char)(0xFF & (b >> shift));
                buff[page*4*4 + 8 + byte] = (char)(0xFF & (c >> shift));
                buff[page*4*4 + 12 + byte] = (char)(0xFF & (d >> shift));
            }
        }
        /* Add a null byte */
        buff[CPUID_BRAND_STRING_SIZE - 1] = 0;
#if MR_DEBUG_CPU_FEATURE_DETECTION
        fprintf(stderr, "CPUID Brand string: %s\n", buff);
#endif
        MR_cpu_cycles_per_sec = parse_freq_from_x86_brand_string(buff);
#if MR_DEBUG_CPU_FEATURE_DETECTION
        if (MR_cpu_cycles_per_sec == 0) {
            fprintf(stderr, "Failed to detect cycles per second "
                "you can probably blame AMD for this.\n");
        } else {
            /*
            ** Print via double: the value is at least 64 bits wide and
            ** need not fit in a long on i386.
            */
            fprintf(stderr, "Cycles per second: %.0f\n",
                (double) MR_cpu_cycles_per_sec);
        }
#endif
    }
#endif /* MR_GNUC && (__i386__ || __x86_64__) */
}

#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
/*
** Extract the clock frequency from an x86 brand string such as
** "... @ 2.67GHz". The string must end in "MHz", "GHz" or "THz", and the
** number is taken to start just after the last space before that suffix.
**
** Returns the frequency in cycles per second, or zero if the string could
** not be parsed.
*/
static MR_uint_least64_t
parse_freq_from_x86_brand_string(const char *string)
{
    size_t  brand_string_len;
    size_t  i;
    size_t  freq_index = 0;     /* zero means "not found" */
    double  multiplier;
    double  freq;

    brand_string_len = strlen(string);

    /*
    ** There will be at least five characters if we can parse this, three
    ** for the '?Hz' suffix, at least one for the units, plus a space at
    ** the beginning of the number.
    */
    if (brand_string_len < 5) {
        return 0;
    }

    if (!((string[brand_string_len - 1] == 'z') &&
          (string[brand_string_len - 2] == 'H')))
    {
        return 0;
    }

    switch (string[brand_string_len - 3]) {
        case 'M':
            multiplier = 1000000.0;
            break;
        case 'G':
            multiplier = 1000000000.0;
            break;
        case 'T':
            /*
            ** Yes, this is defined in the specification, Intel have some
            ** strong ambitions regarding Moore's law. :-) We include it here
            ** to conform with the standard.
            */
            multiplier = 1000000000000.0;
            break;
        default:
            return 0;
    }

    /*
    ** Search backwards for the beginning of the digits. `i' is one more
    ** than the index being examined, so the loop ends cleanly after
    ** index zero has been checked instead of relying on an unsigned
    ** counter becoming negative.
    */
    for (i = brand_string_len - 3; i > 0; i--) {
        if (string[i - 1] == ' ') {
            freq_index = i;
            break;
        }
    }
    if (freq_index == 0) {
        /* We didn't find the beginning of the frequency */
        return 0;
    }

    /*
    ** If strtod fails it returns zero, so if we fail to parse a number here,
    ** we'll return zero which our caller understands as a parsing failure.
    ** Values that cannot be represented in the result type (negative, NaN
    ** or too large) are also reported as a failure, since converting them
    ** would be undefined.
    */
    freq = strtod(&string[freq_index], NULL) * multiplier;
    if (!(freq > 0.0) || freq >= 18446744073709551616.0) {
        return 0;
    }
    return (MR_uint_least64_t) freq;
}
#endif /* MR_GNUC && (__i386__ || __x86_64__) */

/*
** Record the current time stamp (and, when RDTSCP is available, the
** processor id) in *timer. If neither RDTSCP nor RDTSC is available, or on
** other platforms, *timer is left untouched.
*/
void
MR_profiling_start_timer(MR_Timer *timer)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    /*
    ** If we don't have enough data to fill in all the fields of this structure
    ** we leave them alone, we won't check them later without checking
    ** MR_rdtsc{p}_is_available first.
    */
    if (MR_rdtscp_is_available) {
        MR_rdtscp(&(timer->MR_timer_time), &(timer->MR_timer_processor_id));
    } else if (MR_rdtsc_is_available) {
        MR_rdtsc(&(timer->MR_timer_time));
    }
#endif
}

#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
/*
** Add one measured duration (in TSC ticks) to *stats: bump the count of
** recorded events and accumulate the sum and the sum of squares.
*/
static void
record_timer_duration(MR_Stats *stats, MR_int_least64_t duration)
{
    MR_uint_least64_t   duration_squared;

    /*
    ** Square in unsigned arithmetic: a signed multiplication would
    ** overflow (undefined behaviour) for long durations.
    */
    duration_squared =
        (MR_uint_least64_t) duration * (MR_uint_least64_t) duration;

    MR_atomic_inc_uint(&(stats->MR_stat_count_recorded));
#if MR_LOW_TAG_BITS >= 3
    /*
    ** NOTE(review): presumably this condition identifies 64 bit words, so
    ** that the word-sized atomic operations can update the 64 bit sums
    ** directly -- confirm.
    */
    MR_atomic_add_int(&(stats->MR_stat_sum), duration);
    MR_atomic_add_uint(&(stats->MR_stat_sum_squares), duration_squared);
#else
    MR_US_SPIN_LOCK(&(stats->MR_stat_sums_lock));
    stats->MR_stat_sum += duration;
    stats->MR_stat_sum_squares += duration_squared;
    MR_US_UNLOCK(&(stats->MR_stat_sums_lock));
#endif
}
#endif /* MR_GNUC && (__i386__ || __x86_64__) */

/*
** Stop a timer started with MR_profiling_start_timer() and record the
** elapsed time in *stats.
**
** With RDTSCP, the measurement is only recorded if the start and stop
** readings report the same processor id; otherwise the not-recorded count
** is incremented instead. With only RDTSC the measurement is always
** recorded. Without TSC support only the event count is incremented.
*/
void
MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_Timer    now;

    if (MR_rdtscp_is_available) {
        MR_rdtscp(&(now.MR_timer_time), &(now.MR_timer_processor_id));
        if (timer->MR_timer_processor_id == now.MR_timer_processor_id) {
            record_timer_duration(stats,
                now.MR_timer_time - timer->MR_timer_time);
        } else {
            MR_atomic_inc_uint(&(stats->MR_stat_count_not_recorded));
        }
    } else if (MR_rdtsc_is_available) {
        MR_rdtsc(&(now.MR_timer_time));
        record_timer_duration(stats,
            now.MR_timer_time - timer->MR_timer_time);
    }
#else /* not MR_GNUC && (__i386__ || __x86_64__) */
    /* No TSC support on this architecture or with this C compiler */
    MR_atomic_inc_uint(&(stats->MR_stat_count_recorded));
#endif /* not MR_GNUC && (__i386__ || __x86_64__) */
}

/*
** The TSC works and MR_cpu_cycles_per_sec is nonzero.
*/
extern MR_bool
MR_tsc_is_sensible(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    return ((MR_rdtscp_is_available || MR_rdtsc_is_available) &&
        (MR_cpu_cycles_per_sec != 0));
#else
    return MR_FALSE;
#endif
}

/*
** Return the current value of the time stamp counter, or zero if RDTSC is
** not available.
*/
MR_uint_least64_t
MR_read_cpu_tsc(void)
{
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))
    MR_uint_least64_t   tsc;

    if (MR_rdtsc_is_available) {
        MR_rdtsc(&tsc);
    } else {
        tsc = 0;
    }
    return tsc;
#else /* not MR_GNUC && (__i386__ || __x86_64__) */
    return 0;
#endif /* not MR_GNUC && (__i386__ || __x86_64__) */
}

/*
** It's convenient that this instruction is the same on both i386 and x86_64
*/
#if defined(MR_GNUC) && (defined(__i386__) || defined(__x86_64__))

/*
** Execute CPUID with EAX = code and ECX = sub_code, storing the resulting
** EAX, EBX, ECX and EDX in *a, *b, *c and *d respectively.
*/
static __inline__ void
MR_cpuid(MR_Unsigned code, MR_Unsigned sub_code,
        MR_Unsigned *a, MR_Unsigned *b, MR_Unsigned *c, MR_Unsigned *d)
{
#ifdef __x86_64__
    __asm__("cpuid"
        : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
        : "0"(code), "2"(sub_code));
#elif defined(__i386__)
    /*
    ** i386 is more register starved, in particular we can't use ebx in
    ** position independent code. And we can't move ebx into another
    ** general purpose register, between register pinning, PIC, the
    ** stack and frame pointers and the other registers used by CPUID
    ** there are literally no general purpose registers left on i386.
    */
    __asm__("pushl %%ebx; "
            "cpuid; "
            "movl %%ebx, %1; "
            "popl %%ebx;"
        : "=a"(*a), "=m"(*b), "=c"(*c), "=d"(*d)
        : "0"(code), "2"(sub_code)
        : "memory");
#endif
}

/*
** Read the time stamp counter into *tsc and the processor id (the value
** RDTSCP leaves in ECX) into *processor_id.
*/
static __inline__ void
MR_rdtscp(MR_uint_least64_t *tsc, MR_Unsigned *processor_id)
{
    MR_Unsigned tsc_low;
    MR_Unsigned tsc_high;

    /*
    ** On 64bit systems the high 32 bits of RAX and RDX are 0 filled by
    ** rdtsc{p}
    **
    ** The asm must be volatile: it has outputs but no inputs, so otherwise
    ** the compiler would be free to merge or move successive reads.
    */
    __asm__ __volatile__("rdtscp"
           : "=a"(tsc_low), "=d"(tsc_high), "=c"(*processor_id));

    *tsc = tsc_high;
    *tsc = *tsc << 32;
    *tsc |= tsc_low;
}

/*
** Read the time stamp counter into *tsc.
*/
static __inline__ void
MR_rdtsc(MR_uint_least64_t *tsc)
{
    MR_Unsigned tsc_low;
    MR_Unsigned tsc_high;

    /* Volatile for the same reason as in MR_rdtscp. */
    __asm__ __volatile__("rdtsc"
           : "=a"(tsc_low), "=d"(tsc_high));

    *tsc = tsc_high;
    *tsc = *tsc << 32;
    *tsc |= tsc_low;
}

#endif /* MR_GNUC && (__i386__ || __x86_64__) */

#endif /* MR_PROFILE_PARALLEL_EXECUTION_SUPPORT */