Improve thread pinning:

+ Now pins threads intelligently on SMT systems by balancing threads among cores. + Performs fewer migrations when pinning threads (if a thread's current CPU is a valid CPU for pinning, then it is not migrated). + Handle cases where the user requests more threads than available CPUs. + Handle cases where the process is restricted to a subset of CPUs by its environment (for instance, Linux cpuset(7)). This is largely made possible by the hwloc library (http://www.open-mpi.org/projects/hwloc/). However, hwloc is not required and the runtime system will fall back to sched_setaffinity(); it will simply be less intelligent WRT SMT. runtime/mercury_context.h: runtime/mercury_context.c: Do thread pinning either via hwloc or sched_setaffinity. Previously only sched_setaffinity was used. Update the thread-pinning algorithm as described above. Include the general thread pinning code only if MR_HAVE_THREAD_PINNING is defined. Use a combination of sysconf and sched_getaffinity to detect the number of processors when hwloc isn't available. This makes the runtime compatible with Linux cpuset(7) when hwloc isn't available. configure.in: Mmake.common.in: Detect presence of the hwloc library. configure.in: Detect sched_getaffinity(). aclocal.m4: acinclude.m4: Move aclocal.m4 to acinclude.m4; the aclocal program will build aclocal.m4 and retrieve macros from the system and the contents of acinclude.m4. Mmakefile: Create a make target for aclocal.m4. runtime/Mmakefile: Link the runtime with libhwloc in low-level C parallel grades. Include CFLAGS for libhwloc. scripts/ml.in: Link programs and libraries with libhwloc in low-level C parallel grades. runtime/mercury_conf.h.in: Define MR_HAVE_HWLOC when it is available. Define MR_HAVE_SCHED_GETAFFINITY when it is available. runtime/mercury_conf_param.h: Define MR_HAVE_THREAD_PINNING if either hwloc or [sched_setaffinity and sched_getaffinity] are available. 
runtime/mercury_thread.c: runtime/mercury_wrapper.c: Only call MR_pin_thread and MR_pin_primordial_thread if MR_HAVE_THREAD_PINNING is defined. runtime/mercury_thread.h: runtime/mercury_context.h: Move the declaration of MR_primordial_thread_cpu to mercury_context.h from mercury_thread.h since its definition is in mercury_context.c. Require MR_HAVE_THREAD_PINNING for the declaration of MR_pin_primordial_thread. runtime/mercury_wrapper.c: Conform to changes in mercury_context.h. INSTALL_CVS: tools/test_mercury: Run aclocal at the right times while testing Mercury.
2026-04-15 01:13:30 +00:00 · 2011-10-13 02:42:21 +00:00
parent 1d0cd8d04f
commit a071eaba53
15 changed files with 454 additions and 129 deletions
--- a/INSTALL_CVS
+++ b/INSTALL_CVS
@@ -9,7 +9,7 @@
 #	You also need autoconf (version 2.58 or later) (and hence GNU m4)
 #	installed.
 #
-# Step 0.  autoconf
+# Step 0.  aclocal && autoconf
 #
 # Step 1.  ./configure
 #
@@ -40,6 +40,7 @@

 parallel=-j3

+aclocal &&
 autoconf &&
 ./configure &&
 touch Mmake.params &&
--- a/Mmake.common.in
+++ b/Mmake.common.in
@@ -1,5 +1,5 @@
 #-----------------------------------------------------------------------------#
-# Copyright (C) 1995-2006, 2009-2010 The University of Melbourne.
+# Copyright (C) 1995-2006, 2009-2011 The University of Melbourne.
 # This file may only be copied under the terms of the GNU General
 # Public Licence - see the file COPYING in the Mercury distribution.
 #-----------------------------------------------------------------------------#
@@ -234,6 +234,8 @@ NSL_LIBRARY=@NSL_LIBRARY@
 DL_LIBRARY=@DL_LIBRARY@
 READLINE_LIBRARIES=@READLINE_LIBRARIES@
 TERMCAP_LIBRARY=@TERMCAP_LIBRARY@
+HWLOC_CFLAGS=@HWLOC_CFLAGS@
+HWLOC_LIBS=@HWLOC_LIBS@

 # Extensions to use
 O=@OBJ_SUFFIX@
--- a/Mmakefile
+++ b/Mmakefile
@@ -370,6 +370,9 @@ cleanint:

 #-----------------------------------------------------------------------------#

+aclocal.m4: configure.in acinclude.m4
+	aclocal
+
 configure: configure.in aclocal.m4
 	autoconf

--- a/acinclude.m4
+++ b/acinclude.m4
--- a/configure.in
+++ b/configure.in
@@ -1265,7 +1265,7 @@ mercury_check_for_functions \
        grantpt unlockpt ptsname tcgetattr tcsetattr ioctl \
        access sleep opendir readdir closedir mkdir symlink readlink \
        gettimeofday setenv putenv _putenv posix_spawn sched_setaffinity \
-        sched_getcpu sched_yield mkstemp
+        sched_getaffinity sched_getcpu sched_yield mkstemp

 #-----------------------------------------------------------------------------#

@@ -5106,6 +5106,30 @@ AC_SUBST(USE_MSVCRT)

 MERCURY_CHECK_READLINE

+#-----------------------------------------------------------------------------#
+#
+# Check for libhwloc, http://www.open-mpi.org/projects/hwloc/
+#
+PKG_PROG_PKG_CONFIG
+PKG_CHECK_MODULES(libhwloc, hwloc >= 1.0,
+    [
+        AC_DEFINE(MR_HAVE_HWLOC)
+    ],
+    [
+        case "$LIBGRADES" in
+            $BEST_LLDS_BASE_GRADE.par.gc*)
+                MERCURY_MSG(["Warning: libhwloc not found, thread pinning in"])
+                MERCURY_MSG(["low-level C parallel grades may be less accurate."])
+                ;;
+            *)
+                ;;
+        esac
+    ])
+HWLOC_LIBS="$libhwloc_LIBS"
+HWLOC_CFLAGS="$libhwloc_CFLAGS"
+AC_SUBST(HWLOC_LIBS)
+AC_SUBST(HWLOC_CFLAGS)
+
 #-----------------------------------------------------------------------------#
 #
 # Check for flex and bison
--- a/runtime/Mmakefile
+++ b/runtime/Mmakefile
@@ -1,5 +1,5 @@
 #-----------------------------------------------------------------------------#
-# Copyright (C) 1998-2010 The University of Melbourne.
+# Copyright (C) 1998-2011 The University of Melbourne.
 # This file may only be copied under the terms of the GNU General
 # Public License - see the file COPYING in the Mercury distribution.
 #-----------------------------------------------------------------------------#
@@ -252,10 +252,14 @@ LDLIBS		= $(SHARED_GC_LIBS)

 THREADLIBS	= \
 		` case "$(GRADE)" in					\
-		    *.par*|*.mps*) echo "-lpthread" ;;			\
-		  esac							\
+			*.mps*) echo $(THREAD_LIBS) ;;			\
+			*.hlc.par*) echo $(THREAD_LIBS) ;;		\
+			*.par*) echo "$(THREAD_LIBS) $(HWLOC_LIBS)" ;;	\
+		esac							\
 		`

+CFLAGS += $(HWLOC_CFLAGS)
+
 $(HDR_CHECK_OBJS):	mercury_conf.h

 #-----------------------------------------------------------------------------#
--- a/runtime/mercury_conf.h.in
+++ b/runtime/mercury_conf.h.in
@@ -274,6 +274,7 @@
 **	MR_HAVE_POSIX_SPAWN	we have the posix_spawn() function.
 **	MR_HAVE_FESETROUND	we have the fesetround() function.
 **	MR_HAVE_SCHED_SETAFFINITY we have the sched_setaffinity() function.
+**	MR_HAVE_SCHED_GETAFFINITY we have the sched_getaffinity() function.
 **	MR_HAVE_SCHED_GETCPU	we have the sched_getcpu() function (glibc specific).
 **	MR_HAVE_SCHED_YIELD	we have the sched_yield() function.
 **	MR_HAVE_PTHREAD_MUTEXATTR_SETPSHARED we have the
@@ -342,6 +343,7 @@
 #undef	MR_HAVE_POSIX_SPAWN
 #undef	MR_HAVE_FESETROUND
 #undef	MR_HAVE_SCHED_SETAFFINITY
+#undef	MR_HAVE_SCHED_GETAFFINITY
 #undef	MR_HAVE_SCHED_GETCPU
 #undef	MR_HAVE_SCHED_YIELD
 #undef	MR_HAVE_PTHREAD_MUTEXATTR_SETPSHARED
@@ -419,6 +421,11 @@
 #undef MR_THREAD_LOCAL_STORAGE
 #undef MR_PTHREADS_WIN32

+/*
+** MR_HAVE_HWLOC is defined if the hwloc library is available.
+*/
+#undef MR_HAVE_HWLOC
+
 /*
 ** The bytecode files represent floats in 64-bit IEEE format.
 **
--- a/runtime/mercury_conf_param.h
+++ b/runtime/mercury_conf_param.h
@@ -1067,4 +1067,15 @@

 /*---------------------------------------------------------------------------*/

+/*
+** MR_HAVE_THREAD_PINNING is defined if we can pin threads, either with
+** sched_setaffinity or hwloc.
+*/
+#if (defined(MR_HAVE_SCHED_SETAFFINITY) && \
+	defined(MR_HAVE_SCHED_GETAFFINITY)) || defined(MR_HAVE_HWLOC)
+    #define MR_HAVE_THREAD_PINNING
+#endif
+
+/*---------------------------------------------------------------------------*/
+
 #endif /* MERCURY_CONF_PARAM_H */
--- a/runtime/mercury_context.c
+++ b/runtime/mercury_context.c
@@ -46,6 +46,10 @@ ENDINIT
  #include <sys/timeb.h>    /* for _ftime() */
 #endif

+#if defined(MR_LL_PARALLEL_CONJ) && defined(MR_HAVE_HWLOC)
+  #include <hwloc.h>
+#endif
+
 #include "mercury_memory_handlers.h"
 #include "mercury_context.h"
 #include "mercury_engine.h"             /* for `MR_memdebug' */
@@ -161,12 +165,19 @@ static MR_Integer       MR_profile_parallel_regular_context_kept = 0;
 /*
 ** Local variables for thread pinning.
 */
-#ifdef MR_LL_PARALLEL_CONJ
-static MercuryLock      MR_next_cpu_lock;
+#if defined(MR_LL_PARALLEL_CONJ) && defined(MR_HAVE_THREAD_PINNING)
 MR_bool                 MR_thread_pinning = MR_FALSE;
-static MR_Unsigned      MR_next_cpu = 0;
-/* This is initialised the first the MR_pin_primordial_thread() is called */
+
+static MercuryLock      MR_thread_pinning_lock;
+static unsigned         MR_num_threads_left_to_pin;
+static unsigned         MR_num_processors;
 MR_Unsigned             MR_primordial_thread_cpu;
+#ifdef MR_HAVE_HWLOC
+static hwloc_topology_t MR_hw_topology;
+static hwloc_cpuset_t   MR_hw_available_pus = NULL;
+#else /* MR_HAVE_SCHED_SETAFFINITY */
+static cpu_set_t        *MR_available_cpus;
+#endif
 #endif

 #if defined(MR_LL_PARALLEL_CONJ) && \
@@ -217,7 +228,7 @@ MR_SparkDeque           **MR_spark_deques = NULL;

 #ifdef MR_LL_PARALLEL_CONJ
 /*
-** Try to wake up a sleeping message and tell it to do action. The engine
+** Try to wake up a sleeping engine and tell it to do action. The engine
 ** is only woken if the engine is in one of the states in the bitfield states.
 ** If the engine is woken, this function returns MR_TRUE, otherwise it
 ** returns MR_FALSE.
@@ -233,9 +244,35 @@ try_wake_engine(MR_EngineId engine_id, int action,
 static void
 MR_write_out_profiling_parallel_execution(void);

-#if defined(MR_LL_PARALLEL_CONJ) && defined(MR_HAVE_SCHED_SETAFFINITY)
+#if defined(MR_LL_PARALLEL_CONJ)
 static void
+MR_setup_thread_pinning(void);
+
+static MR_bool
 MR_do_pin_thread(int cpu);
+
+/*
+** Determine which CPU this thread is currently running on.
+*/
+static int
+MR_current_cpu(void);
+
+/*
+** Reset or initialize the cpuset that tracks which CPUs are available for
+** binding.
+*/
+static void
+MR_reset_available_cpus(void);
+
+/*
+** Mark the given CPU as unavailable for thread pinning.  This may mark other
+** CPUs as unavailable, if, for instance they share resources with this
+** processor and we can place other tasks elsewhere to avoid this sharing.
+** These resources are usually only considered for hardware threads that share
+** cores.
+*/
+static void
+MR_make_cpu_unavailable(int cpu);
 #endif

 /*---------------------------------------------------------------------------*/
@@ -253,9 +290,6 @@ MR_init_context_stuff(void)
    pthread_mutex_init(&free_context_list_lock, MR_MUTEX_ATTR);
    pthread_mutex_init(&MR_pending_contexts_lock, MR_MUTEX_ATTR);
  #ifdef MR_LL_PARALLEL_CONJ
-    #ifdef MR_HAVE_SCHED_SETAFFINITY
-    pthread_mutex_init(&MR_next_cpu_lock, MR_MUTEX_ATTR);
-    #endif
    #ifdef MR_DEBUG_RUNTIME_GRANULARITY_CONTROL
    pthread_mutex_init(&MR_par_cond_stats_lock, MR_MUTEX_ATTR);
    #endif
@@ -268,40 +302,10 @@ MR_init_context_stuff(void)
    MR_KEY_CREATE(&MR_backjump_next_choice_id_key, (void *)0);
  #endif

-    /*
-    ** If MR_num_threads is unset, configure it to match number of processors
-    ** on the system. If we do this, then we prepare to set processor
-    ** affinities later on.
-    */
-    if (MR_num_threads == 0) {
-      #if defined(MR_HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN)
-        long result;
-
-        result = sysconf(_SC_NPROCESSORS_ONLN);
-        if (result < 1) {
-            /* We couldn't determine the number of processors. */
-            MR_num_threads = 1;
-        } else {
-            MR_num_threads = result;
-            /*
-            ** On systems that don't support sched_setaffinity, we don't try
-            ** to automatically enable thread pinning. This prevents a runtime
-            ** warning that could unnecessarily confuse the user.
-            **/
-          #if defined(MR_LL_PARALLEL_CONJ) && \
-              defined(MR_HAVE_SCHED_SETAFFINITY)
-            /*
-            ** Comment this back in to enable thread pinning by default
-            ** if we autodetected the correct number of CPUs.
-            */
-            /* MR_thread_pinning = MR_TRUE; */
-          #endif
-        }
-      #else /* ! defined(MR_HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN) */
-        MR_num_threads = 1;
-      #endif /* ! defined(MR_HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN) */
-    }
  #ifdef MR_LL_PARALLEL_CONJ
+    #if defined(MR_HAVE_THREAD_PINNING)
+    MR_setup_thread_pinning();
+    #endif
    MR_granularity_wsdeque_length =
        MR_granularity_wsdeque_length_factor * MR_num_threads;

@@ -329,102 +333,353 @@ MR_init_context_stuff(void)
 ** Pin the primordial thread first to the CPU it is currently using
 ** (if support is available for thread pinning).
 */
-#if defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ)
-unsigned
-MR_pin_primordial_thread(void)
+#if defined(MR_HAVE_THREAD_PINNING) && defined(MR_LL_PARALLEL_CONJ)
+static unsigned
+MR_pin_thread_no_locking(void)
 {
    unsigned    cpu;
-    int         temp;
+    unsigned    i = 0;

-    /*
-    ** We don't need locking to pin the primordial thread as it is called
-    ** before any other threads exist.
-    */
-    /*
-    ** We go through the motions of thread pinning even when thread pinning is
-    ** not supported as the allocation of CPUs to threads may be used later.
-    */
-  #ifdef MR_HAVE_SCHED_GETCPU
-    temp = sched_getcpu();
-    if (temp == -1) {
-        MR_primordial_thread_cpu = 0;
-      #ifdef MR_HAVE_SCHED_SET_AFFINITY
-        if (MR_thread_pinning) {
-            perror("Warning: unable to determine the current CPU for "
-                "the primordial thread: ");
+    cpu = MR_current_cpu();
+#ifdef MR_DEBUG_THREAD_PINNING
+    fprintf(stderr, "Currently running on cpu %d\n", cpu);
+#endif
+
+    for (i = 0; i < MR_num_processors && MR_thread_pinning; i++) {
+        if (MR_do_pin_thread((cpu + i) % MR_num_processors)) {
+#ifdef MR_DEBUG_THREAD_PINNING
+            fprintf(stderr, "Pinned to cpu %d\n", (cpu + i) % MR_num_processors);
+            fprintf(stderr, "Now running on cpu %d\n", MR_current_cpu());
+#endif
+            MR_num_threads_left_to_pin--;
+            MR_make_cpu_unavailable((cpu + i) % MR_num_processors);
+            break;
+        }
+        if (!MR_thread_pinning) {
+            /*
+            ** If MR_thread_pinning becomes false then an error prevented us
+            ** from pinning the thread.
+            ** When we fail to pin a thread but MR_thread_pinning remains true
+            ** it means that that CPU has already had a thread pinned to it.
+            */
+            fprintf(stderr, "Couldn't pin Mercury engine to processor");
+            break;
        }
-      #endif
-    } else {
-        MR_primordial_thread_cpu = temp;
    }
-  #else
-    MR_primordial_thread_cpu = 0;
-  #endif
-  #ifdef MR_HAVE_SCHED_SET_AFFINITY
-    if (MR_thread_pinning) {
-        MR_do_pin_thread(MR_primordial_thread_cpu);
-    }
-  #endif
-    return MR_primordial_thread_cpu;
+    return (cpu + 1) % MR_num_processors;
 }
-#endif /* defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ) */

-#if defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ)
 unsigned
 MR_pin_thread(void)
 {
    unsigned cpu;

-    /*
-    ** We go through the motions of thread pinning even when thread pinning
-    ** is not supported, as the allocation of CPUs to threads may be
-    ** used later.
-    */
-    MR_LOCK(&MR_next_cpu_lock, "MR_pin_thread");
-    if (MR_next_cpu == MR_primordial_thread_cpu) {
-        /*
-        ** Skip the CPU that the primordial thread was pinned on.
-        */
-        MR_next_cpu++;
-    }
-    cpu = MR_next_cpu++;
-    MR_UNLOCK(&MR_next_cpu_lock, "MR_pin_thread");
-
-#ifdef MR_HAVE_SCHED_SETAFFINITY
-    if (MR_thread_pinning) {
-        MR_do_pin_thread(cpu);
-    }
-#endif
+    MR_LOCK(&MR_thread_pinning_lock, "MR_pin_thread");
+    cpu = MR_pin_thread_no_locking();
+    MR_UNLOCK(&MR_thread_pinning_lock, "MR_pin_thread");

    return cpu;
 }
-#endif /* defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ) */

-#if defined(MR_LL_PARALLEL_CONJ) && defined(MR_HAVE_SCHED_SETAFFINITY)
-static void
+void
+MR_pin_primordial_thread(void)
+{
+    /*
+    ** We don't need locking to pin the primordial thread as it is called
+    ** before any other threads exist.
+    */
+    MR_primordial_thread_cpu = MR_pin_thread_no_locking();
+}
+
+static void MR_setup_thread_pinning(void)
+{
+    unsigned num_processors;
+
+#ifdef MR_HAVE_HWLOC
+    if (-1 == hwloc_topology_init(&MR_hw_topology)) {
+        MR_fatal_error("Error allocating libhwloc topology object");
+    }
+    if (-1 == hwloc_topology_load(MR_hw_topology)) {
+        MR_fatal_error("Error detecting hardware topology (hwloc)");
+    }
+#endif
+
+    /*
+    ** Setup num processors
+    */
+    MR_reset_available_cpus();
+#ifdef MR_HAVE_HWLOC
+    num_processors = hwloc_cpuset_weight(MR_hw_available_pus);
+#elif defined(MR_HAVE_SCHED_GETAFFINITY)
+    /*
+    ** This looks redundant but it is not.  num_processors starts as a guess
+    ** (from sysconf where available), but the number of CPUs in the CPU set is
+    ** the actual number of CPUs that this process is restricted to.
+    */
+  #if defined(MR_HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN)
+    num_processors = sysconf(_SC_NPROCESSORS_ONLN);
+  #else
+    /*
+    ** The user may have supplied MR_num_processors
+    */
+    num_processors = (MR_num_processors > 1 ? MR_num_processors : 1)
+  #endif
+    num_processors = CPU_COUNT_S(num_processors, MR_available_cpus);
+#endif
+    MR_num_processors = num_processors;
+
+    /*
+    ** If MR_num_threads is unset, configure it to match number of processors
+    ** on the system. If we do this, then we prepare to set processor
+    ** affinities later on.
+    */
+    if (MR_num_threads == 0) {
+        MR_num_threads = num_processors;
+    }
+    MR_num_threads_left_to_pin = MR_num_threads;
+
+#ifdef MR_DEBUG_THREAD_PINNING
+    fprintf(stderr, "Detected %d available processors, will use %d threads\n",
+        MR_num_processors, MR_num_threads);
+#endif
+
+    pthread_mutex_init(&MR_thread_pinning_lock, MR_MUTEX_ATTR);
+
+  /*
+  ** Comment this back in to enable thread pinning by default
+  ** if we autodetected the number of CPUs without error.
+  */
+#if 0
+    if (MR_num_threads > 1) {
+        MR_thread_pinning = MR_TRUE;
+    }
+#endif
+}
+
+/*
+** Determine which CPU this thread is currently running on.
+*/
+static int MR_current_cpu(void)
+{
+#if defined(MR_HAVE_SCHED_GETCPU)
+    int         os_cpu;
+#if defined(MR_HAVE_HWLOC)
+    hwloc_obj_t pu;
+#endif
+
+    os_cpu = sched_getcpu();
+    if (-1 == os_cpu) {
+        os_cpu = 0;
+
+        if (MR_thread_pinning) {
+            perror("Warning: unable to determine the current CPU for "
+                "this thread: ");
+        }
+    }
+
+#if defined(MR_HAVE_HWLOC)
+    pu = hwloc_get_pu_obj_by_os_index(MR_hw_topology, os_cpu);
+    return pu->logical_index;
+#else
+    return os_cpu;
+#endif
+
+#else /* ! MR_HAVE_SCHED_GETCPU */
+    /* We have no idea! */
+    return 0;
+#endif
+}
+
+static MR_bool
 MR_do_pin_thread(int cpu)
 {
-    cpu_set_t   cpus;
+    /*
+    ** Make sure that we're allowed to bind to this CPU.
+    */
+#if defined(MR_HAVE_HWLOC)
+    hwloc_obj_t pu;

-    if (cpu < CPU_SETSIZE) {
-        CPU_ZERO(&cpus);
-        CPU_SET(cpu, &cpus);
-        if (sched_setaffinity(0, sizeof(cpu_set_t), &cpus) == -1) {
-            perror("Warning: Couldn't set CPU affinity: ");
-            /*
-            ** If this failed once, it will probably fail again,
-            ** so we disable it.
-            */
-            MR_thread_pinning = MR_FALSE;
-        }
-    } else {
-        perror("Warning: Couldn't set CPU affinity due to a static "
-            "system limit: ");
-        MR_thread_pinning = MR_FALSE;
+    if (hwloc_cpuset_iszero(MR_hw_available_pus)) {
+        /*
+        ** Each available CPU already has a thread pinned to it.  Reset the
+        ** available_pus set so that we can oversubscribe CPUs but still
+        ** attempt to balance load.
+        */
+        MR_reset_available_cpus();
    }
+
+    pu = hwloc_get_obj_by_type(MR_hw_topology, HWLOC_OBJ_PU, cpu);
+    if (!hwloc_cpuset_intersects(MR_hw_available_pus, pu->cpuset)) {
+        return MR_FALSE;
+    }
+#elif defined(MR_HAVE_SCHED_SETAFFINITY)
+    if (CPU_COUNT_S(MR_num_processors, MR_available_cpus) == 0) {
+        /*
+        ** As above, reset the available cpus.
+        */
+        MR_reset_available_cpus();
+    }
+    if (!CPU_ISSET_S(cpu, MR_num_processors, MR_available_cpus)) {
+        return MR_FALSE;
+    }
+#endif
+
+#if defined(MR_HAVE_HWLOC)
+    errno = hwloc_set_cpubind(MR_hw_topology, pu->cpuset,
+        HWLOC_CPUBIND_THREAD);
+    if (errno != 0) {
+        perror("Warning: Couldn't set CPU affinity: ");
+        MR_thread_pinning = MR_FALSE;
+        return MR_FALSE;
+    }
+#elif defined(MR_HAVE_SCHED_SETAFFINITY)
+    cpu_set_t   *cpus;
+
+    cpus = CPU_ALLOC(MR_num_processors);
+
+    CPU_ZERO_S(MR_num_processors, cpus);
+    CPU_SET_S(cpu, MR_num_processors, cpus);
+    if (sched_setaffinity(0, CPU_ALLOC_SIZE(MR_num_processors), cpus) == -1) {
+        perror("Warning: Couldn't set CPU affinity: ");
+        /*
+        ** If this failed once, it will probably fail again,
+        ** so we disable it.
+        */
+        MR_thread_pinning = MR_FALSE;
+        return MR_FALSE;
+    }
+#endif
+
+    return MR_TRUE;
+}
+
+static void MR_reset_available_cpus(void)
+{
+#if defined(MR_HAVE_HWLOC)
+    hwloc_cpuset_t  inherited_binding;
+
+    /*
+    ** Gather the cpuset that our parent process bound this process to.
+    **
+    ** (For information about how to deliberately restrict a process and its
+    ** sub-processes to a set of CPUs on Linux, see cpuset(7).)
+    */
+    inherited_binding = hwloc_cpuset_alloc();
+    hwloc_get_cpubind(MR_hw_topology, inherited_binding, HWLOC_CPUBIND_PROCESS);
+
+    /*
+    ** Set the available processors to the intersection of inherited_binding
+    ** and the cpuset we're allowed to use as reported by libhwloc.  In tests
+    ** with libhwloc_1.0-1 (Debian), hwloc reported that all cpus on the system
+    ** are available; it didn't exclude cpus not in the process's cpuset(7).
+    */
+    if (MR_hw_available_pus == NULL) {
+        MR_hw_available_pus = hwloc_cpuset_alloc();
+    }
+    hwloc_cpuset_and(MR_hw_available_pus, inherited_binding,
+        hwloc_topology_get_allowed_cpuset(MR_hw_topology));
+
+    hwloc_cpuset_free(inherited_binding);
+#elif defined(MR_HAVE_SCHED_GETAFFINITY)
+    unsigned num_processors;
+
+  #if defined(MR_HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN)
+    num_processors = sysconf(_SC_NPROCESSORS_ONLN);
+  #else
+    /*
+    ** The user may have supplied MR_num_processors
+    */
+    num_processors = (MR_num_processors > 1 ? MR_num_processors : 1)
+  #endif
+
+    if (MR_available_cpus == NULL) {
+        MR_available_cpus = CPU_ALLOC(num_processors);
+    }
+
+    if (-1 == sched_getaffinity(0, CPU_ALLOC_SIZE(num_processors),
+        MR_available_cpus))
+    {
+        perror("Couldn't get CPU affinity");
+        MR_thread_pinning = MR_FALSE;
+        CPU_FREE(MR_available_cpus);
+        MR_available_cpus = NULL;
+    }
+#endif
+}
+
+#if defined(MR_HAVE_HWLOC)
+static MR_bool MR_make_pu_unavailable(const struct hwloc_obj *pu);
+#endif
+
+static void MR_make_cpu_unavailable(int cpu)
+{
+#if defined(MR_HAVE_HWLOC)
+    hwloc_obj_t pu;
+    pu = hwloc_get_obj_by_type(MR_hw_topology, HWLOC_OBJ_PU, cpu);
+    MR_make_pu_unavailable(pu);
+#elif defined(MR_HAVE_SCHED_SETAFFINITY)
+    CPU_CLR_S(cpu, MR_num_processors, MR_available_cpus);
+#endif
+}
+
+#if defined(MR_HAVE_HWLOC)
+static MR_bool MR_make_pu_unavailable(const struct hwloc_obj *pu) {
+    hwloc_obj_t core;
+    static int  siblings_to_make_unavailable;
+    int         i;
+
+#ifdef MR_DEBUG_THREAD_PINNING
+    char *      cpusetstr;
+
+    hwloc_cpuset_asprintf(&cpusetstr, MR_hw_available_pus);
+    fprintf(stderr, "Old available CPU set: %s\n", cpusetstr);
+    free(cpusetstr);
+    hwloc_cpuset_asprintf(&cpusetstr, pu->cpuset);
+    fprintf(stderr, "Making this CPU set unavailable: %s\n", cpusetstr);
+    free(cpusetstr);
+#endif
+
+    hwloc_cpuset_andnot(MR_hw_available_pus, MR_hw_available_pus, pu->cpuset);
+
+#ifdef MR_DEBUG_THREAD_PINNING
+    hwloc_cpuset_asprintf(&cpusetstr, MR_hw_available_pus);
+    fprintf(stderr, "New available CPU set: %s\n", cpusetstr);
+    free(cpusetstr);
+#endif
+
+    siblings_to_make_unavailable = hwloc_cpuset_weight(MR_hw_available_pus) -
+        MR_num_threads_left_to_pin;
+
+    if (siblings_to_make_unavailable > 0) {
+        /*
+        ** Remove sibling processing units that share a core with the one we've just removed.
+        */
+        core = pu->parent;
+        if (core->type != HWLOC_OBJ_CORE) {
+            return MR_FALSE;
+        }
+
+        for (i = 0;
+             (i < core->arity && siblings_to_make_unavailable > 0);
+             i++) {
+            if (core->children[i] == pu) {
+                continue;
+            }
+            if (hwloc_cpuset_intersects(core->children[i]->cpuset,
+                    MR_hw_available_pus)) {
+                if (!MR_make_pu_unavailable(core->children[i])) {
+                    return MR_FALSE;
+                }
+            }
+        }
+    }
+
+    return MR_TRUE;
 }
 #endif

+#endif /* MR_HAVE_THREAD_PINNING && MR_LL_PARALLEL_CONJ */
+
 void
 MR_finalize_context_stuff(void)
 {
--- a/runtime/mercury_context.h
+++ b/runtime/mercury_context.h
@@ -468,12 +468,19 @@ extern  void        MR_init_context_stuff(void);
 ** be pinned to if pinning was both enabled and supported.  That is a valid
 ** value is always returned even if the thread is not actually pinned.
 */
-#if defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ)
-extern unsigned
+#if defined(MR_LL_PARALLEL_CONJ)
+#if defined(MR_HAVE_THREAD_PINNING)
+extern void
 MR_pin_primordial_thread(void);
 extern unsigned
 MR_pin_thread(void);

+/*
+** The CPU that the primordial thread is running on.
+*/
+extern MR_Unsigned        MR_primordial_thread_cpu;
+#endif
+
 /*
 ** Shutdown all the engines.
 */
--- a/runtime/mercury_thread.c
+++ b/runtime/mercury_thread.c
@@ -115,7 +115,7 @@ MR_init_thread(MR_when_to_use when_to_use)
    MercuryEngine   *eng;

 #ifdef MR_THREAD_SAFE
-  #ifdef MR_LL_PARALLEL_CONJ
+  #if defined(MR_LL_PARALLEL_CONJ) && defined(MR_HAVE_THREAD_PINNING)
    unsigned        cpu;
  #endif

@@ -129,13 +129,17 @@ MR_init_thread(MR_when_to_use when_to_use)
  #ifdef MR_LL_PARALLEL_CONJ
    switch (when_to_use) {
        case MR_use_later:
+#ifdef MR_HAVE_THREAD_PINNING
            cpu = MR_pin_thread();
+#endif
            break;
        case MR_use_now:
            /*
            ** Don't pin the primordial thread here, it's already been done.
            */
+#ifdef MR_HAVE_THREAD_PINNING
            cpu = MR_primordial_thread_cpu;
+#endif
            break;
        /*
        ** TODO: We may use the cpu value here to determine which CPUs which
--- a/runtime/mercury_thread.h
+++ b/runtime/mercury_thread.h
@@ -211,11 +211,6 @@ MR_null_thread(void);
  */
  extern MercuryThreadKey   MR_exception_handler_key;

-  /*
-  ** The CPU that the primordial thread is running on.
-  */
-  extern MR_Unsigned        MR_primordial_thread_cpu;
-
 #else /* not MR_THREAD_SAFE */

  #define MR_LOCK(nothing, from)        do { } while (0)
--- a/runtime/mercury_wrapper.c
+++ b/runtime/mercury_wrapper.c
@@ -667,7 +667,9 @@ mercury_runtime_init(int argc, char **argv)
 #else

 #if defined(MR_LL_PARALLEL_CONJ)
+#if defined(MR_HAVE_THREAD_PINNING)
    MR_pin_primordial_thread();
+#endif
  #if defined(MR_THREADSCOPE)
    /*
    ** We must setup threadscope before we setup the first engine.
--- a/scripts/ml.in
+++ b/scripts/ml.in
@@ -50,6 +50,7 @@ LDFLAGS_FOR_THREADS="@LDFLAGS_FOR_THREADS@"
 LDFLAGS_FOR_TRACE="@LDFLAGS_FOR_TRACE@"
 LD_LIBFLAGS_FOR_THREADS="@LD_LIBFLAGS_FOR_THREADS@"
 THREAD_LIBS="@THREAD_LIBS@"
+HWLOC_LIBS="@HWLOC_LIBS@"
 TRACE_BASE_LIBS_SYSTEM="@TRACE_BASE_LIBS_SYSTEM@"

 TMPDIR=${TMPDIR=/tmp}
@@ -370,6 +371,13 @@ case $use_thread_libs.$make_shared_lib in
 		THREAD_LIBS=""
 		;;
 esac
+case "$GRADE" in
+	hlc.*.par*)
+		;;
+	*.par*)
+		THREAD_LIBS="$THREAD_LIBS $HWLOC_LIBS"
+		;;
+esac

 # Set the correct flags if we're to use the MS Visual C runtime.
 use_msvcrt=@USE_MSVCRT@
--- a/tools/test_mercury
+++ b/tools/test_mercury
@@ -735,6 +735,7 @@ esac
 # XXX building the depend target in parallel sometimes fails so we don't
 # do that at the moment - it's probably not worth doing anyway.
 #
+aclocal || { false; exit 1; }
 autoconf || { false; exit 1; }
 rm -f config.cache
 ./configure --prefix=$INSTALL_DIR $CONFIG_OPTS || { false; exit 1; }
@@ -862,6 +863,7 @@ case $HOST in $ROTD_HOST)
    : > Mmake.params &&
    rm -f so_locations &&
    rm -f .enable_lib_grades &&
+    aclocal &&
    autoconf &&
    mercury_cv_low_tag_bits=2 \
    mercury_cv_bits_per_word=32 \