diff --git a/libgo/runtime/malloc.goc b/libgo/runtime/malloc.goc
index cde09aa0cb69e0bbd84db0e18ec9be1db79ab64e..7d6af6f3a0ba888683aec1c5050a348d74e9c205 100644
--- a/libgo/runtime/malloc.goc
+++ b/libgo/runtime/malloc.goc
@@ -270,6 +270,9 @@ runtime_allocmcache(void)
 void
 runtime_mallocinit(void)
 {
+	runtime_initfintab();
+	runtime_Mprof_Init();
+
 	runtime_SysMemInit();
 	runtime_InitSizes();
 	runtime_MHeap_Init(&runtime_mheap, runtime_SysAlloc);
diff --git a/libgo/runtime/malloc.h b/libgo/runtime/malloc.h
index 600b3b176ff0f3c7ca2c47472506a31c2cbf7662..585996e6dca0d6a7c89d7ea3cc8ad56772523d39 100644
--- a/libgo/runtime/malloc.h
+++ b/libgo/runtime/malloc.h
@@ -375,6 +375,7 @@ enum
 	RefFlags = 0xFFFF0000U,
 };
 
+void	runtime_Mprof_Init(void);
 void	runtime_MProf_Malloc(void*, uintptr);
 void	runtime_MProf_Free(void*, uintptr);
 void	runtime_MProf_Mark(void (*scan)(byte *, int64));
diff --git a/libgo/runtime/mfinal.c b/libgo/runtime/mfinal.c
index 3f3a610414319baced9faa0a50810cc88f3ecd0e..5d32721e696a146b5c3f3ca37b77c2d1020a3e29 100644
--- a/libgo/runtime/mfinal.c
+++ b/libgo/runtime/mfinal.c
@@ -5,7 +5,13 @@
 #include "runtime.h"
 #include "malloc.h"
 
-static Lock finlock = LOCK_INITIALIZER;
+static Lock finlock;
+
+void
+runtime_initfintab(void)	// called from runtime_mallocinit; (void) matches the prototype in runtime.h
+{
+	runtime_initlock(&finlock);	// finlock no longer has a static initializer, so set it up at run time
+}
 
 // Finalizer hash table.  Direct hash, linear scan, at most 3/4 full.
 // Table size is power of 3 so that hash can be key % max.
diff --git a/libgo/runtime/mgc0.c b/libgo/runtime/mgc0.c
index 06e557fcc719aa126c69cd3c8a038f9586eb8b98..1a1a5ace834bc2769fb69468195d395cf9e6f784 100644
--- a/libgo/runtime/mgc0.c
+++ b/libgo/runtime/mgc0.c
@@ -27,7 +27,7 @@ struct BlockList
 };
 
 static bool finstarted;
-static Lock finqlock = LOCK_INITIALIZER;
+static pthread_mutex_t finqlock = PTHREAD_MUTEX_INITIALIZER;	// raw pthread mutex: runfinq passes it to pthread_cond_wait
 static pthread_cond_t finqcond = PTHREAD_COND_INITIALIZER;
 static Finalizer *finq;
 static int32 fingwait;
@@ -284,7 +284,7 @@ sweep(void)
 			sweepspan(s);
 }
 
-static Lock gcsema = LOCK_INITIALIZER;
+static pthread_mutex_t gcsema = PTHREAD_MUTEX_INITIALIZER;
 
 // Initialized from $GOGC.  GOGC=off means no gc.
 //
@@ -327,8 +327,8 @@ runtime_gc(int32 force __attribute__ ((unused)))
 	if(gcpercent < 0)
 		return;
 
-	runtime_lock(&finqlock);
-	runtime_lock(&gcsema);
+	pthread_mutex_lock(&finqlock);
+	pthread_mutex_lock(&gcsema);
 	m->locks++;	// disable gc during the mallocs in newproc
 	t0 = runtime_nanotime();
 	runtime_stoptheworld();
@@ -345,7 +345,7 @@ runtime_gc(int32 force __attribute__ ((unused)))
 	mstats.pause_ns += t1 - t0;
 	if(mstats.debuggc)
 		runtime_printf("pause %llu\n", (unsigned long long)t1-t0);
-	runtime_unlock(&gcsema);
+	pthread_mutex_unlock(&gcsema);
 	runtime_starttheworld();
 
 	// finqlock is still held.
@@ -362,7 +362,7 @@ runtime_gc(int32 force __attribute__ ((unused)))
 		}
 	}
 	m->locks--;
-	runtime_unlock(&finqlock);
+	pthread_mutex_unlock(&finqlock);
 }
 
 static void
@@ -373,16 +373,16 @@ runfinq(void* dummy)
 	USED(dummy);
 
 	for(;;) {
-		runtime_lock(&finqlock);
+		pthread_mutex_lock(&finqlock);
 		f = finq;
 		finq = nil;
 		if(f == nil) {
 			fingwait = 1;
-			pthread_cond_wait(&finqcond, &finqlock.mutex);
-			runtime_unlock(&finqlock);
+			pthread_cond_wait(&finqcond, &finqlock);
+			pthread_mutex_unlock(&finqlock);
 			continue;
 		}
-		runtime_unlock(&finqlock);
+		pthread_mutex_unlock(&finqlock);
 		for(; f; f=next) {
 			void *params[1];
 
diff --git a/libgo/runtime/mprof.goc b/libgo/runtime/mprof.goc
index 5ee6b0f548fc7b6785a54d5e2608f83857e376ef..6bd4ef72724487be84ced77c99df0e5a2b7364ab 100644
--- a/libgo/runtime/mprof.goc
+++ b/libgo/runtime/mprof.goc
@@ -14,7 +14,7 @@ package runtime
 typedef struct __go_open_array Slice;
 
 // NOTE(rsc): Everything here could use cas if contention became an issue.
-static Lock proflock = LOCK_INITIALIZER;
+static Lock proflock;
 
 // Per-call-stack allocation information.
 // Lookup by hashing call stack into a linked-list hash table.
@@ -185,6 +185,12 @@ found:
 	return nil;
 }
 
+void
+runtime_Mprof_Init(void)	// called from runtime_mallocinit; (void) matches the prototype in malloc.h
+{
+	runtime_initlock(&proflock);	// proflock no longer has a static initializer, so set it up at run time
+}
+
 // Called by malloc to record a profiled block.
 void
 runtime_MProf_Malloc(void *p, uintptr size)
diff --git a/libgo/runtime/runtime.h b/libgo/runtime/runtime.h
index e062bd638d7814b5f2c2e3736dfd6482cce26493..3027f0c42d26fcb959dde2a10370da4e7d408139 100644
--- a/libgo/runtime/runtime.h
+++ b/libgo/runtime/runtime.h
@@ -13,6 +13,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <pthread.h>
+#include <semaphore.h>
 
 #ifdef HAVE_SYS_MMAN_H
 #include <sys/mman.h>
@@ -53,7 +54,8 @@ typedef	struct	Lock		Lock;
 
 struct	Lock
 {
-	pthread_mutex_t	mutex;
+	uint32 key;	// atomically incremented by runtime_lock and decremented by runtime_unlock; 0 means free
+	sem_t sem;	// contended lockers sleep on this; runtime_unlock posts it when others remain
 };
 
 /* A Note.  */
@@ -119,6 +121,7 @@ struct	M
 
 void*	runtime_mal(uintptr);
 void	runtime_mallocinit(void);
+void	runtime_initfintab(void);
 void	siginit(void);
 bool	__go_sigsend(int32 sig);
 int64	runtime_nanotime(void);
@@ -138,12 +141,10 @@ void	__go_cachestats(void);
  * as fast as spin locks (just a few user-level instructions),
  * but on the contention path they sleep in the kernel.
  */
-#define	LOCK_INITIALIZER	{ PTHREAD_MUTEX_INITIALIZER }
 void	runtime_initlock(Lock*);
 void	runtime_lock(Lock*);
 void	runtime_unlock(Lock*);
 void	runtime_destroylock(Lock*);
-bool	runtime_trylock(Lock*);
 
 void semacquire (uint32 *) asm ("libgo_runtime.runtime.Semacquire");
 void semrelease (uint32 *) asm ("libgo_runtime.runtime.Semrelease");
@@ -178,7 +179,7 @@ void	runtime_addfinalizer(void*, void(*fn)(void*), const struct __go_func_type *
 void	runtime_walkfintab(void (*fn)(void*), void (*scan)(byte *, int64));
 #define runtime_mmap mmap
 #define runtime_munmap(p, s) munmap((p), (s))
-#define cas(pval, old, new) __sync_bool_compare_and_swap (pval, old, new)
+#define runtime_cas(pval, old, new) __sync_bool_compare_and_swap (pval, old, new)
 
 struct __go_func_type;
 void reflect_call(const struct __go_func_type *, const void *, _Bool, void **,
diff --git a/libgo/runtime/sigqueue.goc b/libgo/runtime/sigqueue.goc
index 0f758ac5b9660b99b9536cf0c1024356bc29f817..b5f2954bc8e045317338938e422315c2896ba60b 100644
--- a/libgo/runtime/sigqueue.goc
+++ b/libgo/runtime/sigqueue.goc
@@ -67,7 +67,7 @@ __go_sigsend(int32 s)
 		mask = sig.mask;
 		if(mask & bit)
 			break;		// signal already in queue
-		if(cas(&sig.mask, mask, mask|bit)) {
+		if(runtime_cas(&sig.mask, mask, mask|bit)) {
 			// Added to queue.
 			// Only send a wakeup for the first signal in each round.
 			if(mask == 0)
@@ -86,7 +86,7 @@ func Sigrecv() (m uint32) {
 	noteclear(&sig);
 	for(;;) {
 		m = sig.mask;
-		if(cas(&sig.mask, m, 0))
+		if(runtime_cas(&sig.mask, m, 0))
 			break;
 	}
 }
diff --git a/libgo/runtime/thread.c b/libgo/runtime/thread.c
index 565121768b6df97a78833189fff86a537b438ea3..99a0d68f03b575818fa2652912486fcd2b036302 100644
--- a/libgo/runtime/thread.c
+++ b/libgo/runtime/thread.c
@@ -7,32 +7,67 @@
 void
 runtime_initlock(Lock *l)
 {
-	if(pthread_mutex_init(&l->mutex, NULL) != 0)
-		runtime_throw("pthread_mutex_init failed");
+	l->key = 0;	// no holder and no waiters yet
+	if(sem_init(&l->sem, 0, 0) != 0)	// pshared 0, initial count 0: a sem_wait blocks until some sem_post
+		runtime_throw("sem_init failed");
+}
+
+// Atomically add delta to *val and return the updated value.
+// Uses GCC's __sync_add_and_fetch (a full-barrier atomic from the same
+// builtin family runtime_cas expands to) rather than a hand-written
+// compare-and-swap retry loop; unsigned wraparound is unchanged.
+static uint32
+runtime_xadd(uint32 volatile *val, int32 delta)
+{
+	uint32 nval;
+
+	nval = __sync_add_and_fetch(val, (uint32)delta);
+	return nval;
+}
+
+// noinline so that runtime_lock doesn't have to split the stack.
+static void runtime_lock_full(Lock *l) __attribute__ ((noinline));
+
+static void
+runtime_lock_full(Lock *l)	// slow path of runtime_lock: sleep until an unlocker posts l->sem
+{
+	if(sem_wait(&l->sem) != 0)	// NOTE(review): POSIX lets sem_wait fail with EINTR on a signal; that would throw here -- confirm, or retry
+		runtime_throw("sem_wait failed");
 }
 
 void
 runtime_lock(Lock *l)
 {
-	if(pthread_mutex_lock(&l->mutex) != 0)
-		runtime_throw("lock failed");
+	if(m->locks < 0)	// sanity check: the lock counter must never be negative
+		runtime_throw("lock count");
+	m->locks++;	// matched by the m->locks-- in runtime_unlock
+
+	if(runtime_xadd(&l->key, 1) > 1)	// someone else has it; wait
+		runtime_lock_full(l);
 }
 
-void
-runtime_unlock(Lock *l)
+static void runtime_unlock_full(Lock *l) __attribute__ ((noinline));	// kept out of line, like runtime_lock_full
+
+static void
+runtime_unlock_full(Lock *l)	// slow path of runtime_unlock: post l->sem so a thread blocked in runtime_lock_full can proceed
 {
-	if(pthread_mutex_unlock(&l->mutex) != 0)
-		runtime_throw("unlock failed");
+	if(sem_post(&l->sem) != 0)
+		runtime_throw("sem_post failed");
 }
 
 void
-runtime_destroylock(Lock *l)
+runtime_unlock(Lock *l)
 {
-	pthread_mutex_destroy(&l->mutex);
+	m->locks--;	// undo the increment made by runtime_lock
+	if(m->locks < 0)	// sanity check: the lock counter must never go negative
+		runtime_throw("lock count");
+
+	if(runtime_xadd(&l->key, -1) > 0)	// someone else is waiting
+		runtime_unlock_full(l);
 }
 
-bool
-runtime_trylock(Lock *l)
+void
+runtime_destroylock(Lock *l)	// tears down the semaphore that runtime_initlock created
 {
-	return pthread_mutex_trylock(&l->mutex) == 0;
+	sem_destroy(&l->sem);	// result is ignored; l->key is left untouched
 }