jobextra/onetbb/retry-pthread_create.patch

65 lines
2.3 KiB
Diff

From 7e9474594c775efa70b852f0c0c5089e5980722d Mon Sep 17 00:00:00 2001
From: Rui Ueyama <ruiu@cs.stanford.edu>
Date: Sat, 7 May 2022 19:55:24 +0800
Subject: [PATCH] Retry if pthread_create fails with EAGAIN
On many Unix-like systems, pthread_create can fail spuriously even if
the running machine has enough resources to spawn a new thread.
Therefore, if EAGAIN is returned from pthread_create, we actually have
to try again.
I observed this issue when running the mold linker
(https://github.com/rui314/mold) under a heavy load. mold uses OneTBB
for parallelization.
As another data point, Go has the same logic to retry on EAGAIN:
https://go-review.googlesource.com/c/go/+/33894/
nanosleep is defined in POSIX 2001, so I believe that all Unix-like
systems support it.
Signed-off-by: Rui Ueyama <ruiu@cs.stanford.edu>
---
src/tbb/rml_thread_monitor.h | 20 +++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)
diff --git a/src/tbb/rml_thread_monitor.h b/src/tbb/rml_thread_monitor.h
index 13b556380f..57e9c30b07 100644
--- a/src/tbb/rml_thread_monitor.h
+++ b/src/tbb/rml_thread_monitor.h
@@ -31,6 +31,7 @@
#include <pthread.h>
#include <cstring>
#include <cstdlib>
+#include <time.h>
#else
#error Unsupported platform
#endif
@@ -191,8 +192,25 @@ inline thread_monitor::handle_type thread_monitor::launch( void* (*thread_routin
check(pthread_attr_init( &s ), "pthread_attr_init has failed");
if( stack_size>0 )
check(pthread_attr_setstacksize( &s, stack_size ), "pthread_attr_setstack_size has failed" );
+
+ // pthread_create(2) can spuriously fail with EAGAIN. We retry
+ // max_num_tries times with progressively longer wait times.
pthread_t handle;
- check( pthread_create( &handle, &s, thread_routine, arg ), "pthread_create has failed" );
+ const int max_num_tries = 20;
+ int error = EAGAIN;
+
+ for (int i = 0; i < max_num_tries && error == EAGAIN; i++) {
+ if (i != 0) {
+ // Wait i milliseconds
+ struct timespec ts = {0, i * 1000 * 1000};
+ nanosleep(&ts, NULL);
+ }
+ error = pthread_create(&handle, &s, thread_routine, arg);
+ }
+
+ if (error)
+ handle_perror(error, "pthread_create has failed");
+
check( pthread_attr_destroy( &s ), "pthread_attr_destroy has failed" );
return handle;
}