summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHans Boehm <hboehm@google.com>2021-05-21 09:23:38 -0700
committerNicolas Geoffray <ngeoffray@google.com>2021-09-04 13:18:05 +0000
commit1363c1bf72f25f088b1f83250e91a47fc8cc7173 (patch)
tree2660c9c7b13b6f9829c0aa41a54815a20709c635
parentd971d7e57687e51071583cbcb2acaabf8cdb6074 (diff)
Improve suspension timeout diagnostic and fix race
Fix a data race on state_and_flags. Since the access was volatile and there are system calls in the loop, this is extremely unlikey to have casused the bug here, but ... So, assuming this is still broken, produce more informative output once we time out. Remove unused argument from SuspendThreadByPeer(). It made the logic more complicated and made it harder to reason about correctness. Remove dead code after LOG(FATAL, ...) Bug: 181778559 Test: TreeHugger, temporarily paste log message into hotter path. Merged-In: I6f3455925b3a3f4726a870150aeb54ea60a38d67 (cherry picked from commit 9d27fbc8ced914f4726187920a7794b07eca3e71) Change-Id: Ia3f04153fb0a4f1b899fb0f68a6121728f89cb91 (cherry picked from commit 116203735734738cbfdffc2163b08b1707089f9c)
-rw-r--r--openjdkjvm/OpenjdkJvm.cc1
-rw-r--r--openjdkjvmti/ti_thread.cc1
-rw-r--r--runtime/native/dalvik_system_VMStack.cc1
-rw-r--r--runtime/native/java_lang_Thread.cc1
-rw-r--r--runtime/thread.cc6
-rw-r--r--runtime/thread.h5
-rw-r--r--runtime/thread_list.cc30
-rw-r--r--runtime/thread_list.h5
-rw-r--r--test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc2
9 files changed, 29 insertions, 23 deletions
diff --git a/openjdkjvm/OpenjdkJvm.cc b/openjdkjvm/OpenjdkJvm.cc
index d64086d43c..9b514af452 100644
--- a/openjdkjvm/OpenjdkJvm.cc
+++ b/openjdkjvm/OpenjdkJvm.cc
@@ -423,7 +423,6 @@ JNIEXPORT void JVM_SetNativeThreadName(JNIEnv* env, jobject jthread, jstring jav
art::Thread* thread;
{
thread = thread_list->SuspendThreadByPeer(jthread,
- true,
art::SuspendReason::kInternal,
&timed_out);
}
diff --git a/openjdkjvmti/ti_thread.cc b/openjdkjvmti/ti_thread.cc
index bb8fa3b694..a9a6ee8895 100644
--- a/openjdkjvmti/ti_thread.cc
+++ b/openjdkjvmti/ti_thread.cc
@@ -898,7 +898,6 @@ jvmtiError ThreadUtil::SuspendOther(art::Thread* self,
bool timeout = true;
art::Thread* ret_target = art::Runtime::Current()->GetThreadList()->SuspendThreadByPeer(
target_jthread,
- /* request_suspension= */ true,
art::SuspendReason::kForUserCode,
&timeout);
if (ret_target == nullptr && !timeout) {
diff --git a/runtime/native/dalvik_system_VMStack.cc b/runtime/native/dalvik_system_VMStack.cc
index 9d2dfac069..e88516e248 100644
--- a/runtime/native/dalvik_system_VMStack.cc
+++ b/runtime/native/dalvik_system_VMStack.cc
@@ -59,7 +59,6 @@ static ResultT GetThreadStack(const ScopedFastNativeObjectAccess& soa,
ThreadList* thread_list = Runtime::Current()->GetThreadList();
bool timed_out;
Thread* thread = thread_list->SuspendThreadByPeer(peer,
- /* request_suspension= */ true,
SuspendReason::kInternal,
&timed_out);
if (thread != nullptr) {
diff --git a/runtime/native/java_lang_Thread.cc b/runtime/native/java_lang_Thread.cc
index 37b3fe642e..c3b4fe09de 100644
--- a/runtime/native/java_lang_Thread.cc
+++ b/runtime/native/java_lang_Thread.cc
@@ -148,7 +148,6 @@ static void Thread_setNativeName(JNIEnv* env, jobject peer, jstring java_name) {
bool timed_out;
// Take suspend thread lock to avoid races with threads trying to suspend this one.
Thread* thread = thread_list->SuspendThreadByPeer(peer,
- /* request_suspension= */ true,
SuspendReason::kInternal,
&timed_out);
if (thread != nullptr) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 46aa2b59ad..16a5f93be4 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -4468,6 +4468,12 @@ bool Thread::IsSystemDaemon() const {
WellKnownClasses::java_lang_Thread_systemDaemon)->GetBoolean(GetPeer());
}
+std::string Thread::StateAndFlagsAsHexString() const {
+ std::stringstream result_stream;
+ result_stream << std::hex << tls32_.state_and_flags.as_atomic_int.load();
+ return result_stream.str();
+}
+
ScopedExceptionStorage::ScopedExceptionStorage(art::Thread* self)
: self_(self), hs_(self_), excp_(hs_.NewHandle<art::mirror::Throwable>(self_->GetException())) {
self_->ClearException();
diff --git a/runtime/thread.h b/runtime/thread.h
index 7a408021c1..676bfd81de 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -254,7 +254,7 @@ class Thread {
bool IsSuspended() const {
union StateAndFlags state_and_flags;
- state_and_flags.as_int = tls32_.state_and_flags.as_int;
+ state_and_flags.as_int = tls32_.state_and_flags.as_atomic_int.load(std::memory_order_relaxed);
return state_and_flags.as_struct.state != kRunnable &&
(state_and_flags.as_struct.flags & kSuspendRequest) != 0;
}
@@ -1517,6 +1517,9 @@ class Thread {
};
static_assert(sizeof(StateAndFlags) == sizeof(int32_t), "Weird state_and_flags size");
+ // Format state and flags as a hex string. For diagnostic output.
+ std::string StateAndFlagsAsHexString() const;
+
static void ThreadExitCallback(void* arg);
// Maximum number of suspend barriers.
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index f8e99e8f62..84b7384c46 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -874,10 +874,11 @@ static void ThreadSuspendByPeerWarning(Thread* self,
}
Thread* ThreadList::SuspendThreadByPeer(jobject peer,
- bool request_suspension,
SuspendReason reason,
bool* timed_out) {
+ bool request_suspension = true;
const uint64_t start_time = NanoTime();
+ int self_suspend_count = 0;
useconds_t sleep_us = kThreadSuspendInitialSleepUs;
*timed_out = false;
Thread* const self = Thread::Current();
@@ -926,6 +927,7 @@ Thread* ThreadList::SuspendThreadByPeer(jobject peer,
// We hold the suspend count lock but another thread is trying to suspend us. Its not
// safe to try to suspend another thread in case we get a cycle. Start the loop again
// which will allow this thread to be suspended.
+ ++self_suspend_count;
continue;
}
CHECK(suspended_thread == nullptr);
@@ -957,20 +959,22 @@ Thread* ThreadList::SuspendThreadByPeer(jobject peer,
}
const uint64_t total_delay = NanoTime() - start_time;
if (total_delay >= thread_suspend_timeout_ns_) {
- ThreadSuspendByPeerWarning(self,
- ::android::base::FATAL,
- "Thread suspension timed out",
- peer);
- if (suspended_thread != nullptr) {
+ if (suspended_thread == nullptr) {
+ ThreadSuspendByPeerWarning(self,
+ ::android::base::FATAL,
+ "Failed to issue suspend request",
+ peer);
+ } else {
CHECK_EQ(suspended_thread, thread);
- bool updated = suspended_thread->ModifySuspendCount(soa.Self(),
- -1,
- nullptr,
- reason);
- DCHECK(updated);
+ LOG(WARNING) << "Suspended thread state_and_flags: "
+ << suspended_thread->StateAndFlagsAsHexString()
+ << ", self_suspend_count = " << self_suspend_count;
+ ThreadSuspendByPeerWarning(self,
+ ::android::base::FATAL,
+ "Thread suspension timed out",
+ peer);
}
- *timed_out = true;
- return nullptr;
+ UNREACHABLE();
} else if (sleep_us == 0 &&
total_delay > static_cast<uint64_t>(kThreadSuspendMaxYieldUs) * 1000) {
// We have spun for kThreadSuspendMaxYieldUs time, switch to sleeps to prevent
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index 87a4c8dc61..f5b58a0c54 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -81,11 +81,8 @@ class ThreadList {
// Suspend a thread using a peer, typically used by the debugger. Returns the thread on success,
// else null. The peer is used to identify the thread to avoid races with the thread terminating.
- // If the thread should be suspended then value of request_suspension should be true otherwise
- // the routine will wait for a previous suspend request. If the suspension times out then *timeout
- // is set to true.
+ // If the suspension times out then *timeout is set to true.
Thread* SuspendThreadByPeer(jobject peer,
- bool request_suspension,
SuspendReason reason,
bool* timed_out)
REQUIRES(!Locks::mutator_lock_,
diff --git a/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc b/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc
index a185446ca5..a10fe2e905 100644
--- a/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc
+++ b/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc
@@ -81,7 +81,7 @@ extern "C" JNIEXPORT void JNICALL Java_Main_waitAndDeopt(JNIEnv*, jobject, jobje
}
bool timed_out = false;
Thread* other = Runtime::Current()->GetThreadList()->SuspendThreadByPeer(
- target, true, SuspendReason::kInternal, &timed_out);
+ target, SuspendReason::kInternal, &timed_out);
CHECK(!timed_out);
CHECK(other != nullptr);
ScopedSuspendAll ssa(__FUNCTION__);