summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWoody Lin <woodylin@google.com>2020-05-20 23:24:09 +0800
committerWoody Lin <woodylin@google.com>2020-10-13 11:15:28 +0800
commitc1125934ecee46b79f39df498ab4e75d07c73f7e (patch)
tree45fec410941afffed37157e48eb207c5928204c7
parent5d2e89e03da708a03ad1eee25f59deefa4764d25 (diff)
Watchdog: break timeout loop via system fatal crash
Request system fatal crash via SysRq when a watchdog timeout loop is detected. This escapes automatically the device hanging symptom and also preserves context of system server in memory snapshot. More details and background: go/break-sys-watchdog-loop Bug: 141948707 Test: Insert 'sleep_forever()' to block BinderThreadMonitor to reproduce watchdog timeout. Change-Id: I3ae4b33b0d7811764c61663ac3718311b55fd048
-rw-r--r--core/sysprop/Android.bp8
-rw-r--r--core/sysprop/WatchdogProperties.sysprop45
-rw-r--r--core/sysprop/api/com.android.sysprop.watchdog-current.txt20
-rw-r--r--core/sysprop/api/com.android.sysprop.watchdog-latest.txt20
-rw-r--r--services/core/Android.bp1
-rw-r--r--services/core/java/com/android/server/Watchdog.java121
6 files changed, 215 insertions, 0 deletions
diff --git a/core/sysprop/Android.bp b/core/sysprop/Android.bp
index 7f20a0ba6642..237ede2006ea 100644
--- a/core/sysprop/Android.bp
+++ b/core/sysprop/Android.bp
@@ -19,3 +19,11 @@ sysprop_library {
api_packages: ["android.sysprop"],
vendor_available: false,
}
+
+sysprop_library {
+ name: "com.android.sysprop.watchdog",
+ srcs: ["WatchdogProperties.sysprop"],
+ property_owner: "Platform",
+ api_packages: ["android.sysprop"],
+ vendor_available: false,
+}
diff --git a/core/sysprop/WatchdogProperties.sysprop b/core/sysprop/WatchdogProperties.sysprop
new file mode 100644
index 000000000000..1bcc773a9a5d
--- /dev/null
+++ b/core/sysprop/WatchdogProperties.sysprop
@@ -0,0 +1,45 @@
+# Copyright (C) 2020 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module: "android.sysprop.WatchdogProperties"
+owner: Platform
+
+# To escape a watchdog timeout loop, force a fatal system reboot when the
+# watchdog has timed out 'fatal_count' times within 'fatal_window_second'
+# seconds. Both values must be non-zero to enable this; both default to 0.
+prop {
+ api_name: "fatal_count"
+ type: Integer
+ prop_name: "framework_watchdog.fatal_count"
+ scope: Internal
+ access: Readonly
+}
+
+prop {
+ api_name: "fatal_window_second"
+ type: Integer
+ prop_name: "framework_watchdog.fatal_window.second"
+ scope: Internal
+ access: Readonly
+}
+
+# The fatal reboot can be suppressed by setting property
+# 'is_fatal_ignore' to true.
+prop {
+ api_name: "is_fatal_ignore"
+ type: Boolean
+ prop_name: "persist.debug.framework_watchdog.fatal_ignore"
+ scope: Internal
+ access: Readonly
+}
diff --git a/core/sysprop/api/com.android.sysprop.watchdog-current.txt b/core/sysprop/api/com.android.sysprop.watchdog-current.txt
new file mode 100644
index 000000000000..d901aef945c9
--- /dev/null
+++ b/core/sysprop/api/com.android.sysprop.watchdog-current.txt
@@ -0,0 +1,20 @@
+props {
+ module: "android.sysprop.WatchdogProperties"
+ prop {
+ api_name: "fatal_count"
+ type: Integer
+ scope: Internal
+ prop_name: "framework_watchdog.fatal_count"
+ }
+ prop {
+ api_name: "fatal_window_second"
+ type: Integer
+ scope: Internal
+ prop_name: "framework_watchdog.fatal_window.second"
+ }
+ prop {
+ api_name: "is_fatal_ignore"
+ scope: Internal
+ prop_name: "persist.debug.framework_watchdog.fatal_ignore"
+ }
+}
diff --git a/core/sysprop/api/com.android.sysprop.watchdog-latest.txt b/core/sysprop/api/com.android.sysprop.watchdog-latest.txt
new file mode 100644
index 000000000000..d901aef945c9
--- /dev/null
+++ b/core/sysprop/api/com.android.sysprop.watchdog-latest.txt
@@ -0,0 +1,20 @@
+props {
+ module: "android.sysprop.WatchdogProperties"
+ prop {
+ api_name: "fatal_count"
+ type: Integer
+ scope: Internal
+ prop_name: "framework_watchdog.fatal_count"
+ }
+ prop {
+ api_name: "fatal_window_second"
+ type: Integer
+ scope: Internal
+ prop_name: "framework_watchdog.fatal_window.second"
+ }
+ prop {
+ api_name: "is_fatal_ignore"
+ scope: Internal
+ prop_name: "persist.debug.framework_watchdog.fatal_ignore"
+ }
+}
diff --git a/services/core/Android.bp b/services/core/Android.bp
index 776c8f5912ff..431555bb57d5 100644
--- a/services/core/Android.bp
+++ b/services/core/Android.bp
@@ -132,6 +132,7 @@ java_library_static {
"netd_aidl_interfaces-platform-java",
"overlayable_policy_aidl-java",
"SurfaceFlingerProperties",
+ "com.android.sysprop.watchdog",
],
}
diff --git a/services/core/java/com/android/server/Watchdog.java b/services/core/java/com/android/server/Watchdog.java
index 17c0970c5ca7..418deb801085 100644
--- a/services/core/java/com/android/server/Watchdog.java
+++ b/services/core/java/com/android/server/Watchdog.java
@@ -23,7 +23,9 @@ import android.content.Intent;
import android.content.IntentFilter;
import android.hidl.manager.V1_0.IServiceManager;
import android.os.Binder;
+import android.os.Build;
import android.os.Debug;
+import android.os.FileUtils;
import android.os.Handler;
import android.os.IPowerManager;
import android.os.Looper;
@@ -31,10 +33,12 @@ import android.os.Process;
import android.os.RemoteException;
import android.os.ServiceManager;
import android.os.SystemClock;
+import android.os.SystemProperties;
import android.util.EventLog;
import android.util.Log;
import android.util.Slog;
import android.util.SparseArray;
+import android.sysprop.WatchdogProperties;
import com.android.internal.os.ProcessCpuTracker;
import com.android.internal.os.ZygoteConnectionConstants;
@@ -42,12 +46,16 @@ import com.android.internal.util.FrameworkStatsLog;
import com.android.server.am.ActivityManagerService;
import com.android.server.wm.SurfaceAnimationThread;
+import java.io.BufferedReader;
import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.concurrent.TimeUnit;
import java.util.HashSet;
import java.util.List;
@@ -75,6 +83,12 @@ public class Watchdog extends Thread {
private static final int WAITED_HALF = 2;
private static final int OVERDUE = 3;
+ // Track watchdog timeout history and break the crash loop if there is.
+ private static final String TIMEOUT_HISTORY_FILE = "/data/system/watchdog-timeout-history.txt";
+ private static final String PROP_FATAL_LOOP_COUNT = "framework_watchdog.fatal_count";
+ private static final String PROP_FATAL_LOOP_WINDOWS_SECS =
+ "framework_watchdog.fatal_window.second";
+
// Which native processes to dump into dropbox's stack traces
public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
"/system/bin/audioserver",
@@ -688,6 +702,10 @@ public class Watchdog extends Thread {
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
Slog.w(TAG, "*** GOODBYE!");
+ if (!Build.IS_USER && isCrashLoopFound()
+ && !WatchdogProperties.is_fatal_ignore().orElse(false)) {
+ breakCrashLoop();
+ }
Process.killProcess(Process.myPid());
System.exit(10);
}
@@ -705,4 +723,107 @@ public class Watchdog extends Thread {
Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
}
}
+
+ private void resetTimeoutHistory() {
+ writeTimeoutHistory(new ArrayList<String>());
+ }
+
+ private void writeTimeoutHistory(Iterable<String> crashHistory) {
+ String data = String.join(",", crashHistory);
+
+ try (FileWriter writer = new FileWriter(TIMEOUT_HISTORY_FILE)) {
+ writer.write(SystemProperties.get("ro.boottime.zygote"));
+ writer.write(":");
+ writer.write(data);
+ } catch (IOException e) {
+ Slog.e(TAG, "Failed to write file " + TIMEOUT_HISTORY_FILE, e);
+ }
+ }
+
+ private String[] readTimeoutHistory() {
+ final String[] emptyStringArray = {};
+
+ try (BufferedReader reader = new BufferedReader(new FileReader(TIMEOUT_HISTORY_FILE))) {
+ String line = reader.readLine();
+ if (line == null) {
+ return emptyStringArray;
+ }
+
+ String[] data = line.trim().split(":");
+ String boottime = data.length >= 1 ? data[0] : "";
+ String history = data.length >= 2 ? data[1] : "";
+ if (SystemProperties.get("ro.boottime.zygote").equals(boottime) && !history.isEmpty()) {
+ return history.split(",");
+ } else {
+ return emptyStringArray;
+ }
+ } catch (FileNotFoundException e) {
+ return emptyStringArray;
+ } catch (IOException e) {
+ Slog.e(TAG, "Failed to read file " + TIMEOUT_HISTORY_FILE, e);
+ return emptyStringArray;
+ }
+ }
+
+ private boolean hasActiveUsbConnection() {
+ try {
+ final String state = FileUtils.readTextFile(
+ new File("/sys/class/android_usb/android0/state"),
+ 128 /*max*/, null /*ellipsis*/).trim();
+ if ("CONFIGURED".equals(state)) {
+ return true;
+ }
+ } catch (IOException e) {
+ Slog.w(TAG, "Failed to determine if device was on USB", e);
+ }
+ return false;
+ }
+
+ /** Records this timeout and returns whether a watchdog timeout loop is detected. */
+ private boolean isCrashLoopFound() {
+ int fatalCount = WatchdogProperties.fatal_count().orElse(0);
+ long fatalWindowMs = TimeUnit.SECONDS.toMillis(
+ WatchdogProperties.fatal_window_second().orElse(0));
+ if (fatalCount <= 0 || fatalWindowMs <= 0) { // Unset (0) or invalid: detection is off.
+ if (fatalCount != fatalWindowMs) {
+ Slog.w(TAG, String.format("sysprops '%s' and '%s' should be set or unset together",
+ PROP_FATAL_LOOP_COUNT, PROP_FATAL_LOOP_WINDOWS_SECS));
+ }
+ return false;
+ }
+
+ // new-history = [last (fatalCount - 1) items in old-history] + [nowMs].
+ long nowMs = SystemClock.elapsedRealtime(); // Time since boot including deep sleep.
+ String[] rawCrashHistory = readTimeoutHistory();
+ ArrayList<String> crashHistory = new ArrayList<String>(Arrays.asList(Arrays.copyOfRange(
+ rawCrashHistory,
+ Math.max(0, rawCrashHistory.length - (fatalCount - 1)),
+ rawCrashHistory.length)));
+ // Persist before the early returns below so this timeout is always counted.
+ crashHistory.add(String.valueOf(nowMs));
+ writeTimeoutHistory(crashHistory);
+
+ // Returns false if the device has an active USB connection.
+ if (hasActiveUsbConnection()) {
+ return false;
+ }
+ long firstCrashMs;
+ try {
+ firstCrashMs = Long.parseLong(crashHistory.get(0));
+ } catch (NumberFormatException t) {
+ Slog.w(TAG, "Failed to parseLong " + crashHistory.get(0), t);
+ resetTimeoutHistory();
+ return false;
+ }
+ return crashHistory.size() >= fatalCount && nowMs - firstCrashMs < fatalWindowMs;
+ }
+
+ private void breakCrashLoop() {
+ try (FileWriter kmsg = new FileWriter("/dev/kmsg_debug", /* append= */ true)) {
+ kmsg.append("Fatal reset to escape the system_server crashing loop\n");
+ } catch (IOException e) {
+ Slog.w(TAG, "Failed to append to kmsg", e);
+ }
+ doSysRq('c');
+ }
}