Watchdog: break timeout loop via system fatal crash
Request a fatal system crash via SysRq when a watchdog timeout loop is
detected. This automatically recovers the device from the hang and also
preserves the system server's context in the memory snapshot.
More details and background: go/break-sys-watchdog-loop
Bug: 141948707
Test: Insert 'sleep_forever()' to block BinderThreadMonitor to reproduce
watchdog timeout.
Change-Id: I3ae4b33b0d7811764c61663ac3718311b55fd048
This commit is contained in:
@@ -19,3 +19,11 @@ sysprop_library {
|
||||
api_packages: ["android.sysprop"],
|
||||
vendor_available: false,
|
||||
}
|
||||
|
||||
sysprop_library {
|
||||
name: "com.android.sysprop.watchdog",
|
||||
srcs: ["WatchdogProperties.sysprop"],
|
||||
property_owner: "Platform",
|
||||
api_packages: ["android.sysprop"],
|
||||
vendor_available: false,
|
||||
}
|
||||
|
||||
45
core/sysprop/WatchdogProperties.sysprop
Normal file
45
core/sysprop/WatchdogProperties.sysprop
Normal file
@@ -0,0 +1,45 @@
|
||||
# Copyright (C) 2020 The Android Open Source Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
module: "android.sysprop.WatchdogProperties"
|
||||
owner: Platform
|
||||
|
||||
# To escape a watchdog timeout loop, trigger a fatal reboot of the system
|
||||
# when the watchdog has timed out 'fatal_count' times within
|
||||
# 'fatal_window_second' seconds. Enabled only if both values are non-zero; both default to 0.
|
||||
prop {
|
||||
api_name: "fatal_count"
|
||||
type: Integer
|
||||
prop_name: "framework_watchdog.fatal_count"
|
||||
scope: Internal
|
||||
access: Readonly
|
||||
}
|
||||
|
||||
prop {
|
||||
api_name: "fatal_window_second"
|
||||
type: Integer
|
||||
prop_name: "framework_watchdog.fatal_window.second"
|
||||
scope: Internal
|
||||
access: Readonly
|
||||
}
|
||||
|
||||
# The fatal counting can be disabled by setting property
|
||||
# 'is_fatal_ignore' to true.
|
||||
prop {
|
||||
api_name: "is_fatal_ignore"
|
||||
type: Boolean
|
||||
prop_name: "persist.debug.framework_watchdog.fatal_ignore"
|
||||
scope: Internal
|
||||
access: Readonly
|
||||
}
|
||||
20
core/sysprop/api/com.android.sysprop.watchdog-current.txt
Normal file
20
core/sysprop/api/com.android.sysprop.watchdog-current.txt
Normal file
@@ -0,0 +1,20 @@
|
||||
props {
|
||||
module: "android.sysprop.WatchdogProperties"
|
||||
prop {
|
||||
api_name: "fatal_count"
|
||||
type: Integer
|
||||
scope: Internal
|
||||
prop_name: "framework_watchdog.fatal_count"
|
||||
}
|
||||
prop {
|
||||
api_name: "fatal_window_second"
|
||||
type: Integer
|
||||
scope: Internal
|
||||
prop_name: "framework_watchdog.fatal_window.second"
|
||||
}
|
||||
prop {
|
||||
api_name: "is_fatal_ignore"
|
||||
scope: Internal
|
||||
prop_name: "persist.debug.framework_watchdog.fatal_ignore"
|
||||
}
|
||||
}
|
||||
20
core/sysprop/api/com.android.sysprop.watchdog-latest.txt
Normal file
20
core/sysprop/api/com.android.sysprop.watchdog-latest.txt
Normal file
@@ -0,0 +1,20 @@
|
||||
props {
|
||||
module: "android.sysprop.WatchdogProperties"
|
||||
prop {
|
||||
api_name: "fatal_count"
|
||||
type: Integer
|
||||
scope: Internal
|
||||
prop_name: "framework_watchdog.fatal_count"
|
||||
}
|
||||
prop {
|
||||
api_name: "fatal_window_second"
|
||||
type: Integer
|
||||
scope: Internal
|
||||
prop_name: "framework_watchdog.fatal_window.second"
|
||||
}
|
||||
prop {
|
||||
api_name: "is_fatal_ignore"
|
||||
scope: Internal
|
||||
prop_name: "persist.debug.framework_watchdog.fatal_ignore"
|
||||
}
|
||||
}
|
||||
@@ -132,6 +132,7 @@ java_library_static {
|
||||
"netd_aidl_interfaces-platform-java",
|
||||
"overlayable_policy_aidl-java",
|
||||
"SurfaceFlingerProperties",
|
||||
"com.android.sysprop.watchdog",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@@ -23,7 +23,9 @@ import android.content.Intent;
|
||||
import android.content.IntentFilter;
|
||||
import android.hidl.manager.V1_0.IServiceManager;
|
||||
import android.os.Binder;
|
||||
import android.os.Build;
|
||||
import android.os.Debug;
|
||||
import android.os.FileUtils;
|
||||
import android.os.Handler;
|
||||
import android.os.IPowerManager;
|
||||
import android.os.Looper;
|
||||
@@ -31,10 +33,12 @@ import android.os.Process;
|
||||
import android.os.RemoteException;
|
||||
import android.os.ServiceManager;
|
||||
import android.os.SystemClock;
|
||||
import android.os.SystemProperties;
|
||||
import android.util.EventLog;
|
||||
import android.util.Log;
|
||||
import android.util.Slog;
|
||||
import android.util.SparseArray;
|
||||
import android.sysprop.WatchdogProperties;
|
||||
|
||||
import com.android.internal.os.ProcessCpuTracker;
|
||||
import com.android.internal.os.ZygoteConnectionConstants;
|
||||
@@ -42,12 +46,16 @@ import com.android.internal.util.FrameworkStatsLog;
|
||||
import com.android.server.am.ActivityManagerService;
|
||||
import com.android.server.wm.SurfaceAnimationThread;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileReader;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
@@ -75,6 +83,12 @@ public class Watchdog extends Thread {
|
||||
private static final int WAITED_HALF = 2;
|
||||
private static final int OVERDUE = 3;
|
||||
|
||||
// Track the watchdog timeout history and break the crash loop if one is detected.
|
||||
private static final String TIMEOUT_HISTORY_FILE = "/data/system/watchdog-timeout-history.txt";
|
||||
private static final String PROP_FATAL_LOOP_COUNT = "framework_watchdog.fatal_count";
|
||||
private static final String PROP_FATAL_LOOP_WINDOWS_SECS =
|
||||
"framework_watchdog.fatal_window.second";
|
||||
|
||||
// Which native processes to dump into dropbox's stack traces
|
||||
public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
|
||||
"/system/bin/audioserver",
|
||||
@@ -688,6 +702,10 @@ public class Watchdog extends Thread {
|
||||
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
|
||||
WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
|
||||
Slog.w(TAG, "*** GOODBYE!");
|
||||
if (!Build.IS_USER && isCrashLoopFound()
|
||||
&& !WatchdogProperties.is_fatal_ignore().orElse(false)) {
|
||||
breakCrashLoop();
|
||||
}
|
||||
Process.killProcess(Process.myPid());
|
||||
System.exit(10);
|
||||
}
|
||||
@@ -705,4 +723,107 @@ public class Watchdog extends Thread {
|
||||
Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
|
||||
}
|
||||
}
|
||||
|
||||
private void resetTimeoutHistory() {
|
||||
writeTimeoutHistory(new ArrayList<String>());
|
||||
}
|
||||
|
||||
private void writeTimeoutHistory(Iterable<String> crashHistory) {
|
||||
String data = String.join(",", crashHistory);
|
||||
|
||||
try (FileWriter writer = new FileWriter(TIMEOUT_HISTORY_FILE)) {
|
||||
writer.write(SystemProperties.get("ro.boottime.zygote"));
|
||||
writer.write(":");
|
||||
writer.write(data);
|
||||
} catch (IOException e) {
|
||||
Slog.e(TAG, "Failed to write file " + TIMEOUT_HISTORY_FILE, e);
|
||||
}
|
||||
}
|
||||
|
||||
private String[] readTimeoutHistory() {
|
||||
final String[] emptyStringArray = {};
|
||||
|
||||
try (BufferedReader reader = new BufferedReader(new FileReader(TIMEOUT_HISTORY_FILE))) {
|
||||
String line = reader.readLine();
|
||||
if (line == null) {
|
||||
return emptyStringArray;
|
||||
}
|
||||
|
||||
String[] data = line.trim().split(":");
|
||||
String boottime = data.length >= 1 ? data[0] : "";
|
||||
String history = data.length >= 2 ? data[1] : "";
|
||||
if (SystemProperties.get("ro.boottime.zygote").equals(boottime) && !history.isEmpty()) {
|
||||
return history.split(",");
|
||||
} else {
|
||||
return emptyStringArray;
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
return emptyStringArray;
|
||||
} catch (IOException e) {
|
||||
Slog.e(TAG, "Failed to read file " + TIMEOUT_HISTORY_FILE, e);
|
||||
return emptyStringArray;
|
||||
}
|
||||
}
|
||||
|
||||
private boolean hasActiveUsbConnection() {
|
||||
try {
|
||||
final String state = FileUtils.readTextFile(
|
||||
new File("/sys/class/android_usb/android0/state"),
|
||||
128 /*max*/, null /*ellipsis*/).trim();
|
||||
if ("CONFIGURED".equals(state)) {
|
||||
return true;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
Slog.w(TAG, "Failed to determine if device was on USB", e);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean isCrashLoopFound() {
|
||||
int fatalCount = WatchdogProperties.fatal_count().orElse(0);
|
||||
long fatalWindowMs = TimeUnit.SECONDS.toMillis(
|
||||
WatchdogProperties.fatal_window_second().orElse(0));
|
||||
if (fatalCount == 0 || fatalWindowMs == 0) {
|
||||
if (fatalCount != fatalWindowMs) {
|
||||
Slog.w(TAG, String.format("sysprops '%s' and '%s' should be set or unset together",
|
||||
PROP_FATAL_LOOP_COUNT, PROP_FATAL_LOOP_WINDOWS_SECS));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// new-history = [last (fatalCount - 1) items in old-history] + [nowMs].
|
||||
long nowMs = SystemClock.elapsedRealtime(); // Time since boot including deep sleep.
|
||||
String[] rawCrashHistory = readTimeoutHistory();
|
||||
ArrayList<String> crashHistory = new ArrayList<String>(Arrays.asList(Arrays.copyOfRange(
|
||||
rawCrashHistory,
|
||||
Math.max(0, rawCrashHistory.length - fatalCount - 1),
|
||||
rawCrashHistory.length)));
|
||||
// Something wrong here.
|
||||
crashHistory.add(String.valueOf(nowMs));
|
||||
writeTimeoutHistory(crashHistory);
|
||||
|
||||
// Returns false if the device has an active USB connection.
|
||||
if (hasActiveUsbConnection()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
long firstCrashMs;
|
||||
try {
|
||||
firstCrashMs = Long.parseLong(crashHistory.get(0));
|
||||
} catch (NumberFormatException t) {
|
||||
Slog.w(TAG, "Failed to parseLong " + crashHistory.get(0), t);
|
||||
resetTimeoutHistory();
|
||||
return false;
|
||||
}
|
||||
return crashHistory.size() >= fatalCount && nowMs - firstCrashMs < fatalWindowMs;
|
||||
}
|
||||
|
||||
private void breakCrashLoop() {
|
||||
try (FileWriter kmsg = new FileWriter("/dev/kmsg_debug", /* append= */ true)) {
|
||||
kmsg.append("Fatal reset to escape the system_server crashing loop\n");
|
||||
} catch (IOException e) {
|
||||
Slog.w(TAG, "Failed to append to kmsg", e);
|
||||
}
|
||||
doSysRq('c');
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user