From a0a280846d175e8e08e08ed8c4389bac9a5f25d4 Mon Sep 17 00:00:00 2001 From: Narayan Kamath Date: Mon, 31 Jul 2017 15:58:59 +0100 Subject: [PATCH] system_server : trigger runtime restart when we're close to the soft FD limit. We arbitrarily define a high water mark as 12 below the max limit (1024) and dump the list of open descriptors and restart the system_server when we reach that mark. The list of file descriptors is dumped to a file with prefix /data/anr/anr_fd_xxxxx. This might be construed as a hack, but it allows us to take advantage of existing code in dumpstate that will capture this file and add it to any bugreport that's collected after. Test: Manual Bug: 63004717 Change-Id: I4052625574a3ab2df9ddf591f281a412e7b4b511 --- .../java/com/android/server/Watchdog.java | 149 +++++++++++++++--- 1 file changed, 125 insertions(+), 24 deletions(-) diff --git a/services/core/java/com/android/server/Watchdog.java b/services/core/java/com/android/server/Watchdog.java index 6a81d3211c9d8..8d46d1e272354 100644 --- a/services/core/java/com/android/server/Watchdog.java +++ b/services/core/java/com/android/server/Watchdog.java @@ -18,7 +18,11 @@ package com.android.server; import android.app.IActivityController; import android.os.Binder; +import android.os.Build; import android.os.RemoteException; +import android.system.ErrnoException; +import android.system.OsConstants; +import android.system.StructRlimit; import com.android.internal.os.ZygoteConnectionConstants; import com.android.server.am.ActivityManagerService; @@ -45,6 +49,7 @@ import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; import java.util.List; @@ -107,6 +112,7 @@ public class Watchdog extends Thread { int mPhonePid; IActivityController mController; boolean mAllowRestart = true; + final OpenFdMonitor mOpenFdMonitor; /** * Used for checking status of handle threads and scheduling monitor callbacks. 
@@ -269,6 +275,8 @@ public class Watchdog extends Thread { // Initialize monitor for Binder threads. addMonitor(new BinderThreadMonitor()); + mOpenFdMonitor = OpenFdMonitor.create(); + // See the notes on DEFAULT_TIMEOUT. assert DB || DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS; @@ -358,7 +366,7 @@ public class Watchdog extends Thread { return checkers; } - private String describeCheckersLocked(ArrayList checkers) { + private String describeCheckersLocked(List checkers) { StringBuilder builder = new StringBuilder(128); for (int i=0; i<checkers.size(); i++) { if (builder.length() > 0) { @@ -410,7 +418,7 @@ public class Watchdog extends Thread { public void run() { boolean waitedHalf = false; while (true) { - final ArrayList blockedCheckers; + final List blockedCheckers; final String subject; final boolean allowRestart; int debuggerWasConnected = 0; @@ -447,30 +455,40 @@ public class Watchdog extends Thread { timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); } - final int waitState = evaluateCheckerCompletionLocked(); - if (waitState == COMPLETED) { - // The monitors have returned; reset - waitedHalf = false; - continue; - } else if (waitState == WAITING) { - // still waiting but within their configured intervals; back off and recheck - continue; - } else if (waitState == WAITED_HALF) { - if (!waitedHalf) { - // We've waited half the deadlock-detection interval. Pull a stack - // trace and wait another half. - ArrayList pids = new ArrayList(); - pids.add(Process.myPid()); - ActivityManagerService.dumpStackTraces(true, pids, null, null, - getInterestingNativePids()); - waitedHalf = true; - } - continue; + boolean fdLimitTriggered = false; + if (mOpenFdMonitor != null) { + fdLimitTriggered = mOpenFdMonitor.monitor(); } - // something is overdue! 
- blockedCheckers = getBlockedCheckersLocked(); - subject = describeCheckersLocked(blockedCheckers); + if (!fdLimitTriggered) { + final int waitState = evaluateCheckerCompletionLocked(); + if (waitState == COMPLETED) { + // The monitors have returned; reset + waitedHalf = false; + continue; + } else if (waitState == WAITING) { + // still waiting but within their configured intervals; back off and recheck + continue; + } else if (waitState == WAITED_HALF) { + if (!waitedHalf) { + // We've waited half the deadlock-detection interval. Pull a stack + // trace and wait another half. + ArrayList pids = new ArrayList(); + pids.add(Process.myPid()); + ActivityManagerService.dumpStackTraces(true, pids, null, null, + getInterestingNativePids()); + waitedHalf = true; + } + continue; + } + + // something is overdue! + blockedCheckers = getBlockedCheckersLocked(); + subject = describeCheckersLocked(blockedCheckers); + } else { + blockedCheckers = Collections.emptyList(); + subject = "Open FD high water mark reached"; + } allowRestart = mAllowRestart; } @@ -584,4 +602,87 @@ public class Watchdog extends Thread { } private native void native_dumpKernelStacks(String tracesPath); + + public static final class OpenFdMonitor { + /** + * Number of FDs below the soft limit that we trigger a runtime restart at. This was + * chosen arbitrarily, but will need to be at least 6 in order to have a sufficient number + * of FDs in reserve to complete a dump. + */ + private static final int FD_HIGH_WATER_MARK = 12; + + private final File mDumpDir; + private final File mFdHighWaterMark; + + public static OpenFdMonitor create() { + // Only run the FD monitor on debuggable builds (such as userdebug and eng builds). + if (!Build.IS_DEBUGGABLE) { + return null; + } + + // Don't run the FD monitor on builds that have a global ANR trace file. We're using + // the ANR trace directory as a quick hack in order to get these traces in bugreports + // and we wouldn't want to overwrite something important. 
+ final String dumpDirStr = SystemProperties.get("dalvik.vm.stack-trace-dir", ""); + if (dumpDirStr.isEmpty()) { + return null; + } + + final StructRlimit rlimit; + try { + rlimit = android.system.Os.getrlimit(OsConstants.RLIMIT_NOFILE); + } catch (ErrnoException errno) { + Slog.w(TAG, "Error thrown from getrlimit(RLIMIT_NOFILE)", errno); + return null; + } + + // The assumption we're making here is that FD numbers are allocated (more or less) + // sequentially, which is currently (and historically) true since open is currently + // specified to always return the lowest-numbered non-open file descriptor for the + // current process. + // + // We do this to avoid having to enumerate the contents of /proc/self/fd in order to + // count the number of descriptors open in the process. + final File fdThreshold = new File("/proc/self/fd/" + (rlimit.rlim_cur - FD_HIGH_WATER_MARK)); + return new OpenFdMonitor(new File(dumpDirStr), fdThreshold); + } + + OpenFdMonitor(File dumpDir, File fdThreshold) { + mDumpDir = dumpDir; + mFdHighWaterMark = fdThreshold; + } + + private void dumpOpenDescriptors() { + try { + File dumpFile = File.createTempFile("anr_fd_", "", mDumpDir); + java.lang.Process proc = new ProcessBuilder() + .command("/system/bin/lsof", "-p", String.valueOf(Process.myPid())) + .redirectErrorStream(true) + .redirectOutput(dumpFile) + .start(); + + int returnCode = proc.waitFor(); + if (returnCode != 0) { + Slog.w(TAG, "Unable to dump open descriptors, lsof return code: " + + returnCode); + dumpFile.delete(); + } + } catch (IOException | InterruptedException ex) { + Slog.w(TAG, "Unable to dump open descriptors: " + ex); + } + } + + /** + * @return {@code true} if the high water mark was breached and a dump was written, + * {@code false} otherwise. + */ + public boolean monitor() { + if (mFdHighWaterMark.exists()) { + dumpOpenDescriptors(); + return true; + } + + return false; + } + } }