Move native crash detection to Package Watchdog

For the sake of consolidating various error detection mechanisms,
move native crash detection to Package Watchdog. Add a method
to allow the traditional threshold logic to be bypassed in this
case. This method will be used in the future for prioritizing
explicit health check failures.

Test: atest StagedRollbackTest#testNativeWatchdogTriggersRollback
Bug: 145584672
Change-Id: I98eb9f45a6f4a6d15001650e31ba9c596905663a
This commit is contained in:
Gavin Corkery
2019-12-10 17:18:54 +00:00
parent ab43bef713
commit f9b3fd49c4
3 changed files with 126 additions and 65 deletions

View File

@@ -29,6 +29,7 @@ import android.net.ConnectivityModuleConnector;
import android.os.Environment;
import android.os.Handler;
import android.os.Looper;
import android.os.SystemProperties;
import android.provider.DeviceConfig;
import android.text.TextUtils;
import android.util.ArrayMap;
@@ -82,6 +83,12 @@ public class PackageWatchdog {
static final String PROPERTY_WATCHDOG_EXPLICIT_HEALTH_CHECK_ENABLED =
"watchdog_explicit_health_check_enabled";
// TODO: make the following values configurable via DeviceConfig
private static final long NATIVE_CRASH_POLLING_INTERVAL_MILLIS =
TimeUnit.SECONDS.toMillis(30);
private static final long NUMBER_OF_NATIVE_CRASH_POLLS = 10;
public static final int FAILURE_REASON_UNKNOWN = 0;
public static final int FAILURE_REASON_NATIVE_CRASH = 1;
public static final int FAILURE_REASON_EXPLICIT_HEALTH_CHECK = 2;
@@ -110,6 +117,8 @@ public class PackageWatchdog {
// Whether explicit health checks are enabled or not
private static final boolean DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED = true;
// Number of native crash polls still allowed. Initialized to NUMBER_OF_NATIVE_CRASH_POLLS and
// decremented by every run of checkAndMitigateNativeCrashes(); polling is not re-posted once
// it reaches zero.
private long mNumberOfNativeCrashPollsRemaining;
private static final int DB_VERSION = 1;
private static final String TAG_PACKAGE_WATCHDOG = "package-watchdog";
private static final String TAG_PACKAGE = "package";
@@ -188,6 +197,7 @@ public class PackageWatchdog {
mHealthCheckController = controller;
mConnectivityModuleConnector = connectivityModuleConnector;
mSystemClock = clock;
mNumberOfNativeCrashPollsRemaining = NUMBER_OF_NATIVE_CRASH_POLLS;
loadFromFile();
}
@@ -337,37 +347,68 @@ public class PackageWatchdog {
return;
}
for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
VersionedPackage versionedPackage = packages.get(pIndex);
// Observer that will receive failure for versionedPackage
PackageHealthObserver currentObserverToNotify = null;
int currentObserverImpact = Integer.MAX_VALUE;
if (failureReason == FAILURE_REASON_NATIVE_CRASH) {
handleFailureImmediately(packages, failureReason);
} else {
for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
VersionedPackage versionedPackage = packages.get(pIndex);
// Observer that will receive failure for versionedPackage
PackageHealthObserver currentObserverToNotify = null;
int currentObserverImpact = Integer.MAX_VALUE;
// Find observer with least user impact
for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
ObserverInternal observer = mAllObservers.valueAt(oIndex);
PackageHealthObserver registeredObserver = observer.registeredObserver;
if (registeredObserver != null
&& observer.onPackageFailureLocked(
versionedPackage.getPackageName())) {
int impact = registeredObserver.onHealthCheckFailed(versionedPackage);
if (impact != PackageHealthObserverImpact.USER_IMPACT_NONE
&& impact < currentObserverImpact) {
currentObserverToNotify = registeredObserver;
currentObserverImpact = impact;
// Find observer with least user impact
for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
ObserverInternal observer = mAllObservers.valueAt(oIndex);
PackageHealthObserver registeredObserver = observer.registeredObserver;
if (registeredObserver != null
&& observer.onPackageFailureLocked(
versionedPackage.getPackageName())) {
int impact = registeredObserver.onHealthCheckFailed(
versionedPackage, failureReason);
if (impact != PackageHealthObserverImpact.USER_IMPACT_NONE
&& impact < currentObserverImpact) {
currentObserverToNotify = registeredObserver;
currentObserverImpact = impact;
}
}
}
}
// Execute action with least user impact
if (currentObserverToNotify != null) {
currentObserverToNotify.execute(versionedPackage, failureReason);
// Execute action with least user impact
if (currentObserverToNotify != null) {
currentObserverToNotify.execute(versionedPackage, failureReason);
}
}
}
}
});
}
/**
 * Mitigates a failure right away, without going through the failure threshold logic. Used
 * for native crashes.
 *
 * <p>Every registered observer is asked for the user impact of its mitigation; the observer
 * reporting the lowest impact other than {@code USER_IMPACT_NONE} is then asked to execute
 * it. On a tie, the observer encountered first is kept.
 *
 * @param packages the failing packages; only the first entry is used, and {@code null} is
 *                 handed to the observers when the list is empty
 * @param failureReason the type of failure being reported
 */
private void handleFailureImmediately(List<VersionedPackage> packages,
        @FailureReasons int failureReason) {
    final VersionedPackage target = packages.isEmpty() ? null : packages.get(0);
    PackageHealthObserver leastImpactObserver = null;
    int leastImpact = Integer.MAX_VALUE;
    for (ObserverInternal internal : mAllObservers.values()) {
        final PackageHealthObserver candidate = internal.registeredObserver;
        if (candidate == null) {
            // Nothing registered for this entry, so there is nobody to ask.
            continue;
        }
        final int impact = candidate.onHealthCheckFailed(target, failureReason);
        if (impact == PackageHealthObserverImpact.USER_IMPACT_NONE
                || impact >= leastImpact) {
            // Either the observer declines to act or it is not strictly better than the
            // current choice.
            continue;
        }
        leastImpactObserver = candidate;
        leastImpact = impact;
    }
    if (leastImpactObserver != null) {
        leastImpactObserver.execute(target, failureReason);
    }
}
// TODO(b/120598832): Optimize write? Maybe only write a separate smaller file? Also
// avoid holding lock?
// This currently adds about 7ms extra to shutdown thread
@@ -400,6 +441,37 @@ public class PackageWatchdog {
}
}
/**
 * Polls for a crash reported by the native watchdog and, if one is found, reports a native
 * crash failure with no attributed package.
 *
 * <p>Each call consumes one poll. When no crash is reported and polls remain, the check is
 * posted again on mShortTaskHandler after NATIVE_CRASH_POLLING_INTERVAL_MILLIS.
 *
 * <p>This method should be only called on mShortTaskHandler, since it modifies
 * {@link #mNumberOfNativeCrashPollsRemaining}.
 */
private void checkAndMitigateNativeCrashes() {
    mNumberOfNativeCrashPollsRemaining--;
    // Check if native watchdog reported a crash
    if ("1".equals(SystemProperties.get("sys.init.updatable_crashing"))) {
        // We roll back everything available when the crash is unattributable, so no package
        // is named. Collections.emptyList() is used instead of the raw EMPTY_LIST constant
        // to keep the call type-safe.
        onPackageFailure(Collections.emptyList(), FAILURE_REASON_NATIVE_CRASH);
        // We stop polling after an attempt to execute rollback, regardless of whether the
        // attempt succeeds or not
        return;
    }
    if (mNumberOfNativeCrashPollsRemaining > 0) {
        mShortTaskHandler.postDelayed(() -> checkAndMitigateNativeCrashes(),
                NATIVE_CRASH_POLLING_INTERVAL_MILLIS);
    }
}
/**
 * Starts native crash polling by posting the first check to mShortTaskHandler.
 *
 * <p>Because a detected crash can eventually trigger a rollback, call this only once boot has
 * completed ({@code onBootCompleted}) and not earlier: the install session must be entirely
 * completed before we try to rollback.
 */
public void scheduleCheckAndMitigateNativeCrashes() {
    final String message = "Scheduling " + mNumberOfNativeCrashPollsRemaining
            + " polls to check " + "and mitigate native crashes";
    Slog.i(TAG, message);
    mShortTaskHandler.post(this::checkAndMitigateNativeCrashes);
}
/** Possible severity values of the user impact of a {@link PackageHealthObserver#execute}. */
@Retention(SOURCE)
@IntDef(value = {PackageHealthObserverImpact.USER_IMPACT_NONE,
@@ -422,17 +494,28 @@ public class PackageWatchdog {
/**
* Called when health check fails for the {@code versionedPackage}.
*
* @param versionedPackage the package that is failing. This may be null if a native
* service is crashing.
* @param failureReason the type of failure that is occurring.
*
* @return any one of {@link PackageHealthObserverImpact} to express the impact
* to the user on {@link #execute}
*/
@PackageHealthObserverImpact int onHealthCheckFailed(VersionedPackage versionedPackage);
@PackageHealthObserverImpact int onHealthCheckFailed(
@Nullable VersionedPackage versionedPackage,
@FailureReasons int failureReason);
/**
* Executes mitigation for {@link #onHealthCheckFailed}.
*
* @param versionedPackage the package that is failing. This may be null if a native
* service is crashing.
* @param failureReason the type of failure that is occurring.
* @return {@code true} if action was executed successfully, {@code false} otherwise
*/
boolean execute(VersionedPackage versionedPackage, @FailureReasons int failureReason);
boolean execute(@Nullable VersionedPackage versionedPackage,
@FailureReasons int failureReason);
// TODO(b/120598832): Ensure uniqueness?
/**

View File

@@ -61,7 +61,6 @@ import java.io.PrintWriter;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
/**
* {@link PackageHealthObserver} for {@link RollbackManagerService}.
@@ -74,10 +73,6 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
private static final String TAG = "RollbackPackageHealthObserver";
private static final String NAME = "rollback-observer";
private static final int INVALID_ROLLBACK_ID = -1;
// TODO: make the following values configurable via DeviceConfig
private static final long NATIVE_CRASH_POLLING_INTERVAL_MILLIS =
TimeUnit.SECONDS.toMillis(30);
private static final long NUMBER_OF_NATIVE_CRASH_POLLS = 10;
private final Context mContext;
private final Handler mHandler;
@@ -85,13 +80,9 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
// Staged rollback ids that have been committed but their session is not yet ready
@GuardedBy("mPendingStagedRollbackIds")
private final Set<Integer> mPendingStagedRollbackIds = new ArraySet<>();
// this field is initialized in the c'tor and then only accessed from mHandler thread, so
// no need to guard with a lock
private long mNumberOfNativeCrashPollsRemaining;
RollbackPackageHealthObserver(Context context) {
mContext = context;
mNumberOfNativeCrashPollsRemaining = NUMBER_OF_NATIVE_CRASH_POLLS;
HandlerThread handlerThread = new HandlerThread("RollbackPackageHealthObserver");
handlerThread.start();
mHandler = handlerThread.getThreadHandler();
@@ -102,7 +93,14 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
}
@Override
public int onHealthCheckFailed(VersionedPackage failedPackage) {
public int onHealthCheckFailed(@Nullable VersionedPackage failedPackage,
@FailureReasons int failureReason) {
// For native crashes, we will roll back any available rollbacks
if (failureReason == PackageWatchdog.FAILURE_REASON_NATIVE_CRASH
&& !mContext.getSystemService(RollbackManager.class)
.getAvailableRollbacks().isEmpty()) {
return PackageHealthObserverImpact.USER_IMPACT_MEDIUM;
}
if (getAvailableRollback(failedPackage) == null) {
// Don't handle the notification, no rollbacks available for the package
return PackageHealthObserverImpact.USER_IMPACT_NONE;
@@ -113,7 +111,13 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
}
@Override
public boolean execute(VersionedPackage failedPackage, @FailureReasons int rollbackReason) {
public boolean execute(@Nullable VersionedPackage failedPackage,
@FailureReasons int rollbackReason) {
if (rollbackReason == PackageWatchdog.FAILURE_REASON_NATIVE_CRASH) {
rollbackAll();
return true;
}
RollbackInfo rollback = getAvailableRollback(failedPackage);
if (rollback == null) {
Slog.w(TAG, "Expected rollback but no valid rollback found for package: [ "
@@ -152,7 +156,8 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
String moduleMetadataPackageName = getModuleMetadataPackageName();
if (!rollbackManager.getAvailableRollbacks().isEmpty()) {
scheduleCheckAndMitigateNativeCrashes();
// TODO(gavincorkery): Call into Package Watchdog from outside the observer
PackageWatchdog.getInstance(mContext).scheduleCheckAndMitigateNativeCrashes();
}
int rollbackId = popLastStagedRollbackId();
@@ -343,24 +348,6 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
}
}
/**
 * Consumes one native crash poll: rolls back everything if the native watchdog has reported
 * a crash, otherwise re-posts itself while polls remain.
 *
 * <p>Must only run on the mHandler thread, since it modifies
 * {@link #mNumberOfNativeCrashPollsRemaining} and we want to keep this class lock free.
 */
private void checkAndMitigateNativeCrashes() {
    mNumberOfNativeCrashPollsRemaining--;
    final boolean crashReported =
            "1".equals(SystemProperties.get("sys.init.updatable_crashing"));
    if (crashReported) {
        rollbackAll();
        // Polling ends after a rollback attempt, whether or not the attempt succeeds.
        return;
    }
    if (mNumberOfNativeCrashPollsRemaining > 0) {
        mHandler.postDelayed(this::checkAndMitigateNativeCrashes,
                NATIVE_CRASH_POLLING_INTERVAL_MILLIS);
    }
}
/**
* Returns true if the package name is the name of a module.
@@ -456,16 +443,6 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
}
}
/**
 * Posts the first native crash check to mHandler.
 *
 * <p>As this can eventually trigger a RollbackManager rollback, it should be called only
 * once boot has completed ({@code onBootCompleted}) and not earlier, because the install
 * session must be entirely completed before we try to rollback.
 */
private void scheduleCheckAndMitigateNativeCrashes() {
    final String message = "Scheduling " + mNumberOfNativeCrashPollsRemaining
            + " polls to check " + "and mitigate native crashes";
    Slog.i(TAG, message);
    mHandler.post(this::checkAndMitigateNativeCrashes);
}
private int mapFailureReasonToMetric(@FailureReasons int failureReason) {
switch (failureReason) {

View File

@@ -328,7 +328,8 @@ public class PackageWatchdogTest {
long differentVersionCode = 2L;
TestObserver observer = new TestObserver(OBSERVER_NAME_1) {
@Override
public int onHealthCheckFailed(VersionedPackage versionedPackage) {
public int onHealthCheckFailed(VersionedPackage versionedPackage,
int failureReason) {
if (versionedPackage.getVersionCode() == VERSION_CODE) {
// Only rollback for specific versionCode
return PackageHealthObserverImpact.USER_IMPACT_MEDIUM;
@@ -1012,7 +1013,7 @@ public class PackageWatchdogTest {
mImpact = impact;
}
public int onHealthCheckFailed(VersionedPackage versionedPackage) {
public int onHealthCheckFailed(VersionedPackage versionedPackage, int failureReason) {
mHealthCheckFailedPackages.add(versionedPackage.getPackageName());
return mImpact;
}