Move native crash detection to Package Watchdog
For the sake of consolidating various error detection mechanisms, move native crash detection to Package Watchdog. Add a method to allow the traditional threshold logic to be bypassed in this case. This method will be used in the future for prioritizing explicit health check failures.

Test: atest StagedRollbackTest#testNativeWatchdogTriggersRollback
Bug: 145584672
Change-Id: I98eb9f45a6f4a6d15001650e31ba9c596905663a
This commit is contained in:
@@ -29,6 +29,7 @@ import android.net.ConnectivityModuleConnector;
|
||||
import android.os.Environment;
|
||||
import android.os.Handler;
|
||||
import android.os.Looper;
|
||||
import android.os.SystemProperties;
|
||||
import android.provider.DeviceConfig;
|
||||
import android.text.TextUtils;
|
||||
import android.util.ArrayMap;
|
||||
@@ -82,6 +83,12 @@ public class PackageWatchdog {
|
||||
static final String PROPERTY_WATCHDOG_EXPLICIT_HEALTH_CHECK_ENABLED =
|
||||
"watchdog_explicit_health_check_enabled";
|
||||
|
||||
// TODO: make the following values configurable via DeviceConfig
|
||||
private static final long NATIVE_CRASH_POLLING_INTERVAL_MILLIS =
|
||||
TimeUnit.SECONDS.toMillis(30);
|
||||
private static final long NUMBER_OF_NATIVE_CRASH_POLLS = 10;
|
||||
|
||||
|
||||
public static final int FAILURE_REASON_UNKNOWN = 0;
|
||||
public static final int FAILURE_REASON_NATIVE_CRASH = 1;
|
||||
public static final int FAILURE_REASON_EXPLICIT_HEALTH_CHECK = 2;
|
||||
@@ -110,6 +117,8 @@ public class PackageWatchdog {
|
||||
// Whether explicit health checks are enabled or not
|
||||
private static final boolean DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED = true;
|
||||
|
||||
private long mNumberOfNativeCrashPollsRemaining;
|
||||
|
||||
private static final int DB_VERSION = 1;
|
||||
private static final String TAG_PACKAGE_WATCHDOG = "package-watchdog";
|
||||
private static final String TAG_PACKAGE = "package";
|
||||
@@ -188,6 +197,7 @@ public class PackageWatchdog {
|
||||
mHealthCheckController = controller;
|
||||
mConnectivityModuleConnector = connectivityModuleConnector;
|
||||
mSystemClock = clock;
|
||||
mNumberOfNativeCrashPollsRemaining = NUMBER_OF_NATIVE_CRASH_POLLS;
|
||||
loadFromFile();
|
||||
}
|
||||
|
||||
@@ -337,37 +347,68 @@ public class PackageWatchdog {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
|
||||
VersionedPackage versionedPackage = packages.get(pIndex);
|
||||
// Observer that will receive failure for versionedPackage
|
||||
PackageHealthObserver currentObserverToNotify = null;
|
||||
int currentObserverImpact = Integer.MAX_VALUE;
|
||||
if (failureReason == FAILURE_REASON_NATIVE_CRASH) {
|
||||
handleFailureImmediately(packages, failureReason);
|
||||
} else {
|
||||
for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
|
||||
VersionedPackage versionedPackage = packages.get(pIndex);
|
||||
// Observer that will receive failure for versionedPackage
|
||||
PackageHealthObserver currentObserverToNotify = null;
|
||||
int currentObserverImpact = Integer.MAX_VALUE;
|
||||
|
||||
// Find observer with least user impact
|
||||
for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
|
||||
ObserverInternal observer = mAllObservers.valueAt(oIndex);
|
||||
PackageHealthObserver registeredObserver = observer.registeredObserver;
|
||||
if (registeredObserver != null
|
||||
&& observer.onPackageFailureLocked(
|
||||
versionedPackage.getPackageName())) {
|
||||
int impact = registeredObserver.onHealthCheckFailed(versionedPackage);
|
||||
if (impact != PackageHealthObserverImpact.USER_IMPACT_NONE
|
||||
&& impact < currentObserverImpact) {
|
||||
currentObserverToNotify = registeredObserver;
|
||||
currentObserverImpact = impact;
|
||||
// Find observer with least user impact
|
||||
for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
|
||||
ObserverInternal observer = mAllObservers.valueAt(oIndex);
|
||||
PackageHealthObserver registeredObserver = observer.registeredObserver;
|
||||
if (registeredObserver != null
|
||||
&& observer.onPackageFailureLocked(
|
||||
versionedPackage.getPackageName())) {
|
||||
int impact = registeredObserver.onHealthCheckFailed(
|
||||
versionedPackage, failureReason);
|
||||
if (impact != PackageHealthObserverImpact.USER_IMPACT_NONE
|
||||
&& impact < currentObserverImpact) {
|
||||
currentObserverToNotify = registeredObserver;
|
||||
currentObserverImpact = impact;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Execute action with least user impact
|
||||
if (currentObserverToNotify != null) {
|
||||
currentObserverToNotify.execute(versionedPackage, failureReason);
|
||||
// Execute action with least user impact
|
||||
if (currentObserverToNotify != null) {
|
||||
currentObserverToNotify.execute(versionedPackage, failureReason);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* For native crashes, call directly into each observer to mitigate the error without going
|
||||
* through failure threshold logic.
|
||||
*/
|
||||
private void handleFailureImmediately(List<VersionedPackage> packages,
|
||||
@FailureReasons int failureReason) {
|
||||
VersionedPackage failingPackage = packages.size() > 0 ? packages.get(0) : null;
|
||||
PackageHealthObserver currentObserverToNotify = null;
|
||||
int currentObserverImpact = Integer.MAX_VALUE;
|
||||
for (ObserverInternal observer: mAllObservers.values()) {
|
||||
PackageHealthObserver registeredObserver = observer.registeredObserver;
|
||||
if (registeredObserver != null) {
|
||||
int impact = registeredObserver.onHealthCheckFailed(
|
||||
failingPackage, failureReason);
|
||||
if (impact != PackageHealthObserverImpact.USER_IMPACT_NONE
|
||||
&& impact < currentObserverImpact) {
|
||||
currentObserverToNotify = registeredObserver;
|
||||
currentObserverImpact = impact;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (currentObserverToNotify != null) {
|
||||
currentObserverToNotify.execute(failingPackage, failureReason);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(b/120598832): Optimize write? Maybe only write a separate smaller file? Also
|
||||
// avoid holding lock?
|
||||
// This currently adds about 7ms extra to shutdown thread
|
||||
@@ -400,6 +441,37 @@ public class PackageWatchdog {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This method should be only called on mShortTaskHandler, since it modifies
|
||||
* {@link #mNumberOfNativeCrashPollsRemaining}.
|
||||
*/
|
||||
private void checkAndMitigateNativeCrashes() {
|
||||
mNumberOfNativeCrashPollsRemaining--;
|
||||
// Check if native watchdog reported a crash
|
||||
if ("1".equals(SystemProperties.get("sys.init.updatable_crashing"))) {
|
||||
// We rollback everything available when crash is unattributable
|
||||
onPackageFailure(Collections.EMPTY_LIST, FAILURE_REASON_NATIVE_CRASH);
|
||||
// we stop polling after an attempt to execute rollback, regardless of whether the
|
||||
// attempt succeeds or not
|
||||
} else {
|
||||
if (mNumberOfNativeCrashPollsRemaining > 0) {
|
||||
mShortTaskHandler.postDelayed(() -> checkAndMitigateNativeCrashes(),
|
||||
NATIVE_CRASH_POLLING_INTERVAL_MILLIS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Since this method can eventually trigger a rollback, it should be called
|
||||
* only once boot has completed {@code onBootCompleted} and not earlier, because the install
|
||||
* session must be entirely completed before we try to rollback.
|
||||
*/
|
||||
public void scheduleCheckAndMitigateNativeCrashes() {
|
||||
Slog.i(TAG, "Scheduling " + mNumberOfNativeCrashPollsRemaining + " polls to check "
|
||||
+ "and mitigate native crashes");
|
||||
mShortTaskHandler.post(()->checkAndMitigateNativeCrashes());
|
||||
}
|
||||
|
||||
/** Possible severity values of the user impact of a {@link PackageHealthObserver#execute}. */
|
||||
@Retention(SOURCE)
|
||||
@IntDef(value = {PackageHealthObserverImpact.USER_IMPACT_NONE,
|
||||
@@ -422,17 +494,28 @@ public class PackageWatchdog {
|
||||
/**
|
||||
* Called when health check fails for the {@code versionedPackage}.
|
||||
*
|
||||
* @param versionedPackage the package that is failing. This may be null if a native
|
||||
* service is crashing.
|
||||
* @param failureReason the type of failure that is occurring.
|
||||
*
|
||||
*
|
||||
* @return any one of {@link PackageHealthObserverImpact} to express the impact
|
||||
* to the user on {@link #execute}
|
||||
*/
|
||||
@PackageHealthObserverImpact int onHealthCheckFailed(VersionedPackage versionedPackage);
|
||||
@PackageHealthObserverImpact int onHealthCheckFailed(
|
||||
@Nullable VersionedPackage versionedPackage,
|
||||
@FailureReasons int failureReason);
|
||||
|
||||
/**
|
||||
* Executes mitigation for {@link #onHealthCheckFailed}.
|
||||
*
|
||||
* @param versionedPackage the package that is failing. This may be null if a native
|
||||
* service is crashing.
|
||||
* @param failureReason the type of failure that is occurring.
|
||||
* @return {@code true} if action was executed successfully, {@code false} otherwise
|
||||
*/
|
||||
boolean execute(VersionedPackage versionedPackage, @FailureReasons int failureReason);
|
||||
boolean execute(@Nullable VersionedPackage versionedPackage,
|
||||
@FailureReasons int failureReason);
|
||||
|
||||
// TODO(b/120598832): Ensure uniqueness?
|
||||
/**
|
||||
|
||||
@@ -61,7 +61,6 @@ import java.io.PrintWriter;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
|
||||
* {@link PackageHealthObserver} for {@link RollbackManagerService}.
|
||||
@@ -74,10 +73,6 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
|
||||
private static final String TAG = "RollbackPackageHealthObserver";
|
||||
private static final String NAME = "rollback-observer";
|
||||
private static final int INVALID_ROLLBACK_ID = -1;
|
||||
// TODO: make the following values configurable via DeviceConfig
|
||||
private static final long NATIVE_CRASH_POLLING_INTERVAL_MILLIS =
|
||||
TimeUnit.SECONDS.toMillis(30);
|
||||
private static final long NUMBER_OF_NATIVE_CRASH_POLLS = 10;
|
||||
|
||||
private final Context mContext;
|
||||
private final Handler mHandler;
|
||||
@@ -85,13 +80,9 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
|
||||
// Staged rollback ids that have been committed but their session is not yet ready
|
||||
@GuardedBy("mPendingStagedRollbackIds")
|
||||
private final Set<Integer> mPendingStagedRollbackIds = new ArraySet<>();
|
||||
// this field is initialized in the c'tor and then only accessed from mHandler thread, so
|
||||
// no need to guard with a lock
|
||||
private long mNumberOfNativeCrashPollsRemaining;
|
||||
|
||||
RollbackPackageHealthObserver(Context context) {
|
||||
mContext = context;
|
||||
mNumberOfNativeCrashPollsRemaining = NUMBER_OF_NATIVE_CRASH_POLLS;
|
||||
HandlerThread handlerThread = new HandlerThread("RollbackPackageHealthObserver");
|
||||
handlerThread.start();
|
||||
mHandler = handlerThread.getThreadHandler();
|
||||
@@ -102,7 +93,14 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
|
||||
}
|
||||
|
||||
@Override
|
||||
public int onHealthCheckFailed(VersionedPackage failedPackage) {
|
||||
public int onHealthCheckFailed(@Nullable VersionedPackage failedPackage,
|
||||
@FailureReasons int failureReason) {
|
||||
// For native crashes, we will roll back any available rollbacks
|
||||
if (failureReason == PackageWatchdog.FAILURE_REASON_NATIVE_CRASH
|
||||
&& !mContext.getSystemService(RollbackManager.class)
|
||||
.getAvailableRollbacks().isEmpty()) {
|
||||
return PackageHealthObserverImpact.USER_IMPACT_MEDIUM;
|
||||
}
|
||||
if (getAvailableRollback(failedPackage) == null) {
|
||||
// Don't handle the notification, no rollbacks available for the package
|
||||
return PackageHealthObserverImpact.USER_IMPACT_NONE;
|
||||
@@ -113,7 +111,13 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean execute(VersionedPackage failedPackage, @FailureReasons int rollbackReason) {
|
||||
public boolean execute(@Nullable VersionedPackage failedPackage,
|
||||
@FailureReasons int rollbackReason) {
|
||||
if (rollbackReason == PackageWatchdog.FAILURE_REASON_NATIVE_CRASH) {
|
||||
rollbackAll();
|
||||
return true;
|
||||
}
|
||||
|
||||
RollbackInfo rollback = getAvailableRollback(failedPackage);
|
||||
if (rollback == null) {
|
||||
Slog.w(TAG, "Expected rollback but no valid rollback found for package: [ "
|
||||
@@ -152,7 +156,8 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
|
||||
String moduleMetadataPackageName = getModuleMetadataPackageName();
|
||||
|
||||
if (!rollbackManager.getAvailableRollbacks().isEmpty()) {
|
||||
scheduleCheckAndMitigateNativeCrashes();
|
||||
// TODO(gavincorkery): Call into Package Watchdog from outside the observer
|
||||
PackageWatchdog.getInstance(mContext).scheduleCheckAndMitigateNativeCrashes();
|
||||
}
|
||||
|
||||
int rollbackId = popLastStagedRollbackId();
|
||||
@@ -343,24 +348,6 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This method should be only called on mHandler thread, since it modifies
|
||||
* {@link #mNumberOfNativeCrashPollsRemaining} and we want to keep this class lock free.
|
||||
*/
|
||||
private void checkAndMitigateNativeCrashes() {
|
||||
mNumberOfNativeCrashPollsRemaining--;
|
||||
// Check if native watchdog reported a crash
|
||||
if ("1".equals(SystemProperties.get("sys.init.updatable_crashing"))) {
|
||||
rollbackAll();
|
||||
// we stop polling after an attempt to execute rollback, regardless of whether the
|
||||
// attempt succeeds or not
|
||||
} else {
|
||||
if (mNumberOfNativeCrashPollsRemaining > 0) {
|
||||
mHandler.postDelayed(() -> checkAndMitigateNativeCrashes(),
|
||||
NATIVE_CRASH_POLLING_INTERVAL_MILLIS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the package name is the name of a module.
|
||||
@@ -456,16 +443,6 @@ public final class RollbackPackageHealthObserver implements PackageHealthObserve
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Since this method can eventually trigger a RollbackManager rollback, it should be called
|
||||
* only once boot has completed {@code onBootCompleted} and not earlier, because the install
|
||||
* session must be entirely completed before we try to rollback.
|
||||
*/
|
||||
private void scheduleCheckAndMitigateNativeCrashes() {
|
||||
Slog.i(TAG, "Scheduling " + mNumberOfNativeCrashPollsRemaining + " polls to check "
|
||||
+ "and mitigate native crashes");
|
||||
mHandler.post(()->checkAndMitigateNativeCrashes());
|
||||
}
|
||||
|
||||
private int mapFailureReasonToMetric(@FailureReasons int failureReason) {
|
||||
switch (failureReason) {
|
||||
|
||||
@@ -328,7 +328,8 @@ public class PackageWatchdogTest {
|
||||
long differentVersionCode = 2L;
|
||||
TestObserver observer = new TestObserver(OBSERVER_NAME_1) {
|
||||
@Override
|
||||
public int onHealthCheckFailed(VersionedPackage versionedPackage) {
|
||||
public int onHealthCheckFailed(VersionedPackage versionedPackage,
|
||||
int failureReason) {
|
||||
if (versionedPackage.getVersionCode() == VERSION_CODE) {
|
||||
// Only rollback for specific versionCode
|
||||
return PackageHealthObserverImpact.USER_IMPACT_MEDIUM;
|
||||
@@ -1012,7 +1013,7 @@ public class PackageWatchdogTest {
|
||||
mImpact = impact;
|
||||
}
|
||||
|
||||
public int onHealthCheckFailed(VersionedPackage versionedPackage) {
|
||||
public int onHealthCheckFailed(VersionedPackage versionedPackage, int failureReason) {
|
||||
mHealthCheckFailedPackages.add(versionedPackage.getPackageName());
|
||||
return mImpact;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user