Merge "Refine incidentd error handling and logging"
This commit is contained in:
committed by
Android (Google) Code Review
commit
bc6a68cd08
@@ -53,6 +53,7 @@ cc_binary {
|
||||
"libprotoutil",
|
||||
"libservices",
|
||||
"libutils",
|
||||
"libprotobuf-cpp-lite",
|
||||
],
|
||||
|
||||
init_rc: ["incidentd.rc"],
|
||||
|
||||
@@ -47,9 +47,13 @@ status_t FdBuffer::read(int fd, int64_t timeout) {
|
||||
while (true) {
|
||||
if (mBuffer.size() >= MAX_BUFFER_COUNT * BUFFER_SIZE) {
|
||||
mTruncated = true;
|
||||
VLOG("Truncating data");
|
||||
break;
|
||||
}
|
||||
if (mBuffer.writeBuffer() == NULL) return NO_MEMORY;
|
||||
if (mBuffer.writeBuffer() == NULL) {
|
||||
VLOG("No memory");
|
||||
return NO_MEMORY;
|
||||
}
|
||||
|
||||
int64_t remainingTime = (mStartTime + timeout) - uptimeMillis();
|
||||
if (remainingTime <= 0) {
|
||||
@@ -58,7 +62,7 @@ status_t FdBuffer::read(int fd, int64_t timeout) {
|
||||
break;
|
||||
}
|
||||
|
||||
int count = poll(&pfds, 1, remainingTime);
|
||||
int count = TEMP_FAILURE_RETRY(poll(&pfds, 1, remainingTime));
|
||||
if (count == 0) {
|
||||
VLOG("timed out due to block calling poll");
|
||||
mTimedOut = true;
|
||||
@@ -102,7 +106,10 @@ status_t FdBuffer::readFully(int fd) {
|
||||
VLOG("Truncating data");
|
||||
break;
|
||||
}
|
||||
if (mBuffer.writeBuffer() == NULL) return NO_MEMORY;
|
||||
if (mBuffer.writeBuffer() == NULL) {
|
||||
VLOG("No memory");
|
||||
return NO_MEMORY;
|
||||
}
|
||||
|
||||
ssize_t amt =
|
||||
TEMP_FAILURE_RETRY(::read(fd, mBuffer.writeBuffer(), mBuffer.currentToWrite()));
|
||||
@@ -144,10 +151,14 @@ status_t FdBuffer::readProcessedDataInStream(int fd, unique_fd toFd, unique_fd f
|
||||
// This is the buffer used to store processed data
|
||||
while (true) {
|
||||
if (mBuffer.size() >= MAX_BUFFER_COUNT * BUFFER_SIZE) {
|
||||
VLOG("Truncating data");
|
||||
mTruncated = true;
|
||||
break;
|
||||
}
|
||||
if (mBuffer.writeBuffer() == NULL) return NO_MEMORY;
|
||||
if (mBuffer.writeBuffer() == NULL) {
|
||||
VLOG("No memory");
|
||||
return NO_MEMORY;
|
||||
}
|
||||
|
||||
int64_t remainingTime = (mStartTime + timeoutMs) - uptimeMillis();
|
||||
if (remainingTime <= 0) {
|
||||
@@ -157,7 +168,7 @@ status_t FdBuffer::readProcessedDataInStream(int fd, unique_fd toFd, unique_fd f
|
||||
}
|
||||
|
||||
// wait for any pfds to be ready to perform IO
|
||||
int count = poll(pfds, 3, remainingTime);
|
||||
int count = TEMP_FAILURE_RETRY(poll(pfds, 3, remainingTime));
|
||||
if (count == 0) {
|
||||
VLOG("timed out due to block calling poll");
|
||||
mTimedOut = true;
|
||||
|
||||
@@ -42,13 +42,11 @@ enum { WHAT_RUN_REPORT = 1, WHAT_SEND_BACKLOG_TO_DROPBOX = 2 };
|
||||
#define DEFAULT_BYTES_SIZE_LIMIT (20 * 1024 * 1024) // 20MB
|
||||
#define DEFAULT_REFACTORY_PERIOD_MS (24 * 60 * 60 * 1000) // 1 Day
|
||||
|
||||
// Skip logs (1100 - 1108) because they are already in the bug report
|
||||
// Skip 1200, 1201, 1202, 3018 because they take too long
|
||||
// TODO(120079956): Skip 3008, 3015 because of error
|
||||
// Skip these sections for dumpstate only. Dumpstate allows 10s max for each service to dump.
|
||||
// Skip logs (1100 - 1108) and traces (1200 - 1202) because they are already in the bug report.
|
||||
// Skip 3018 because it takes too long.
|
||||
#define SKIPPED_SECTIONS { 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, /* Logs */ \
|
||||
1200, 1201, 1202, /* Native, hal, java traces */ \
|
||||
3008, /* "package --proto" */ \
|
||||
3015, /* "activity --proto processes" */ \
|
||||
3018 /* "meminfo -a --proto" */ }
|
||||
|
||||
namespace android {
|
||||
|
||||
@@ -96,7 +96,12 @@ status_t PrivacyBuffer::stripField(const Privacy* parentPolicy, const PrivacySpe
|
||||
uint64_t token = mProto.start(encode_field_id(policy));
|
||||
while (mData.rp()->pos() - start != msgSize) {
|
||||
status_t err = stripField(policy, spec, depth + 1);
|
||||
if (err != NO_ERROR) return err;
|
||||
if (err != NO_ERROR) {
|
||||
VLOG("Bad value when stripping id %d, wiretype %d, tag %#x, depth %d, size %d, "
|
||||
"relative pos %zu, ", fieldId, read_wire_type(fieldTag), fieldTag, depth,
|
||||
msgSize, mData.rp()->pos() - start);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
mProto.end(token);
|
||||
return NO_ERROR;
|
||||
@@ -117,9 +122,13 @@ status_t PrivacyBuffer::strip(const PrivacySpec& spec) {
|
||||
}
|
||||
while (mData.hasNext()) {
|
||||
status_t err = stripField(mPolicy, spec, 0);
|
||||
if (err != NO_ERROR) return err;
|
||||
if (err != NO_ERROR) return err; // Error logged in stripField.
|
||||
}
|
||||
if (mData.bytesRead() != mData.size()) {
|
||||
VLOG("Buffer corrupted: expect %zu bytes, read %zu bytes",
|
||||
mData.size(), mData.bytesRead());
|
||||
return BAD_VALUE;
|
||||
}
|
||||
if (mData.bytesRead() != mData.size()) return BAD_VALUE;
|
||||
mSize = mProto.size();
|
||||
mData.rp()->rewind(); // rewind the read pointer back to beginning after the strip.
|
||||
return NO_ERROR;
|
||||
|
||||
@@ -129,6 +129,7 @@ Reporter::run_report_status_t Reporter::runReport(size_t* reportByteSize) {
|
||||
bool needMainFd = false;
|
||||
int mainFd = -1;
|
||||
int mainDest = -1;
|
||||
int sectionCount = 0;
|
||||
HeaderSection headers;
|
||||
MetadataSection metadataSection;
|
||||
std::string buildType = android::base::GetProperty("ro.build.type", "");
|
||||
@@ -180,12 +181,12 @@ Reporter::run_report_status_t Reporter::runReport(size_t* reportByteSize) {
|
||||
for (const Section** section = SECTION_LIST; *section; section++) {
|
||||
const int id = (*section)->id;
|
||||
if ((*section)->userdebugAndEngOnly && !isUserdebugOrEng) {
|
||||
ALOGD("Skipping incident report section %d '%s' because it's limited to userdebug/eng",
|
||||
VLOG("Skipping incident report section %d '%s' because it's limited to userdebug/eng",
|
||||
id, (*section)->name.string());
|
||||
continue;
|
||||
}
|
||||
if (this->batch.containsSection(id)) {
|
||||
ALOGD("Taking incident report section %d '%s'", id, (*section)->name.string());
|
||||
VLOG("Taking incident report section %d '%s'", id, (*section)->name.string());
|
||||
for (ReportRequestSet::iterator it = batch.begin(); it != batch.end(); it++) {
|
||||
if ((*it)->listener != NULL && (*it)->args.containsSection(id)) {
|
||||
(*it)->listener->onReportSectionStatus(
|
||||
@@ -198,11 +199,12 @@ Reporter::run_report_status_t Reporter::runReport(size_t* reportByteSize) {
|
||||
int64_t startTime = uptimeMillis();
|
||||
err = (*section)->Execute(&batch);
|
||||
int64_t endTime = uptimeMillis();
|
||||
stats->set_success(err == NO_ERROR);
|
||||
stats->set_exec_duration_ms(endTime - startTime);
|
||||
if (err != NO_ERROR) {
|
||||
ALOGW("Incident section %s (%d) failed: %s. Stopping report.",
|
||||
(*section)->name.string(), id, strerror(-err));
|
||||
// Execute() has already recorded this status. Only update if there's new failure.
|
||||
stats->set_success(false);
|
||||
goto DONE;
|
||||
}
|
||||
(*reportByteSize) += stats->report_size_bytes();
|
||||
@@ -214,11 +216,13 @@ Reporter::run_report_status_t Reporter::runReport(size_t* reportByteSize) {
|
||||
id, IIncidentReportStatusListener::STATUS_FINISHED);
|
||||
}
|
||||
}
|
||||
ALOGD("Finish incident report section %d '%s'", id, (*section)->name.string());
|
||||
VLOG("Finish incident report section %d '%s'", id, (*section)->name.string());
|
||||
sectionCount++;
|
||||
}
|
||||
}
|
||||
|
||||
DONE:
|
||||
ALOGD("Incident reporting took %d sections.", sectionCount);
|
||||
// Reports the metdadata when taking the incident report.
|
||||
if (!isTest) metadataSection.Execute(&batch);
|
||||
|
||||
|
||||
@@ -75,6 +75,7 @@ static void write_section_stats(IncidentMetadata::SectionStats* stats, const FdB
|
||||
stats->set_dump_duration_ms(buffer.durationMs());
|
||||
stats->set_timed_out(buffer.timedOut());
|
||||
stats->set_is_truncated(buffer.truncated());
|
||||
stats->set_success(!buffer.timedOut() && !buffer.truncated());
|
||||
}
|
||||
|
||||
// Reads data from FdBuffer and writes it to the requests file descriptor.
|
||||
@@ -83,7 +84,8 @@ static status_t write_report_requests(const int id, const FdBuffer& buffer,
|
||||
status_t err = -EBADF;
|
||||
EncodedBuffer::iterator data = buffer.data();
|
||||
PrivacyBuffer privacyBuffer(get_privacy_of_section(id), data);
|
||||
int writeable = 0;
|
||||
IncidentMetadata::SectionStats* stats = requests->sectionStats(id);
|
||||
int nPassed = 0;
|
||||
|
||||
// The streaming ones, group requests by spec in order to save unnecessary strip operations
|
||||
map<PrivacySpec, vector<sp<ReportRequest>>> requestsBySpec;
|
||||
@@ -99,7 +101,18 @@ static status_t write_report_requests(const int id, const FdBuffer& buffer,
|
||||
for (auto mit = requestsBySpec.begin(); mit != requestsBySpec.end(); mit++) {
|
||||
PrivacySpec spec = mit->first;
|
||||
err = privacyBuffer.strip(spec);
|
||||
if (err != NO_ERROR) return err; // it means the privacyBuffer data is corrupted.
|
||||
if (err != NO_ERROR) {
|
||||
// Privacy Buffer is corrupted, probably due to an ill-formatted proto. This is a
|
||||
// non-fatal error. The whole report is still valid. So just log the failure.
|
||||
ALOGW("Failed to strip section %d with spec %d: %s",
|
||||
id, spec.dest, strerror(-err));
|
||||
stats->set_success(false);
|
||||
stats->set_error_msg("Failed to strip section: privacy buffer corrupted, probably "
|
||||
"due to an ill-formatted proto");
|
||||
nPassed++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (privacyBuffer.size() == 0) continue;
|
||||
|
||||
for (auto it = mit->second.begin(); it != mit->second.end(); it++) {
|
||||
@@ -114,7 +127,7 @@ static status_t write_report_requests(const int id, const FdBuffer& buffer,
|
||||
request->err = err;
|
||||
continue;
|
||||
}
|
||||
writeable++;
|
||||
nPassed++;
|
||||
VLOG("Section %d flushed %zu bytes to fd %d with spec %d", id, privacyBuffer.size(),
|
||||
request->fd, spec.dest);
|
||||
}
|
||||
@@ -125,7 +138,15 @@ static status_t write_report_requests(const int id, const FdBuffer& buffer,
|
||||
if (requests->mainFd() >= 0) {
|
||||
PrivacySpec spec = PrivacySpec::new_spec(requests->mainDest());
|
||||
err = privacyBuffer.strip(spec);
|
||||
if (err != NO_ERROR) return err; // the buffer data is corrupted.
|
||||
if (err != NO_ERROR) {
|
||||
ALOGW("Failed to strip section %d with spec %d for dropbox: %s",
|
||||
id, spec.dest, strerror(-err));
|
||||
stats->set_success(false);
|
||||
stats->set_error_msg("Failed to strip section: privacy buffer corrupted, probably "
|
||||
"due to an ill-formatted proto");
|
||||
nPassed++;
|
||||
goto DONE;
|
||||
}
|
||||
if (privacyBuffer.size() == 0) goto DONE;
|
||||
|
||||
err = write_section_header(requests->mainFd(), id, privacyBuffer.size());
|
||||
@@ -138,7 +159,7 @@ static status_t write_report_requests(const int id, const FdBuffer& buffer,
|
||||
requests->setMainFd(-1);
|
||||
goto DONE;
|
||||
}
|
||||
writeable++;
|
||||
nPassed++;
|
||||
VLOG("Section %d flushed %zu bytes to dropbox %d with spec %d", id, privacyBuffer.size(),
|
||||
requests->mainFd(), spec.dest);
|
||||
// Reports bytes of the section uploaded via dropbox after filtering.
|
||||
@@ -147,7 +168,7 @@ static status_t write_report_requests(const int id, const FdBuffer& buffer,
|
||||
|
||||
DONE:
|
||||
// only returns error if there is no fd to write to.
|
||||
return writeable > 0 ? NO_ERROR : err;
|
||||
return nPassed > 0 ? NO_ERROR : err;
|
||||
}
|
||||
|
||||
// ================================================================================
|
||||
@@ -213,6 +234,8 @@ status_t MetadataSection::Execute(ReportRequestSet* requests) const {
|
||||
stats.timed_out());
|
||||
proto.write(FIELD_TYPE_BOOL | IncidentMetadata::SectionStats::kIsTruncatedFieldNumber,
|
||||
stats.is_truncated());
|
||||
proto.write(FIELD_TYPE_STRING | IncidentMetadata::SectionStats::kErrorMsgFieldNumber,
|
||||
stats.error_msg());
|
||||
proto.end(token);
|
||||
}
|
||||
|
||||
@@ -439,8 +462,9 @@ status_t WorkerThreadSection::Execute(ReportRequestSet* requests) const {
|
||||
status_t err = NO_ERROR;
|
||||
pthread_t thread;
|
||||
pthread_attr_t attr;
|
||||
bool timedOut = false;
|
||||
bool workerDone = false;
|
||||
FdBuffer buffer;
|
||||
IncidentMetadata::SectionStats* stats = requests->sectionStats(this->id);
|
||||
|
||||
// Data shared between this thread and the worker thread.
|
||||
sp<WorkerThreadData> data = new WorkerThreadData(this);
|
||||
@@ -475,8 +499,7 @@ status_t WorkerThreadSection::Execute(ReportRequestSet* requests) const {
|
||||
// Loop reading until either the timeout or the worker side is done (i.e. eof).
|
||||
err = buffer.read(data->pipe.readFd().get(), this->timeoutMs);
|
||||
if (err != NO_ERROR) {
|
||||
// TODO: Log this error into the incident report.
|
||||
ALOGW("[%s] reader failed with error '%s'", this->name.string(), strerror(-err));
|
||||
ALOGE("[%s] reader failed with error '%s'", this->name.string(), strerror(-err));
|
||||
}
|
||||
|
||||
// Done with the read fd. The worker thread closes the write one so
|
||||
@@ -484,31 +507,32 @@ status_t WorkerThreadSection::Execute(ReportRequestSet* requests) const {
|
||||
data->pipe.readFd().reset();
|
||||
|
||||
// If the worker side is finished, then return its error (which may overwrite
|
||||
// our possible error -- but it's more interesting anyway). If not, then we timed out.
|
||||
// our possible error -- but it's more interesting anyway). If not, then we timed out.
|
||||
{
|
||||
unique_lock<mutex> lock(data->lock);
|
||||
if (!data->workerDone) {
|
||||
// We timed out
|
||||
timedOut = true;
|
||||
} else {
|
||||
if (data->workerError != NO_ERROR) {
|
||||
err = data->workerError;
|
||||
// TODO: Log this error into the incident report.
|
||||
ALOGW("[%s] worker failed with error '%s'", this->name.string(), strerror(-err));
|
||||
}
|
||||
if (data->workerError != NO_ERROR) {
|
||||
err = data->workerError;
|
||||
ALOGE("[%s] worker failed with error '%s'", this->name.string(), strerror(-err));
|
||||
}
|
||||
workerDone = data->workerDone;
|
||||
}
|
||||
|
||||
write_section_stats(requests->sectionStats(this->id), buffer);
|
||||
if (timedOut || buffer.timedOut()) {
|
||||
ALOGW("[%s] timed out", this->name.string());
|
||||
write_section_stats(stats, buffer);
|
||||
if (err != NO_ERROR) {
|
||||
char errMsg[128];
|
||||
snprintf(errMsg, 128, "[%s] failed with error '%s'",
|
||||
this->name.string(), strerror(-err));
|
||||
stats->set_success(false);
|
||||
stats->set_error_msg(errMsg);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
// TODO: There was an error with the command or buffering. Report that. For now
|
||||
// just exit with a log messasge.
|
||||
if (err != NO_ERROR) {
|
||||
ALOGW("[%s] failed with error '%s'", this->name.string(), strerror(-err));
|
||||
if (buffer.truncated()) {
|
||||
ALOGW("[%s] too large, truncating", this->name.string());
|
||||
// Do not write a truncated section. It won't pass through the PrivacyBuffer.
|
||||
return NO_ERROR;
|
||||
}
|
||||
if (!workerDone || buffer.timedOut()) {
|
||||
ALOGW("[%s] timed out", this->name.string());
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
@@ -617,14 +641,8 @@ status_t DumpsysSection::BlockingCall(int pipeWriteFd) const {
|
||||
sp<IBinder> service = defaultServiceManager()->checkService(mService);
|
||||
|
||||
if (service == NULL) {
|
||||
// Returning an error interrupts the entire incident report, so just
|
||||
// log the failure.
|
||||
// TODO: have a meta record inside the report that would log this
|
||||
// failure inside the report, because the fact that we can't find
|
||||
// the service is good data in and of itself. This is running in
|
||||
// another thread so lock that carefully...
|
||||
ALOGW("DumpsysSection: Can't lookup service: %s", String8(mService).string());
|
||||
return NO_ERROR;
|
||||
return NAME_NOT_FOUND;
|
||||
}
|
||||
|
||||
service->dump(pipeWriteFd, mArgs);
|
||||
|
||||
@@ -61,8 +61,10 @@ message IncidentMetadata {
|
||||
optional bool timed_out = 7;
|
||||
// true if the section is truncated.
|
||||
optional bool is_truncated = 8;
|
||||
// message for debugging if there is an error.
|
||||
optional string error_msg = 9;
|
||||
|
||||
// Next Tag: 9
|
||||
// Next Tag: 10;
|
||||
}
|
||||
repeated SectionStats sections = 6;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user