Merge "Remove the obsolete code for logd and add statsd socket log loss detection."

This commit is contained in:
TreeHugger Robot
2018-08-11 00:30:47 +00:00
committed by Android (Google) Code Review
15 changed files with 33 additions and 470 deletions

View File

@@ -45,7 +45,6 @@ statsd_common_src := \
src/external/puller_util.cpp \
src/logd/LogEvent.cpp \
src/logd/LogListener.cpp \
src/logd/LogReader.cpp \
src/matchers/CombinationLogMatchingTracker.cpp \
src/matchers/matcher_util.cpp \
src/matchers/SimpleLogMatchingTracker.cpp \
@@ -194,7 +193,6 @@ LOCAL_SRC_FILES := \
tests/external/puller_util_test.cpp \
tests/indexed_priority_queue_test.cpp \
tests/LogEntryMatcher_test.cpp \
tests/LogReader_test.cpp \
tests/LogEvent_test.cpp \
tests/MetricsManager_test.cpp \
tests/StatsLogProcessor_test.cpp \

View File

@@ -161,10 +161,6 @@ void StatsLogProcessor::onIsolatedUidChangedEventLocked(const LogEvent& event) {
}
}
void StatsLogProcessor::OnLogEvent(LogEvent* event) {
OnLogEvent(event, false);
}
void StatsLogProcessor::resetConfigs() {
std::lock_guard<std::mutex> lock(mMetricsMutex);
resetConfigsLocked(getElapsedRealtimeNs());
@@ -178,7 +174,7 @@ void StatsLogProcessor::resetConfigsLocked(const int64_t timestampNs) {
resetConfigsLocked(timestampNs, configKeys);
}
void StatsLogProcessor::OnLogEvent(LogEvent* event, bool reconnected) {
void StatsLogProcessor::OnLogEvent(LogEvent* event) {
std::lock_guard<std::mutex> lock(mMetricsMutex);
#ifdef VERY_VERBOSE_PRINTING
@@ -188,41 +184,6 @@ void StatsLogProcessor::OnLogEvent(LogEvent* event, bool reconnected) {
#endif
const int64_t currentTimestampNs = event->GetElapsedTimestampNs();
if (reconnected && mLastTimestampSeen != 0) {
// LogReader tells us the connection has just been reset. Now we need
// to enter reconnection state to find the last CP.
mInReconnection = true;
}
if (mInReconnection) {
// We see the checkpoint
if (currentTimestampNs == mLastTimestampSeen) {
mInReconnection = false;
// Found the CP. ignore this event, and we will start to read from next event.
return;
}
if (currentTimestampNs > mLargestTimestampSeen) {
// We see a new log but CP has not been found yet. Give up now.
mLogLossCount++;
mInReconnection = false;
StatsdStats::getInstance().noteLogLost(currentTimestampNs);
// Persist the data before we reset. Do we want this?
WriteDataToDiskLocked(CONFIG_RESET);
// We see fresher event before we see the checkpoint. We might have lost data.
// The best we can do is to reset.
resetConfigsLocked(currentTimestampNs);
} else {
// Still in search of the CP. Keep going.
return;
}
}
mLogCount++;
mLastTimestampSeen = currentTimestampNs;
if (mLargestTimestampSeen < currentTimestampNs) {
mLargestTimestampSeen = currentTimestampNs;
}
resetIfConfigTtlExpiredLocked(currentTimestampNs);
StatsdStats::getInstance().noteAtomLogged(

View File

@@ -18,7 +18,6 @@
#include <gtest/gtest_prod.h>
#include "config/ConfigListener.h"
#include "logd/LogReader.h"
#include "metrics/MetricsManager.h"
#include "packages/UidMap.h"
#include "external/StatsPullerManager.h"
@@ -52,9 +51,6 @@ public:
const std::function<bool(const ConfigKey&)>& sendBroadcast);
virtual ~StatsLogProcessor();
void OnLogEvent(LogEvent* event, bool reconnectionStarts);
// for testing only.
void OnLogEvent(LogEvent* event);
void OnConfigUpdated(const int64_t timestampNs, const ConfigKey& key,
@@ -174,14 +170,6 @@ private:
int64_t mLastTimestampSeen = 0;
bool mInReconnection = false;
// Processed log count
uint64_t mLogCount = 0;
// Log loss detected count
int mLogLossCount = 0;
long mLastPullerCacheClearTimeSec = 0;
#ifdef VERY_VERBOSE_PRINTING

View File

@@ -883,8 +883,8 @@ void StatsService::Startup() {
mConfigManager->Startup();
}
void StatsService::OnLogEvent(LogEvent* event, bool reconnectionStarts) {
mProcessor->OnLogEvent(event, reconnectionStarts);
void StatsService::OnLogEvent(LogEvent* event) {
mProcessor->OnLogEvent(event);
}
Status StatsService::getData(int64_t key, const String16& packageName, vector<uint8_t>* output) {

View File

@@ -22,6 +22,7 @@
#include "anomaly/AlarmMonitor.h"
#include "config/ConfigManager.h"
#include "external/StatsPullerManager.h"
#include "logd/LogListener.h"
#include "packages/UidMap.h"
#include "statscompanion_util.h"
@@ -75,7 +76,7 @@ public:
/**
* Called by LogReader when there's a log event to process.
*/
virtual void OnLogEvent(LogEvent* event, bool reconnectionStarts);
virtual void OnLogEvent(LogEvent* event);
/**
* Binder call for clients to request data for this configuration key.

View File

@@ -180,12 +180,12 @@ void StatsdStats::noteConfigReset(const ConfigKey& key) {
noteConfigResetInternalLocked(key);
}
void StatsdStats::noteLogLost(int64_t timestampNs) {
void StatsdStats::noteLogLost(int64_t timestampNs, int32_t count) {
lock_guard<std::mutex> lock(mLock);
if (mLogLossTimestampNs.size() == kMaxLoggerErrors) {
mLogLossTimestampNs.pop_front();
}
mLogLossTimestampNs.push_back(timestampNs);
mLogLossTimestampNs.push_back(std::make_pair(timestampNs, count));
}
void StatsdStats::noteBroadcastSent(const ConfigKey& key) {
@@ -529,7 +529,8 @@ void StatsdStats::dumpStats(FILE* out) const {
}
for (const auto& loss : mLogLossTimestampNs) {
fprintf(out, "Log loss detected at %lld (elapsedRealtimeNs)\n", (long long)loss);
fprintf(out, "Log loss: %lld (elapsedRealtimeNs) - %d (count)\n", (long long)loss.first,
loss.second);
}
}
@@ -687,7 +688,7 @@ void StatsdStats::dumpStats(std::vector<uint8_t>* output, bool reset) {
for (const auto& loss : mLogLossTimestampNs) {
proto.write(FIELD_TYPE_INT64 | FIELD_ID_LOG_LOSS_STATS | FIELD_COUNT_REPEATED,
(long long)loss);
(long long)loss.first);
}
for (const auto& restart : mSystemServerRestartSec) {

View File

@@ -284,7 +284,7 @@ public:
/**
* Records statsd skipped an event.
*/
void noteLogLost(int64_t timestamp);
void noteLogLost(int64_t timestamp, int32_t count);
/**
* Reset the historical stats. Including all stats in icebox, and the tracked stats about
@@ -341,8 +341,8 @@ private:
// Logd errors. Size capped by kMaxLoggerErrors.
std::list<const std::pair<int, int>> mLoggerErrors;
// Timestamps when we detect log loss after logd reconnect.
std::list<int64_t> mLogLossTimestampNs;
// Timestamps when we detect log loss, and the number of logs lost.
std::list<std::pair<int64_t, int32_t>> mLogLossTimestampNs;
std::list<int32_t> mSystemServerRestartSec;

View File

@@ -14,17 +14,7 @@
* limitations under the License.
*/
#include "logd/LogReader.h"
#include <log/log_read.h>
#include <utils/Errors.h>
#include <time.h>
#include <unistd.h>
using namespace android;
using namespace std;
#include "logd/LogListener.h"
namespace android {
namespace os {

View File

@@ -19,7 +19,6 @@
#include "logd/LogEvent.h"
#include <utils/RefBase.h>
#include <vector>
namespace android {
namespace os {
@@ -33,7 +32,7 @@ public:
LogListener();
virtual ~LogListener();
virtual void OnLogEvent(LogEvent* msg, bool reconnectionStarts) = 0;
virtual void OnLogEvent(LogEvent* msg) = 0;
};
} // namespace statsd

View File

@@ -1,129 +0,0 @@
/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "logd/LogReader.h"
#include "guardrail/StatsdStats.h"
#include <time.h>
#include <unistd.h>
#include <utils/Errors.h>
using namespace android;
using namespace std;
namespace android {
namespace os {
namespace statsd {
#define SNOOZE_INITIAL_MS 100
#define SNOOZE_MAX_MS (10 * 60 * 1000) // Ten minutes
LogReader::LogReader(const sp<LogListener>& listener) : mListener(listener) {
}
LogReader::~LogReader() {
}
void LogReader::Run() {
int nextSnoozeMs = SNOOZE_INITIAL_MS;
// In an ideal world, this outer loop will only ever run one iteration, but it
// exists to handle crashes in logd. The inner loop inside connect_and_read()
// reads from logd forever, but if that read fails, we fall out to the outer
// loop, do the backoff (resetting the backoff timeout if we successfully read
// something), and then try again.
while (true) {
// Connect and read
int lineCount = connect_and_read();
// Figure out how long to sleep.
if (lineCount > 0) {
// If we managed to read at least one line, reset the backoff
nextSnoozeMs = SNOOZE_INITIAL_MS;
} else {
// Otherwise, expontial backoff
nextSnoozeMs *= 1.5f;
if (nextSnoozeMs > 10 * 60 * 1000) {
// Don't wait for toooo long.
nextSnoozeMs = SNOOZE_MAX_MS;
}
}
// Sleep
timespec ts;
timespec rem;
ts.tv_sec = nextSnoozeMs / 1000;
ts.tv_nsec = (nextSnoozeMs % 1000) * 1000000L;
while (nanosleep(&ts, &rem) == -1) {
if (errno == EINTR) {
ts = rem;
}
// other errors are basically impossible
}
}
}
int LogReader::connect_and_read() {
int lineCount = 0;
status_t err;
logger_list* loggers;
logger* eventLogger;
// Prepare the logging context
loggers = android_logger_list_alloc(ANDROID_LOG_RDONLY,
/* don't stop after N lines */ 0,
/* no pid restriction */ 0);
// Open the buffer(s)
eventLogger = android_logger_open(loggers, LOG_ID_STATS);
// Read forever
if (eventLogger) {
log_msg msg;
while (true) {
// Read a message
err = android_logger_list_read(loggers, &msg);
// err = 0 - no content, unexpected connection drop or EOF.
// err = +ive number - size of retrieved data from logger
// err = -ive number, OS supplied error _except_ for -EAGAIN
if (err <= 0) {
StatsdStats::getInstance().noteLoggerError(err);
fprintf(stderr, "logcat read failure: %s\n", strerror(err));
break;
}
// Record that we read one (used above to know how to snooze).
lineCount++;
// Wrap it in a LogEvent object
LogEvent event(msg);
// Call the listener
mListener->OnLogEvent(&event,
lineCount == 1 /* indicate whether it's a new connection */);
}
}
// Free the logger list and close the individual loggers
android_logger_list_free(loggers);
return lineCount;
}
} // namespace statsd
} // namespace os
} // namespace android

View File

@@ -1,69 +0,0 @@
/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LOGREADER_H
#define LOGREADER_H
#include "logd/LogListener.h"
#include <utils/RefBase.h>
#include <vector>
namespace android {
namespace os {
namespace statsd {
/**
* Class to read logs from logd.
*/
class LogReader : public virtual android::RefBase {
public:
/**
* Construct the LogReader with the event listener. (Which is StatsService)
*/
LogReader(const sp<LogListener>& listener);
/**
* Destructor.
*/
virtual ~LogReader();
/**
* Run the main LogReader loop
*/
void Run();
private:
/**
* Who is going to get the events when they're read.
*/
sp<LogListener> mListener;
/**
* Connect to a single instance of logd, and read until there's a read error.
* Logd can crash, exit, be killed etc.
*
* Returns the number of lines that were read.
*/
int connect_and_read();
};
} // namespace statsd
} // namespace os
} // namespace android
#endif // LOGREADER_H

View File

@@ -18,7 +18,6 @@
#include "Log.h"
#include "StatsService.h"
#include "logd/LogReader.h"
#include "socket/StatsSocketListener.h"
#include <binder/IInterface.h>
@@ -39,9 +38,6 @@
using namespace android;
using namespace android::os::statsd;
const bool kUseLogd = false;
const bool kUseStatsdSocket = true;
/**
* Thread function data.
*/
@@ -49,58 +45,6 @@ struct log_reader_thread_data {
sp<StatsService> service;
};
/**
* Thread func for where the log reader runs.
*/
static void* log_reader_thread_func(void* cookie) {
log_reader_thread_data* data = static_cast<log_reader_thread_data*>(cookie);
sp<LogReader> reader = new LogReader(data->service);
// Run the read loop. Never returns.
reader->Run();
ALOGW("statsd LogReader.Run() is not supposed to return.");
delete data;
return NULL;
}
/**
* Creates and starts the thread to own the LogReader.
*/
static status_t start_log_reader_thread(const sp<StatsService>& service) {
status_t err;
pthread_attr_t attr;
pthread_t thread;
// Thread data.
std::unique_ptr<log_reader_thread_data> data = std::make_unique<log_reader_thread_data>();
data->service = service;
// Create the thread
err = pthread_attr_init(&attr);
if (err != NO_ERROR) {
return err;
}
err = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
if (err != NO_ERROR) {
pthread_attr_destroy(&attr);
return err;
}
err = pthread_create(&thread, &attr, log_reader_thread_func,
static_cast<void*>(data.get()));
if (err != NO_ERROR) {
pthread_attr_destroy(&attr);
return err;
}
// Release here rather than in pthread_create, since an error creating the
// thread leaves `data` ownerless.
data.release();
pthread_attr_destroy(&attr);
return NO_ERROR;
}
int main(int /*argc*/, char** /*argv*/) {
// Set up the looper
sp<Looper> looper(Looper::prepare(0 /* opts */));
@@ -124,22 +68,11 @@ int main(int /*argc*/, char** /*argv*/) {
sp<StatsSocketListener> socketListener = new StatsSocketListener(service);
if (kUseLogd) {
ALOGI("using logd");
// Start the log reader thread
status_t err = start_log_reader_thread(service);
if (err != NO_ERROR) {
return 1;
}
}
if (kUseStatsdSocket) {
ALOGI("using statsd socket");
// Backlog and /proc/sys/net/unix/max_dgram_qlen set to large value
if (socketListener->startListener(600)) {
exit(1);
}
}
// Loop forever -- the reports run on this thread in a handler, and the
// binder calls remain responsive in their pool of one thread.

View File

@@ -40,6 +40,7 @@ namespace os {
namespace statsd {
static const int kLogMsgHeaderSize = 28;
static const int kLibLogTag = 1006;
StatsSocketListener::StatsSocketListener(const sp<LogListener>& listener)
: SocketListener(getLogSocket(), false /*start listen*/), mListener(listener) {
@@ -99,6 +100,22 @@ bool StatsSocketListener::onDataAvailable(SocketClient* cli) {
char* ptr = ((char*)buffer) + sizeof(android_log_header_t);
n -= sizeof(android_log_header_t);
// When a log failed to write to statsd socket (e.g., due ot EBUSY), a special message would
// be sent to statsd when the socket communication becomes available again.
// The format is android_log_event_int_t with a single integer in the payload indicating the
// number of logs that failed. (*FORMAT MUST BE IN SYNC WITH system/core/libstats*)
// Note that all normal stats logs are in the format of event_list, so there won't be confusion.
//
// TODO(b/80538532): In addition to log it in StatsdStats, we should properly reset the config.
if (n == sizeof(android_log_event_int_t)) {
android_log_event_int_t* int_event = reinterpret_cast<android_log_event_int_t*>(ptr);
if (int_event->header.tag == kLibLogTag && int_event->payload.type == EVENT_TYPE_INT) {
ALOGE("Found dropped events: %d", int_event->payload.data);
StatsdStats::getInstance().noteLogLost(getElapsedRealtimeNs(), int_event->payload.data);
return true;
}
}
log_msg msg;
msg.entry.len = n;
@@ -111,7 +128,7 @@ bool StatsSocketListener::onDataAvailable(SocketClient* cli) {
LogEvent event(msg);
// Call the listener
mListener->OnLogEvent(&event, false /*reconnected, N/A in statsd socket*/);
mListener->OnLogEvent(&event);
return true;
}

View File

@@ -17,7 +17,6 @@
#pragma once
#include "HashableDimensionKey.h"
#include "logd/LogReader.h"
#include <unordered_map>

View File

@@ -238,132 +238,6 @@ TEST(StatsLogProcessorTest, TestReportIncludesSubConfig) {
EXPECT_EQ(2, report.annotation(0).field_int32());
}
TEST(StatsLogProcessorTest, TestOutOfOrderLogs) {
// Setup simple config key corresponding to empty config.
sp<UidMap> m = new UidMap();
sp<StatsPullerManager> pullerManager = new StatsPullerManager();
sp<AlarmMonitor> anomalyAlarmMonitor;
sp<AlarmMonitor> subscriberAlarmMonitor;
int broadcastCount = 0;
StatsLogProcessor p(m, pullerManager, anomalyAlarmMonitor, subscriberAlarmMonitor, 0,
[&broadcastCount](const ConfigKey& key) {
broadcastCount++;
return true;
});
LogEvent event1(0, 1 /*logd timestamp*/, 1001 /*elapsedRealtime*/);
event1.init();
LogEvent event2(0, 2, 1002);
event2.init();
LogEvent event3(0, 3, 1005);
event3.init();
LogEvent event4(0, 4, 1004);
event4.init();
// <----- Reconnection happens
LogEvent event5(0, 5, 999);
event5.init();
LogEvent event6(0, 6, 2000);
event6.init();
// <----- Reconnection happens
LogEvent event7(0, 7, 3000);
event7.init();
// first event ever
p.OnLogEvent(&event1, true);
EXPECT_EQ(1UL, p.mLogCount);
EXPECT_EQ(1001LL, p.mLargestTimestampSeen);
EXPECT_EQ(1001LL, p.mLastTimestampSeen);
p.OnLogEvent(&event2, false);
EXPECT_EQ(2UL, p.mLogCount);
EXPECT_EQ(1002LL, p.mLargestTimestampSeen);
EXPECT_EQ(1002LL, p.mLastTimestampSeen);
p.OnLogEvent(&event3, false);
EXPECT_EQ(3UL, p.mLogCount);
EXPECT_EQ(1005LL, p.mLargestTimestampSeen);
EXPECT_EQ(1005LL, p.mLastTimestampSeen);
p.OnLogEvent(&event4, false);
EXPECT_EQ(4UL, p.mLogCount);
EXPECT_EQ(1005LL, p.mLargestTimestampSeen);
EXPECT_EQ(1004LL, p.mLastTimestampSeen);
EXPECT_FALSE(p.mInReconnection);
// Reconnect happens, event1 out of buffer. Read event2
p.OnLogEvent(&event2, true);
EXPECT_EQ(4UL, p.mLogCount);
EXPECT_EQ(1005LL, p.mLargestTimestampSeen);
EXPECT_EQ(1004LL, p.mLastTimestampSeen);
EXPECT_TRUE(p.mInReconnection);
p.OnLogEvent(&event3, false);
EXPECT_EQ(4UL, p.mLogCount);
EXPECT_EQ(1005LL, p.mLargestTimestampSeen);
EXPECT_EQ(1004LL, p.mLastTimestampSeen);
EXPECT_TRUE(p.mInReconnection);
p.OnLogEvent(&event4, false);
EXPECT_EQ(4UL, p.mLogCount);
EXPECT_EQ(1005LL, p.mLargestTimestampSeen);
EXPECT_EQ(1004LL, p.mLastTimestampSeen);
EXPECT_FALSE(p.mInReconnection);
// Fresh event comes.
p.OnLogEvent(&event5, false);
EXPECT_EQ(5UL, p.mLogCount);
EXPECT_EQ(1005LL, p.mLargestTimestampSeen);
EXPECT_EQ(999LL, p.mLastTimestampSeen);
p.OnLogEvent(&event6, false);
EXPECT_EQ(6UL, p.mLogCount);
EXPECT_EQ(2000LL, p.mLargestTimestampSeen);
EXPECT_EQ(2000LL, p.mLastTimestampSeen);
// Reconnect happens, read from event4
p.OnLogEvent(&event4, true);
EXPECT_EQ(6UL, p.mLogCount);
EXPECT_EQ(2000LL, p.mLargestTimestampSeen);
EXPECT_EQ(2000LL, p.mLastTimestampSeen);
EXPECT_TRUE(p.mInReconnection);
p.OnLogEvent(&event5, false);
EXPECT_EQ(6UL, p.mLogCount);
EXPECT_EQ(2000LL, p.mLargestTimestampSeen);
EXPECT_EQ(2000LL, p.mLastTimestampSeen);
EXPECT_TRUE(p.mInReconnection);
// Before we get out of reconnection state, it reconnects again.
p.OnLogEvent(&event5, true);
EXPECT_EQ(6UL, p.mLogCount);
EXPECT_EQ(2000LL, p.mLargestTimestampSeen);
EXPECT_EQ(2000LL, p.mLastTimestampSeen);
EXPECT_TRUE(p.mInReconnection);
p.OnLogEvent(&event6, false);
EXPECT_EQ(6UL, p.mLogCount);
EXPECT_EQ(2000LL, p.mLargestTimestampSeen);
EXPECT_EQ(2000LL, p.mLastTimestampSeen);
EXPECT_FALSE(p.mInReconnection);
EXPECT_EQ(0, p.mLogLossCount);
// it reconnects again. All old events are gone. We lose CP.
p.OnLogEvent(&event7, true);
EXPECT_EQ(7UL, p.mLogCount);
EXPECT_EQ(3000LL, p.mLargestTimestampSeen);
EXPECT_EQ(3000LL, p.mLastTimestampSeen);
EXPECT_EQ(1, p.mLogLossCount);
EXPECT_FALSE(p.mInReconnection);
}
#else
GTEST_LOG_(INFO) << "This test does nothing.\n";
#endif