// Source path: root/cmds/statsd/src/guardrail/StatsdStats.h
// Source blob: 564b9ee8051c757ca62e79f125879e7ca48fe62c
/*
 * Copyright 2017, The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include "config/ConfigKey.h"
#include "atoms_info.h"

#include <gtest/gtest_prod.h>
#include <log/log_time.h>

#include <climits>
#include <cstddef>
#include <cstdint>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace android {
namespace os {
namespace statsd {

// Statistics recorded for a single config.
//
// Every scalar field has a default member initializer, so a default-constructed ConfigStats never
// exposes indeterminate values.
//
// NOTE(review): several comments in the original cited "kMaxConfigCount" as the cap on the maps
// below. No constant of that name is declared in this header; the intended limits are presumably
// the StatsdStats::kMax*CountPerConfig constants -- confirm against the implementation.
struct ConfigStats {
    // Identity of the config (owner uid and config id).
    int32_t uid = 0;
    int64_t id = 0;

    // Lifecycle timestamps, in seconds. Zero means "not set".
    int32_t creation_time_sec = 0;
    int32_t deletion_time_sec = 0;
    int32_t reset_time_sec = 0;

    // Static shape of the config: how many of each component it declares.
    int32_t metric_count = 0;
    int32_t condition_count = 0;
    int32_t matcher_count = 0;
    int32_t alert_count = 0;

    // Whether the config was accepted as valid.
    bool is_valid = false;

    // Times at which a broadcast was sent for this config.
    std::list<int32_t> broadcast_sent_time_sec;

    // Times at which this config is activated.
    std::list<int32_t> activation_time_sec;

    // Times at which this config is deactivated.
    std::list<int32_t> deactivation_time_sec;

    // Times at which data was dropped for this config.
    std::list<int32_t> data_drop_time_sec;
    // Number of bytes dropped at the corresponding time in data_drop_time_sec.
    std::list<int64_t> data_drop_bytes;

    // One (int32_t, int64_t) entry per dump report.
    // NOTE(review): presumably (time in seconds, report size in bytes) -- confirm.
    std::list<std::pair<int32_t, int64_t>> dump_report_stats;

    // Stores how many times a matcher has been matched, keyed by matcher id.
    std::map<const int64_t, int> matcher_stats;

    // Stores the number of output tuples of a condition tracker when it is bigger than
    // StatsdStats::kDimensionKeySizeSoftLimit, keyed by condition id. A value of
    // StatsdStats::kDimensionKeySizeHardLimit + 1 means some data has been dropped.
    std::map<const int64_t, int> condition_stats;

    // Stores the number of output tuples of a metric producer when it is bigger than
    // StatsdStats::kDimensionKeySizeSoftLimit, keyed by metric id. A value of
    // StatsdStats::kDimensionKeySizeHardLimit + 1 means some data has been dropped.
    std::map<const int64_t, int> metric_stats;

    // Stores the max number of output tuples of dimensions in condition across dimensions in what
    // when it is bigger than StatsdStats::kDimensionKeySizeSoftLimit, keyed by metric id. A value
    // of StatsdStats::kDimensionKeySizeHardLimit + 1 means some data has been dropped.
    std::map<const int64_t, int> metric_dimension_in_condition_stats;

    // Stores the number of times an anomaly detection alert has been declared, keyed by alert id.
    std::map<const int64_t, int> alert_stats;

    // Annotations attached to the config, as (int64_t, int32_t) pairs.
    // NOTE(review): the original comment read "Stores the config ID for each sub-config used";
    // the meaning of the two pair members is not visible in this header -- confirm.
    std::list<std::pair<const int64_t, const int32_t>> annotations;
};

// Counters describing the uid map.
//
// All fields default to zero so that a default-constructed UidMapStats never exposes
// indeterminate values.
struct UidMapStats {
    // Number of changes held by the uid map.
    int32_t changes = 0;
    // Memory used by the uid map, in bytes.
    int32_t bytes_used = 0;
    // Number of changes that were dropped.
    int32_t dropped_changes = 0;
    // Number of deleted-app records.
    // NOTE(review): presumably counts deletions that were dropped -- confirm against callers.
    int32_t deleted_apps = 0;
};

// Keeps track of stats of statsd.
// Single instance shared across the process. All public methods are thread safe.
class StatsdStats {
public:
    // Returns the process-wide instance.
    static StatsdStats& getInstance();
    ~StatsdStats() = default;

    const static int kDimensionKeySizeSoftLimit = 500;
    const static int kDimensionKeySizeHardLimit = 800;

    // Per atom dimension key size limit.
    // NOTE(review): the pair is presumably (soft limit, hard limit) -- confirm in the definition.
    static const std::map<int, std::pair<size_t, size_t>> kAtomDimensionKeySizeLimitMap;

    const static int kMaxConfigCountPerUid = 10;
    const static int kMaxAlertCountPerConfig = 100;
    const static int kMaxConditionCountPerConfig = 300;
    const static int kMaxMetricCountPerConfig = 1000;
    const static int kMaxMatcherCountPerConfig = 800;

    // The max number of old config stats we keep.
    const static int kMaxIceBoxSize = 20;

    const static int kMaxLoggerErrors = 20;

    const static int kMaxSystemServerRestarts = 20;

    const static int kMaxTimestampCount = 20;

    const static int kMaxLogSourceCount = 50;

    // Max memory allowed for storing metrics per configuration. If this limit is exceeded, statsd
    // drops the metrics data in memory.
    static const size_t kMaxMetricsBytesPerConfig = 2 * 1024 * 1024;

    // Soft memory limit per configuration. Once this limit is exceeded, we begin notifying the
    // data subscriber that it's time to call getData.
    static const size_t kBytesPerConfigTriggerGetData = 192 * 1024;

    // Cap the UID map's memory usage to this. This should be fairly high since the UID information
    // is critical for understanding the metrics.
    const static size_t kMaxBytesUsedUidMap = 50 * 1024;

    // The number of deleted apps that are stored in the uid map.
    const static int kMaxDeletedAppsInUidMap = 100;

    /* Minimum period between two broadcasts in nanoseconds. */
    static const int64_t kMinBroadcastPeriodNs = 60 * NS_PER_SEC;

    /* Min period between two checks of byte size per config key in nanoseconds. */
    static const int64_t kMinByteSizeCheckPeriodNs = 60 * NS_PER_SEC;

    /* Minimum period between two activation broadcasts in nanoseconds. */
    static const int64_t kMinActivationBroadcastPeriodNs = 10 * NS_PER_SEC;

    // Maximum age (30 days) that files on disk can exist in seconds.
    static const int kMaxAgeSecond = 60 * 60 * 24 * 30;

    // Maximum age (2 days) that local history files on disk can exist in seconds.
    static const int kMaxLocalHistoryAgeSecond = 60 * 60 * 24 * 2;

    // Maximum number of files (1000) that can be in stats directory on disk.
    static const int kMaxFileNumber = 1000;

    // Maximum size (50 MiB) of all files that can be written to stats directory on disk.
    static const int kMaxFileSize = 50 * 1024 * 1024;

    // How long to try to clear puller cache from last time
    static const long kPullerCacheClearIntervalSec = 1;

    // Max time to do a pull.
    static const int64_t kPullMaxDelayNs = 10 * NS_PER_SEC;

    // Maximum number of pushed atoms statsd stats will track above kMaxPushedAtomId.
    // NOTE(review): kMaxPushedAtomId is not declared in this header; presumably it comes from
    // atoms_info.h -- confirm.
    static const int kMaxNonPlatformPushedAtoms = 100;

    // Max platform atom tag number.
    static const int32_t kMaxPlatformAtomTag = 100000;

    // Vendor pulled atom start id.
    static const int32_t kVendorPulledAtomStartTag = 150000;

    // Beginning of range for timestamp truncation.
    // NOTE(review): this range (300000-304999) lies above kMaxAtomTag (200000) -- confirm that
    // this is intended.
    static const int32_t kTimestampTruncationStartTag = 300000;

    // End of range for timestamp truncation.
    static const int32_t kTimestampTruncationEndTag = 304999;

    // Max accepted atom id.
    static const int32_t kMaxAtomTag = 200000;

    static const int64_t kInt64Max = 0x7fffffffffffffffLL;

    /**
     * Report a new config has been received and report the static stats about the config.
     *
     * The static stats include: the count of metrics, conditions, matchers, and alerts.
     * If the config is not valid, this config stats will be put into icebox immediately.
     */
    void noteConfigReceived(const ConfigKey& key, int metricsCount, int conditionsCount,
                            int matchersCount, int alertCount,
                            const std::list<std::pair<const int64_t, const int32_t>>& annotations,
                            bool isValid);
    /**
     * Report a config has been removed.
     */
    void noteConfigRemoved(const ConfigKey& key);
    /**
     * Report a config has been reset when ttl expires.
     */
    void noteConfigReset(const ConfigKey& key);

    /**
     * Report a broadcast has been sent to a config owner to collect the data.
     */
    void noteBroadcastSent(const ConfigKey& key);

    /**
     * Report that a config has become activated or deactivated.
     * This can be different from whether or not a broadcast is sent if the
     * guardrail prevented the broadcast from being sent.
     */
    void noteActiveStatusChanged(const ConfigKey& key, bool activate);

    /**
     * Report a config's metrics data has been dropped.
     */
    void noteDataDropped(const ConfigKey& key, const size_t totalBytes);

    /**
     * Report metrics data report has been sent.
     *
     * The report may be requested via StatsManager API, or through adb cmd.
     */
    void noteMetricsReportSent(const ConfigKey& key, const size_t num_bytes);

    /**
     * Report the size of output tuple of a condition.
     *
     * Note: only report when the condition has an output dimension, and the tuple
     * count > kDimensionKeySizeSoftLimit.
     *
     * [key]: The config key that this condition belongs to.
     * [id]: The id of the condition.
     * [size]: The output tuple size.
     */
    void noteConditionDimensionSize(const ConfigKey& key, const int64_t& id, int size);

    /**
     * Report the size of output tuple of a metric.
     *
     * Note: only report when the metric has an output dimension, and the tuple
     * count > kDimensionKeySizeSoftLimit.
     *
     * [key]: The config key that this metric belongs to.
     * [id]: The id of the metric.
     * [size]: The output tuple size.
     */
    void noteMetricDimensionSize(const ConfigKey& key, const int64_t& id, int size);

    /**
     * Report the max size of output tuple of dimension in condition across dimensions in what.
     *
     * Note: only report when the metric has an output dimension in condition, and the max tuple
     * count > kDimensionKeySizeSoftLimit.
     *
     * [key]: The config key that this metric belongs to.
     * [id]: The id of the metric.
     * [size]: The output tuple size.
     */
    void noteMetricDimensionInConditionSize(const ConfigKey& key, const int64_t& id, int size);

    /**
     * Report a matcher has been matched.
     *
     * [key]: The config key that this matcher belongs to.
     * [id]: The id of the matcher.
     */
    void noteMatcherMatched(const ConfigKey& key, const int64_t& id);

    /**
     * Report that an anomaly detection alert has been declared.
     *
     * [key]: The config key that this alert belongs to.
     * [id]: The id of the alert.
     */
    void noteAnomalyDeclared(const ConfigKey& key, const int64_t& id);

    /**
     * Report an atom event has been logged.
     */
    void noteAtomLogged(int atomId, int32_t timeSec);

    /**
     * Report that statsd modified the anomaly alarm registered with StatsCompanionService.
     */
    void noteRegisteredAnomalyAlarmChanged();

    /**
     * Report that statsd modified the periodic alarm registered with StatsCompanionService.
     */
    void noteRegisteredPeriodicAlarmChanged();

    /**
     * Records the number of delta entries that are being dropped from the uid map.
     */
    void noteUidMapDropped(int deltas);

    /**
     * Records that an app was deleted (from statsd's map).
     */
    void noteUidMapAppDeletionDropped();

    /**
     * Updates the number of changes currently stored in the uid map.
     */
    void setUidMapChanges(int changes);

    /**
     * Updates the recorded memory usage of the uid map, in bytes.
     */
    void setCurrentUidMapMemory(int bytes);

    /*
     * Updates minimum interval between pulls for a pulled atom.
     */
    void updateMinPullIntervalSec(int pullAtomId, long intervalSec);

    /*
     * Notes an atom is pulled.
     */
    void notePull(int pullAtomId);

    /*
     * Notes an atom is served from puller cache.
     */
    void notePullFromCache(int pullAtomId);

    /*
     * Notify data error for pulled atom.
     */
    void notePullDataError(int pullAtomId);

    /*
     * Records time for actual pulling, not including those served from cache and not including
     * statsd processing delays.
     */
    void notePullTime(int pullAtomId, int64_t pullTimeNs);

    /*
     * Records pull delay for a pulled atom, including those served from cache and including statsd
     * processing delays.
     */
    void notePullDelay(int pullAtomId, int64_t pullDelayNs);

    /*
     * Records pull exceeds timeout for the puller.
     */
    void notePullTimeout(int pullAtomId);

    /*
     * Records pull exceeds max delay for a metric.
     */
    void notePullExceedMaxDelay(int pullAtomId);

    /*
     * Records when system server restarts.
     */
    void noteSystemServerRestart(int32_t timeSec);

    /**
     * Records statsd skipped an event.
     */
    void noteLogLost(int32_t wallClockTimeSec, int32_t count, int32_t lastError,
                     int32_t lastAtomTag, int32_t uid, int32_t pid);

    /**
     * Records that the pull of an atom has failed
     */
    void notePullFailed(int atomId);

    /**
     * Records that the pull of StatsCompanionService atom has failed
     */
    void noteStatsCompanionPullFailed(int atomId);

    /**
     * Records that the pull of a StatsCompanionService atom has failed due to a failed binder
     * transaction. This can happen when StatsCompanionService returns too
     * much data (the max Binder parcel size is 1MB)
     */
    void noteStatsCompanionPullBinderTransactionFailed(int atomId);

    /**
     * A pull with no data occurred
     */
    void noteEmptyData(int atomId);

    /**
     * Records that a puller callback for the given atomId was registered or unregistered.
     *
     * @param registered True if the callback was registered, false if was unregistered.
     */
    void notePullerCallbackRegistrationChanged(int atomId, bool registered);

    /**
     * Hard limit was reached in the cardinality of an atom
     */
    void noteHardDimensionLimitReached(int64_t metricId);

    /**
     * A log event was too late, arrived in the wrong bucket and was skipped
     */
    void noteLateLogEventSkipped(int64_t metricId);

    /**
     * Buckets were skipped as time elapsed without any data for them
     */
    void noteSkippedForwardBuckets(int64_t metricId);

    /**
     * An unsupported value type was received
     */
    void noteBadValueType(int64_t metricId);

    /**
     * Buckets were dropped to reclaim memory.
     */
    void noteBucketDropped(int64_t metricId);

    /**
     * A condition change was too late, arrived in the wrong bucket and was skipped
     */
    void noteConditionChangeInNextBucket(int64_t metricId);

    /**
     * A bucket has been tagged as invalid.
     */
    void noteInvalidatedBucket(int64_t metricId);

    /**
     * Tracks the total number of buckets (include skipped/invalid buckets).
     */
    void noteBucketCount(int64_t metricId);

    /**
     * For pulls at bucket boundaries, it represents the misalignment between the real timestamp and
     * the end of the bucket.
     */
    void noteBucketBoundaryDelayNs(int64_t metricId, int64_t timeDelayNs);

    /**
     * Number of buckets with unknown condition.
     */
    void noteBucketUnknownCondition(int64_t metricId);

    /* Reports one event has been dropped due to queue overflow, and the oldest event timestamp in
     * the queue */
    void noteEventQueueOverflow(int64_t oldestEventTimestampNs);

    /**
     * Reports that the activation broadcast guardrail was hit for this uid. Namely, the broadcast
     * should have been sent, but instead was skipped due to hitting the guardrail.
     */
    void noteActivationBroadcastGuardrailHit(const int uid);

    /**
     * Reset the historical stats. Including all stats in icebox, and the tracked stats about
     * metrics, matchers, and atoms. The active configs will be kept and StatsdStats will continue
     * to collect stats after reset() has been called.
     */
    void reset();

    /**
     * Output the stats in protobuf binary format to [buffer].
     *
     * [reset]: whether to clear the historical stats after the call.
     */
    void dumpStats(std::vector<uint8_t>* buffer, bool reset);

    /**
     * Output statsd stats in human readable format to [out] file descriptor.
     */
    void dumpStats(int outFd) const;

    // Per-pulled-atom counters; see the notePull*() family of methods.
    //
    // Declared as a named struct rather than `typedef struct { ... } Name;`: an unnamed class
    // that gets its name from a typedef must be C-compatible, and default member initializers
    // are not. The spelled name (StatsdStats::PulledAtomStats) is unchanged.
    struct PulledAtomStats {
        long totalPull = 0;
        long totalPullFromCache = 0;
        // Starts at LONG_MAX so that any real interval compares smaller.
        long minPullIntervalSec = LONG_MAX;
        int64_t avgPullTimeNs = 0;
        int64_t maxPullTimeNs = 0;
        long numPullTime = 0;
        int64_t avgPullDelayNs = 0;
        int64_t maxPullDelayNs = 0;
        long numPullDelay = 0;
        long dataError = 0;
        long pullTimeout = 0;
        long pullExceedMaxDelay = 0;
        long pullFailed = 0;
        long statsCompanionPullFailed = 0;
        long statsCompanionPullBinderTransactionFailed = 0;
        long emptyData = 0;
        long registeredCount = 0;
        long unregisteredCount = 0;
    };

    // Per-metric counters; see the note*(int64_t metricId) family of methods.
    //
    // Named struct for the same reason as PulledAtomStats; the spelled name
    // (StatsdStats::AtomMetricStats) is unchanged.
    struct AtomMetricStats {
        long hardDimensionLimitReached = 0;
        long lateLogEventSkipped = 0;
        long skippedForwardBuckets = 0;
        long badValueType = 0;
        long conditionChangeInNextBucket = 0;
        long invalidatedBucket = 0;
        long bucketDropped = 0;
        int64_t minBucketBoundaryDelayNs = 0;
        int64_t maxBucketBoundaryDelayNs = 0;
        long bucketUnknownCondition = 0;
        long bucketCount = 0;
    };

private:
    // Private: instances are obtained through getInstance() (or created by the friend tests).
    StatsdStats();

    // Mutable so that const methods such as dumpStats(int) can take the lock.
    mutable std::mutex mLock;

    int32_t mStartTimeSec;

    // Track the number of dropped entries used by the uid map.
    UidMapStats mUidMapStats;

    // The stats about the configs that are still in use.
    // NOTE(review): the original comment said the map size is capped by kMaxConfigCount, which is
    // not declared in this header; kMaxConfigCountPerUid is the nearest declared limit -- confirm.
    std::map<const ConfigKey, std::shared_ptr<ConfigStats>> mConfigStats;

    // Stores the stats for the configs that are no longer in use.
    // The size of the list is capped by kMaxIceBoxSize.
    // The element type is deliberately not const-qualified: standard containers require a
    // cv-unqualified value type.
    std::list<std::shared_ptr<ConfigStats>> mIceBox;

    // Stores the number of times a pushed atom is logged.
    // The size of the vector is the largest pushed atom id in atoms.proto + 1. Atoms
    // out of that range will be put in mNonPlatformPushedAtomStats.
    // This is a vector, not a map because it will be accessed A LOT -- for each stats log.
    std::vector<int> mPushedAtomStats;

    // Stores the number of times a pushed atom is logged for atom ids above kMaxPushedAtomId.
    // The max size of the map is kMaxNonPlatformPushedAtoms.
    std::unordered_map<int, int> mNonPlatformPushedAtomStats;

    // Maps PullAtomId to its stats. The size is capped by the puller atom counts.
    std::map<int, PulledAtomStats> mPulledAtomStats;

    // Maps metric ID to its stats. The size is capped by the number of metrics.
    std::map<int64_t, AtomMetricStats> mAtomMetricStats;

    // Maps uids to times when the activation changed broadcast not sent due to hitting the
    // guardrail. The size is capped by the number of configs, and up to 20 times per uid.
    std::map<int, std::list<int32_t>> mActivationBroadcastGuardrailStats;

    // One record of lost logs; the fields mirror the arguments of noteLogLost().
    struct LogLossStats {
        LogLossStats(int32_t sec, int32_t count, int32_t error, int32_t tag, int32_t uid,
                     int32_t pid)
            : mWallClockSec(sec),
              mCount(count),
              mLastError(error),
              mLastTag(tag),
              mUid(uid),
              mPid(pid) {
        }
        int32_t mWallClockSec;
        int32_t mCount;
        // error code defined in linux/errno.h
        int32_t mLastError;
        int32_t mLastTag;
        int32_t mUid;
        int32_t mPid;
    };

    // Max of {(now - oldestEventTimestamp) when overflow happens}.
    // This number is helpful to understand how SLOW statsd can be.
    int64_t mMaxQueueHistoryNs = 0;

    // Min of {(now - oldestEventTimestamp) when overflow happens}.
    // This number is helpful to understand how FAST the events flood to statsd.
    // Starts at kInt64Max so that any real measurement compares smaller.
    int64_t mMinQueueHistoryNs = kInt64Max;

    // Total number of events that are lost due to queue overflow.
    int32_t mOverflowCount = 0;

    // Timestamps when we detect log loss, and the number of logs lost.
    std::list<LogLossStats> mLogLossStats;

    // Times (in seconds) recorded by noteSystemServerRestart().
    std::list<int32_t> mSystemServerRestartSec;

    // Stores the number of times statsd modified the anomaly alarm registered with
    // StatsCompanionService.
    int mAnomalyAlarmRegisteredStats = 0;

    // Stores the number of times statsd registers the periodic alarm changes
    int mPeriodicAlarmRegisteredStats = 0;

    // Helpers with the "Locked" suffix.
    // NOTE(review): presumably these expect mLock to be held by the caller -- confirm.
    void noteConfigResetInternalLocked(const ConfigKey& key);

    void noteConfigRemovedInternalLocked(const ConfigKey& key);

    void resetInternalLocked();

    // Private overloads of the public note*() methods that take an explicit timestamp in seconds.
    // NOTE(review): presumably the public versions forward here with the current time, and the
    // friend tests call these directly -- confirm.
    void noteDataDropped(const ConfigKey& key, const size_t totalBytes, int32_t timeSec);

    void noteMetricsReportSent(const ConfigKey& key, const size_t num_bytes, int32_t timeSec);

    void noteBroadcastSent(const ConfigKey& key, int32_t timeSec);

    void noteActiveStatusChanged(const ConfigKey& key, bool activate, int32_t timeSec);

    void noteActivationBroadcastGuardrailHit(const int uid, int32_t timeSec);

    void addToIceBoxLocked(std::shared_ptr<ConfigStats>& stats);

    /**
     * Get a reference to AtomMetricStats for a metric. If none exists, create it. The reference
     * will live as long as `this`.
     */
    StatsdStats::AtomMetricStats& getAtomMetricStats(int64_t metricId);

    FRIEND_TEST(StatsdStatsTest, TestValidConfigAdd);
    FRIEND_TEST(StatsdStatsTest, TestInvalidConfigAdd);
    FRIEND_TEST(StatsdStatsTest, TestConfigRemove);
    FRIEND_TEST(StatsdStatsTest, TestSubStats);
    FRIEND_TEST(StatsdStatsTest, TestAtomLog);
    FRIEND_TEST(StatsdStatsTest, TestNonPlatformAtomLog);
    FRIEND_TEST(StatsdStatsTest, TestTimestampThreshold);
    FRIEND_TEST(StatsdStatsTest, TestAnomalyMonitor);
    FRIEND_TEST(StatsdStatsTest, TestSystemServerCrash);
    FRIEND_TEST(StatsdStatsTest, TestPullAtomStats);
    FRIEND_TEST(StatsdStatsTest, TestAtomMetricsStats);
    FRIEND_TEST(StatsdStatsTest, TestActivationBroadcastGuardrailHit);
};

}  // namespace statsd
}  // namespace os
}  // namespace android