Diffstat (limited to 'startop/scripts/app_startup/analyze_metrics.py')
-rwxr-xr-x | startop/scripts/app_startup/analyze_metrics.py | 457 |
1 files changed, 457 insertions, 0 deletions
diff --git a/startop/scripts/app_startup/analyze_metrics.py b/startop/scripts/app_startup/analyze_metrics.py
new file mode 100755
index 000000000000..d74d6f68d823
--- /dev/null
+++ b/startop/scripts/app_startup/analyze_metrics.py
@@ -0,0 +1,457 @@
+#!/usr/bin/env python3
+#
+# Copyright 2018, The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Perform statistical analysis on measurements produced by app_startup_runner.py
+
+Install:
+$> sudo apt-get install python3-scipy
+
+Usage:
+$> ./analyze_metrics.py <filename.csv> [<filename2.csv> ...]
+$> ./analyze_metrics.py --help
+"""
+
+import argparse
+import csv
+import itertools
+import os
+import subprocess
+import sys
+import tempfile
+from typing import Any, List, Dict, Iterable, TextIO, Tuple
+
+from scipy import stats as sc
+import numpy as np
+
+
+# These CSV columns are considered labels. Everything after them in the same row is a metric.
+_LABEL_COLUMNS=['packages', 'readaheads', 'compiler_filters']
+# The metric series with the 'cold' readahead is the baseline.
+# All others (warm, jit, etc) are the potential improvements.
+
+#fixme: this should probably be an option
+_BASELINE=('readaheads', 'cold')
+# ignore this for some statistic calculations
+_IGNORE_PAIR=('readaheads', 'warm')
+_PLOT_SUBKEY='readaheads'
+_PLOT_GROUPKEY='packages'
+_PLOT_DATA_INDEX = 0
+_DELTA=50
+_DELTA2=100
+_PVALUE_THRESHOLD=0.10
+_debug = False  # See -d/--debug flag.
+
+def parse_options(argv: List[str] = None):
+  """Parse command line arguments and return an argparse Namespace object."""
+  parser = argparse.ArgumentParser(description="Perform statistical analysis on measurements produced by app_startup_runner.py.")
+  parser.add_argument('input_files', metavar='file.csv', nargs='+', help='CSV file produced by app_startup_runner.py')
+
+  parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='Add extra debugging output')
+  parser.add_argument('-os', '--output-samples', dest='output_samples', default='/dev/null', action='store', help='Store CSV for per-sample data')
+  parser.add_argument('-oc', '--output-comparable', dest='output_comparable', default='/dev/null', action='store', help='Output CSV for comparable against baseline')
+  parser.add_argument('-ocs', '--output-comparable-significant', dest='output_comparable_significant', default='/dev/null', action='store', help='Output CSV for comparable against baseline (significant only)')
+  parser.add_argument('-pt', '--pvalue-threshold', dest='pvalue_threshold', type=float, default=_PVALUE_THRESHOLD, action='store')
+  parser.add_argument('-dt', '--delta-threshold', dest='delta_threshold', type=int, default=_DELTA, action='store')
+
+  return parser.parse_args(argv)
+
+def _debug_print(*args, **kwargs):
+  """Print the args to sys.stderr if the --debug/-d flag was passed in."""
+  global _debug
+  if _debug:
+    print(*args, **kwargs, file=sys.stderr)
+
+def _expand_gen_repr(args):
+  new_args_list = []
+  for i in args:
+    # detect iterable objects that do not have their own override of __str__
+    if hasattr(i, '__iter__'):
+      to_str = getattr(i, '__str__')
+      if to_str.__objclass__ == object:
+        # the repr for a generator is just type+address, expand it out instead.
+        new_args_list.append([_expand_gen_repr([j])[0] for j in i])
+        continue
+    # normal case: uses the built-in to-string
+    new_args_list.append(i)
+  return new_args_list
+
+def _debug_print_gen(*args, **kwargs):
+  """Like _debug_print but will turn any iterable args into a list."""
+  if not _debug:
+    return
+
+  new_args_list = _expand_gen_repr(args)
+  _debug_print(*new_args_list, **kwargs)
+
+def read_headers(input_file: TextIO) -> Tuple[List[str], List[str]]:
+  _debug_print("read_headers for file: ", input_file.name)
+  csv_reader = csv.reader(input_file)
+
+  label_num_columns = len(_LABEL_COLUMNS)
+
+  try:
+    header = next(csv_reader)
+  except StopIteration:
+    header = None
+  _debug_print('header', header)
+
+  if not header:
+    return (None, None)
+
+  labels = header[0:label_num_columns]
+  data = header[label_num_columns:]
+
+  return (labels, data)
+
+def read_labels_and_data(input_file: TextIO) -> Iterable[Tuple[List[str], List[int]]]:
+  _debug_print("read_labels_and_data for file: ", input_file.name)
+  csv_reader = csv.reader(input_file)
+
+  # Skip the header because it doesn't contain any data.
+  # To get the header see the read_headers function.
+  try:
+    header = next(csv_reader)
+  except StopIteration:
+    header = None
+
+  label_num_columns = len(_LABEL_COLUMNS)
+
+  for row in csv_reader:
+    if len(row) > 0 and row[0][0] == ';':
+      _debug_print("skip comment line", row)
+      continue
+
+    labels = row[0:label_num_columns]
+    data = [int(i) for i in row[label_num_columns:]]
+
+#    _debug_print("labels:", labels)
+#    _debug_print("data:", data)
+
+    yield (labels, data)
+
+def group_metrics_by_label(it: Iterable[Tuple[List[str], List[int]]]):
+  prev_labels = None
+  data_2d = []
+
+  for label_list, data_list in it:
+    if prev_labels != label_list:
+      if prev_labels:
+#        _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d)
+        yield (prev_labels, data_2d)
+      data_2d = []
+
+    data_2d.append(data_list)
+    prev_labels = label_list
+
+  if prev_labels:
+#    _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d)
+    yield (prev_labels, data_2d)
+
+def data_to_numpy(it: Iterable[Tuple[List[str], List[List[int]]]]) -> Iterable[Tuple[List[str], Any]]:
+  for label_list, data_2d in it:
+    yield (label_list, np.asarray(data_2d, dtype=int))
+
+def iterate_columns(np_data_2d):
+  for col in range(np_data_2d.shape[1]):
+    col_as_array = np_data_2d[:, col]
+    yield col_as_array
+
+def confidence_interval(np_data_2d, percent=0.95):
+  """
+  Given some data [[a,b,c],[d,e,f],...]
+
+  We assume each row (e.g. [a,b,c]) is one sample and each column (e.g. [a,d])
+  holds repeated measurements of the same metric across samples.
+
+  We then calculate the CI for each metric individually, returning it as a list of tuples.
+  """
+  arr = []
+  for col_2d in iterate_columns(np_data_2d):
+    mean = col_2d.mean()
+    sigma = col_2d.std()
+
+    ci = sc.norm.interval(percent, loc=mean, scale=sigma / np.sqrt(len(col_2d)))
+    arr.append(ci)
+
+  # TODO: This seems to be returning NaN when all the samples have the same exact value
+  # (e.g. stddev=0, which can trivially happen when sample count = 1).
+
+  return arr
+
+def print_analysis(it, label_header: List[str], data_header: List[str], output_samples: str):
+  print(label_header)
+
+  with open(output_samples, "w") as output_file:
+
+    csv_writer = csv.writer(output_file)
+    csv_writer.writerow(label_header + ['mean', 'std', 'confidence_interval_a', 'confidence_interval_b'])
+
+    for label_list, np_data_2d in it:
+      print("**********************")
+      print(label_list)
+      print()
+      print("      ", data_header)
+      # aggregate computation column-wise
+      print("Mean: ", np_data_2d.mean(axis=0))
+      print("Std:  ", np_data_2d.std(axis=0))
+      print("CI95%:", confidence_interval(np_data_2d))
+      print("SEM:  ", stats_standard_error_one(np_data_2d, axis=0))
+
+      #ci = confidence_interval(np_data_2d)[_PLOT_DATA_INDEX]
+      sem = stats_standard_error_one(np_data_2d, axis=0)[_PLOT_DATA_INDEX]
+      mean = np_data_2d.mean(axis=0)[_PLOT_DATA_INDEX]
+
+      ci = (mean - sem, mean + sem)
+
+      csv_writer.writerow(label_list + [mean, np_data_2d.std(axis=0)[_PLOT_DATA_INDEX], ci[0], ci[1]])
+
+def from_file_group_by_labels(input_file):
+  (label_header, data_header) = read_headers(input_file)
+  label_data_iter = read_labels_and_data(input_file)
+  grouped_iter = group_metrics_by_label(label_data_iter)
+  grouped_numpy_iter = data_to_numpy(grouped_iter)
+
+  return grouped_numpy_iter, label_header, data_header
+
+def list_without_index(list, index):
+  return list[:index] + list[index+1:]
+
+def group_by_without_baseline_key(grouped_numpy_iter, label_header):
+  """
+  Data is considered comparable if the only difference is the baseline key
+  (i.e. the readahead is different but the package, compilation filter, etc, are the same).
+
+  Returns an iterator that's grouped by the non-baseline labels to an iterator of
+  (label_list, data_2d).
+  """
+  baseline_index = label_header.index(_BASELINE[0])
+
+  def get_label_without_baseline(tpl):
+    label_list, _ = tpl
+    return list_without_index(label_list, baseline_index)
+  # [['pkgname', 'compfilter', 'warm'], [data]]
+  # [['pkgname', 'compfilter', 'cold'], [data2]]
+  # [['pkgname2', 'compfilter', 'warm'], [data3]]
+  #
+  # ->
+  # ( [['pkgname', 'compfilter', 'warm'], [data]]      # ignore baseline label change.
+  #   [['pkgname', 'compfilter', 'cold'], [data2]] ),  # split here because the pkgname changed.
+  # ( [['pkgname2', 'compfilter', 'warm'], [data3]] )
+  for group_info, it in itertools.groupby(grouped_numpy_iter, key = get_label_without_baseline):
+    yield it
+
+  # TODO: replace this messy manual iteration/grouping with pandas
+
+def iterate_comparable_metrics(without_baseline_iter, label_header):
+  baseline_index = label_header.index(_BASELINE[0])
+  baseline_value = _BASELINE[1]
+
+  _debug_print("iterate comparables")
+
+  def is_baseline_fun(tp):
+    ll, dat = tp
+    return ll[baseline_index] == baseline_value
+
+  # iterating here when everything but the baseline key is the same.
+  for it in without_baseline_iter:
+    it1, it2 = itertools.tee(it)
+
+    # find all the baseline data.
+    baseline_filter_it = filter(is_baseline_fun, it1)
+
+    # find non-baseline data.
+    nonbaseline_filter_it = itertools.filterfalse(is_baseline_fun, it2)
+
+    yield itertools.product(baseline_filter_it, nonbaseline_filter_it)
+
+def stats_standard_error_one(a, axis):
+  a_std = a.std(axis=axis, ddof=0)
+  a_len = a.shape[axis]
+
+  return a_std / np.sqrt(a_len)
+
+def stats_standard_error(a, b, axis):
+  a_std = a.std(axis=axis, ddof=0)
+  b_std = b.std(axis=axis, ddof=0)
+
+  a_len = a.shape[axis]
+  b_len = b.shape[axis]
+
+  temp1 = a_std*a_std/a_len
+  temp2 = b_std*b_std/b_len
+
+  return np.sqrt(temp1 + temp2)
+
+def stats_tvalue(a, b, axis, delta = 0):
+  a_mean = a.mean(axis=axis)
+  b_mean = b.mean(axis=axis)
+
+  return (a_mean - b_mean - delta) / stats_standard_error(a, b, axis)
+
+def stats_pvalue(a, b, axis, delta, left:bool = False):
+  """
+  Single-tailed 2-sample t-test.
+
+  Returns p-value for the null hypothesis: mean(a) - mean(b) >= delta.
+  :param a: numpy 2d array
+  :param b: numpy 2d array
+  :param axis: which axis to do the calculations across
+  :param delta: test value of mean differences
+  :param left: if true then use <= delta instead of >= delta
+  :return: p-value
+  """
+  # implement our own pvalue calculation because the built-in t-test (t,p values)
+  # only offers delta=0, e.g. m1-m2 ? 0
+  # we are however interested in m1-m2 ? delta
+  t_value = stats_tvalue(a, b, axis, delta)
+
+  # 2-sample degrees of freedom is using the array sizes - 2.
+  dof = a.shape[axis] + b.shape[axis] - 2
+
+  if left:
+    # left tailed test. e.g. m1-m2 <= delta
+    return sc.t.cdf(t_value, dof)
+  else:
+    # right tailed test. e.g. m1-m2 >= delta
+    return sc.t.sf(t_value, dof)
+  # a left+right tailed test is a 2-tail t-test and can be done using ttest_ind for delta=0
+
+def print_comparable_analysis(comparable_metrics_iter, label_header, data_header, output_comparable: str, output_comparable_significant: str):
+  baseline_value = _BASELINE[1]
+  baseline_index = label_header.index(_BASELINE[0])
+
+  old_baseline_label_list = None
+  delta = _DELTA
+  filter_value = _IGNORE_PAIR[1]
+  filter_index = label_header.index(_IGNORE_PAIR[0])
+
+  pvalue_threshold = _PVALUE_THRESHOLD
+  ci_threshold = (1 - _PVALUE_THRESHOLD) * 100.0
+
+  with open(output_comparable, "w") as output_file:
+
+    csv_writer = csv.writer(output_file)
+    csv_writer.writerow(label_header + ['mean', 'mean_diff', 'sem', 'pvalue_2tailed', 'pvalue_gt%d' %(_DELTA), 'pvalue_gt%d' %(_DELTA2)])
+
+    print("------------------------------------------------------------------")
+    print("Comparison against the baseline %s = %s" %(_BASELINE, baseline_value))
+    print("--- Right-tailed t-test checks if the baseline >= current %s by at least %d" %(_BASELINE[0], delta))
+    print()
+
+    global_stats = {'better_than_delta': [], 'better_than_delta_p95': []}
+
+    for nested_it in comparable_metrics_iter:
+      print("************************")
+
+      better_than_delta = []
+      better_than_delta_p95 = []
+
+      saw_baseline_once = False
+
+      for ((baseline_label_list, baseline_np_data_2d), (rest_label_list, rest_np_data_2d)) in nested_it:
+        _debug_print("baseline_label_list:", baseline_label_list)
+        _debug_print("baseline_np_data_2d:", baseline_np_data_2d)
+        _debug_print("rest_label_list:", rest_label_list)
+        _debug_print("rest_np_data_2d:", rest_np_data_2d)
+
+        mean_diff = baseline_np_data_2d.mean(axis=0) - rest_np_data_2d.mean(axis=0)
+        # 2-sample 2-tailed t-test with delta=0
+        # e.g. "Is it true that usually the two sample means are different?"
+        t_statistic, t_pvalue = sc.ttest_ind(baseline_np_data_2d, rest_np_data_2d, axis=0)
+
+        # 2-sample 1-tailed t-test with delta=50
+        # e.g. "Is it true that usually the sample means are better by at least 50ms?"
+        t2 = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta)
+        p2 = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta)
+
+        t2_b = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2)
+        p2_b = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2)
+
+        print("%s vs %s" %(rest_label_list, baseline_value))
+        print("      ", data_header)
+        print("Mean Difference: ", mean_diff)
+        print("T-test (2-tailed) != 0: t=%s, p=%s" %(t_statistic, t_pvalue))
+        print("T-test (right-tailed) >= %d: t=%s, p=%s" %(_DELTA, t2, p2))
+        print("T-test (right-tailed) >= %d: t=%s, p=%s" %(_DELTA2, t2_b, p2_b))
+
+        def write_out_values(label_list, *args):
+          csv_writer.writerow(label_list + [i[_PLOT_DATA_INDEX] for i in args])
+
+        sem = stats_standard_error(baseline_np_data_2d, rest_np_data_2d, axis=0)
+        if not saw_baseline_once:
+          saw_baseline_once = True
+          base_sem = stats_standard_error_one(baseline_np_data_2d, axis=0)
+          write_out_values(baseline_label_list, baseline_np_data_2d.mean(axis=0), [0], base_sem, [None], [None], [None])
+        write_out_values(rest_label_list, rest_np_data_2d.mean(axis=0), mean_diff, sem, t_pvalue, p2, p2_b)
+
+        # now do the global statistics aggregation
+
+        if rest_label_list[filter_index] == filter_value:
+          continue
+
+        if mean_diff > delta:
+          better_than_delta.append((mean_diff, p2, rest_label_list))
+
+          if p2 <= pvalue_threshold:
+            better_than_delta_p95.append((mean_diff, rest_label_list))
+
+      if better_than_delta:
+        global_stats['better_than_delta'].append(better_than_delta)
+      if better_than_delta_p95:
+        global_stats['better_than_delta_p95'].append(better_than_delta_p95)
+
+    print("------------------------")
+    print("Global statistics:")
+    print("//// Rows with %s=%s are ignored here." %_IGNORE_PAIR)
+    print("- # of results with mean diff better than delta(%d) = %d" %(delta, len(global_stats['better_than_delta'])))
+    print("    > (meandiff, pvalue, labels)")
+    for i in global_stats['better_than_delta']:
+      print("    > %s" %i)
+    print("- # of results with mean diff better than delta(%d) CI%d%% = %d" %(delta, ci_threshold, len(global_stats['better_than_delta_p95'])))
+    print("    > (meandiff, labels)")
+    for i in global_stats['better_than_delta_p95']:
+      print("    > %s" %i)
+
+def main():
+  global _debug
+  global _DELTA
+  global _PVALUE_THRESHOLD
+
+  opts = parse_options()
+  _debug = opts.debug
+  _debug_print("parsed options: ", opts)
+
+  _PVALUE_THRESHOLD = opts.pvalue_threshold or _PVALUE_THRESHOLD
+  _DELTA = opts.delta_threshold or _DELTA
+
+  for file_name in opts.input_files:
+    with open(file_name, 'r') as input_file:
+      (grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file)
+      print_analysis(grouped_numpy_iter, label_header, data_header, opts.output_samples)
+
+    with open(file_name, 'r') as input_file:
+      (grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file)
+      without_baseline_iter = group_by_without_baseline_key(grouped_numpy_iter, label_header)
+      #_debug_print_gen(without_baseline_iter)
+
+      comparable_metrics_iter = iterate_comparable_metrics(without_baseline_iter, label_header)
+      print_comparable_analysis(comparable_metrics_iter, label_header, data_header, opts.output_comparable, opts.output_comparable_significant)
+
+  return 0
+
+
+if __name__ == '__main__':
+  sys.exit(main())
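
Note: the script expects CSV input whose first three columns match _LABEL_COLUMNS ('packages', 'readaheads', 'compiler_filters') followed by integer metric columns, with rows starting with ';' skipped as comments. A hypothetical input file could look like the sketch below; the metric column name and the values are made up for illustration, only the three label columns are defined by the script itself. The invocation line follows the docstring's $> convention and uses the default thresholds.

packages,readaheads,compiler_filters,total_time_ms
com.example.app,cold,speed-profile,812
com.example.app,cold,speed-profile,845
com.example.app,warm,speed-profile,671
;comment rows like this one are ignored

$> ./analyze_metrics.py results.csv -os samples.csv -oc comparable.csv -pt 0.10 -dt 50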
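
The delta-shifted, right-tailed two-sample test implemented by stats_standard_error, stats_tvalue and stats_pvalue can be sanity-checked standalone. The snippet below is a minimal sketch with made-up numbers, assuming one-dimensional sample arrays; it applies the same formulas as the script (standard deviations with ddof=0, standard error of the mean difference, and n_a + n_b - 2 degrees of freedom).

import numpy as np
from scipy import stats as sc

cold = np.array([820, 850, 790, 805, 835])  # hypothetical baseline samples (ms)
warm = np.array([700, 720, 690, 705, 715])  # hypothetical comparison samples (ms)
delta = 50

# standard error of the mean difference, as in stats_standard_error
se = np.sqrt(cold.std(ddof=0)**2 / len(cold) + warm.std(ddof=0)**2 / len(warm))
# t statistic shifted by delta, as in stats_tvalue
t = (cold.mean() - warm.mean() - delta) / se
dof = len(cold) + len(warm) - 2
# right-tailed p-value, as in stats_pvalue(left=False); a small p suggests the
# baseline mean exceeds the comparison mean by more than delta
p = sc.t.sf(t, dof)
print("t=%s, p=%s" % (t, p))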