diff options
Diffstat (limited to 'scripts/hiddenapi/merge_csv.py')
-rwxr-xr-x | scripts/hiddenapi/merge_csv.py | 93 |
1 files changed, 93 insertions, 0 deletions
diff --git a/scripts/hiddenapi/merge_csv.py b/scripts/hiddenapi/merge_csv.py new file mode 100755 index 000000000..a65326c51 --- /dev/null +++ b/scripts/hiddenapi/merge_csv.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# +# Copyright (C) 2018 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Merge multiple CSV files, possibly with different columns. +""" + +import argparse +import csv +import io +import heapq +import itertools +import operator + +from zipfile import ZipFile + +args_parser = argparse.ArgumentParser(description='Merge given CSV files into a single one.') +args_parser.add_argument('--header', help='Comma separated field names; ' + 'if missing determines the header from input files.') +args_parser.add_argument('--zip_input', help='Treat files as ZIP archives containing CSV files to merge.', + action="store_true") +args_parser.add_argument('--key_field', help='The name of the field by which the rows should be sorted. ' + 'Must be in the field names. ' + 'Will be the first field in the output. ' + 'All input files must be sorted by that field.') +args_parser.add_argument('--output', help='Output file for merged CSV.', + default='-', type=argparse.FileType('w')) +args_parser.add_argument('files', nargs=argparse.REMAINDER) +args = args_parser.parse_args() + + +def dict_reader(input): + return csv.DictReader(input, delimiter=',', quotechar='|') + +csv_readers = [] +if not(args.zip_input): + for file in args.files: + csv_readers.append(dict_reader(open(file, 'r'))) +else: + for file in args.files: + with ZipFile(file) as zip: + for entry in zip.namelist(): + if entry.endswith('.uau'): + csv_readers.append(dict_reader(io.TextIOWrapper(zip.open(entry, 'r')))) + +if args.header: + fieldnames = args.header.split(',') +else: + headers = {} + # Build union of all columns from source files: + for reader in csv_readers: + for fieldname in reader.fieldnames: + headers[fieldname] = "" + fieldnames = list(headers.keys()) + +# By default chain the csv readers together so that the resulting output is +# the concatenation of the rows from each of them: +all_rows = itertools.chain.from_iterable(csv_readers) + +if len(csv_readers) > 0: + keyField = args.key_field + if keyField: + assert keyField in fieldnames, ( + "--key_field {} not found, must be one of {}\n").format( + keyField, ",".join(fieldnames)) + # Make the key field the first field in the output + keyFieldIndex = fieldnames.index(args.key_field) + fieldnames.insert(0, fieldnames.pop(keyFieldIndex)) + # Create an iterable that performs a lazy merge sort on the csv readers + # sorting the rows by the key field. + all_rows = heapq.merge(*csv_readers, key=operator.itemgetter(keyField)) + +# Write all rows from the input files to the output: +writer = csv.DictWriter(args.output, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL, + dialect='unix', fieldnames=fieldnames) +writer.writeheader() + +# Read all the rows from the input and write them to the output in the correct +# order: +for row in all_rows: + writer.writerow(row) |