12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364 |
- #!/usr/bin/env python3
- import csv
- from pathlib import Path
- import argparse
def parse_arguments():
    """Build and run the command-line parser for the splitter script.

    Returns an ``argparse.Namespace`` with ``input_file`` and
    ``output_path``, both converted to ``pathlib.Path``.
    """
    arg_parser = argparse.ArgumentParser(description=__doc__)
    # Both positionals share the same shape, so register them table-driven.
    for name, help_text in (
        ('input_file', 'PRODCOM csv file to split'),
        ('output_path', 'path to save country csv files'),
    ):
        arg_parser.add_argument(name, type=Path, help=help_text)
    return arg_parser.parse_args()
def split_bulk_csv(csvfile, output_path):
    """Split a PRODCOM bulk csv file into one csv per (declarant, year).

    Each output file is named ``<input stem>_<decl>-<year>.csv`` and
    contains the header plus all rows for that declarant/year, minus the
    three leading columns that are constant across the whole file.

    :param csvfile: path to the PRODCOM bulk csv to split
    :param output_path: directory (``Path``) to write the split files into
    :raises ValueError: if one of the supposedly-fixed leading columns
        changes value partway through the file
    """
    # (decl, year) -> (open file handle, DictWriter) for the current declarant
    outputs = {}
    output_name = Path(csvfile).stem
    last_decl = None
    fixed_values = None
    fixed_value_columns = ["DATAFLOW", "LAST UPDATE", "freq"]
    try:
        # newline='' per the csv module docs (it handles line endings itself)
        with open(csvfile, newline='') as fin:
            csvin = csv.DictReader(fin)
            assert csvin.fieldnames is not None
            # The first three columns are always the same, so verify and drop them
            assert csvin.fieldnames[:3] == fixed_value_columns
            var_columns = csvin.fieldnames[3:]
            for row in csvin:
                decl = row['decl']
                year = row['TIME_PERIOD']
                if fixed_values is None:
                    # Remember the first row's values for the constant columns
                    fixed_values = {x: row[x] for x in fixed_value_columns}
                elif any(row[x] != fixed_values[x] for x in fixed_value_columns):
                    # BUG FIX: was ValueError("...: %s", row) — logging-style
                    # args left the message unformatted; format it explicitly.
                    raise ValueError("Unexpected 'fixed' value: %s" % (row,))
                if decl != last_decl:
                    # Rows are grouped by declarant; close the finished
                    # declarant's files to avoid "too many open files".
                    for fout, _ in outputs.values():
                        fout.close()
                    # BUG FIX: drop the closed handles so we can never
                    # accidentally write to a closed file later.
                    outputs.clear()
                    # BUG FIX: announce the declarant we just FINISHED
                    # (the old code printed the new one, and printed on
                    # the very first row too).
                    if last_decl is not None:
                        print("done", last_decl)
                    last_decl = decl
                # If necessary open a new file and write the header
                if (decl, year) not in outputs:
                    fout = open(
                        output_path / '{}_{}-{}.csv'.format(output_name, decl, year),
                        'w', newline='')
                    dw = csv.DictWriter(fout, fieldnames=var_columns)
                    dw.writeheader()
                    outputs[decl, year] = fout, dw
                # Write the row, keeping only the variable columns
                outputs[decl, year][1].writerow({k: row[k] for k in var_columns})
    finally:
        # BUG FIX: the final declarant's files were never closed before.
        for fout, _ in outputs.values():
            fout.close()
if __name__ == "__main__":
    # Script entry point: split the given bulk file into per-country csvs.
    cli_args = parse_arguments()
    split_bulk_csv(cli_args.input_file, cli_args.output_path)
|