# probs-lab/prodcom-data


			
#!/usr/bin/env python3

import csv
from pathlib import Path
import argparse


def parse_arguments():
    """Parse the command line of the splitter script.

    Two positional arguments are expected, both converted to ``Path``:
    ``input_file`` (the PRODCOM csv file to split) and ``output_path``
    (where the per-country csv files are saved).

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    # (destination name, help text) for every positional, in CLI order.
    positionals = (
        ('input_file', 'PRODCOM csv file to split'),
        ('output_path', 'path to save country csv files'),
    )
    cli = argparse.ArgumentParser(description=__doc__)
    for dest, help_text in positionals:
        cli.add_argument(dest, type=Path, help=help_text)
    return cli.parse_args()


def split_bulk_csv(csvfile, output_path):
    """Split a bulk CSV into one CSV file per (decl, TIME_PERIOD) pair.

    The first three columns ("DATAFLOW", "LAST UPDATE", "freq") must carry
    the same values on every row; they are verified and then dropped. All
    remaining columns are copied to
    ``<output_path>/<input stem>_<decl>-<TIME_PERIOD>.csv``, each file
    starting with a header row.

    Only the files of the declarant currently being read are kept open, so
    the number of open files stays small when rows are grouped by ``decl``.
    If a declarant shows up again later in the input, its files are reopened
    in append mode rather than being truncated.

    Args:
        csvfile: Path of the bulk CSV file to read.
        output_path: Directory in which the split files are created (it is
            not created by this function).

    Raises:
        ValueError: If the input has no header row, if the header does not
            start with the three fixed columns, or if a row's fixed columns
            differ from those of the first row.
        KeyError: If a row lacks the 'decl' or 'TIME_PERIOD' column.
    """
    output_path = Path(output_path)
    output_name = Path(csvfile).stem
    fixed_value_columns = ["DATAFLOW", "LAST UPDATE", "freq"]

    # (decl, year) -> (open file, DictWriter) for the current declarant only
    outputs = {}
    # Every (decl, year) whose file has already been created during this run;
    # used to decide between truncating ('w') and appending ('a').
    started = set()
    last_decl = None
    fixed_values = None

    def close_outputs():
        # Close and forget all cached handles so that a stale, closed file
        # can never be picked up again from the lookup.
        for fout, _ in outputs.values():
            fout.close()
        outputs.clear()

    # newline='' is what the csv module requires for files it reads/writes.
    with open(csvfile, newline='') as fin:
        csvin = csv.DictReader(fin)
        fieldnames = csvin.fieldnames
        if fieldnames is None:
            raise ValueError("No header row found in {}".format(csvfile))
        # These are always the same so don't bother keeping
        if list(fieldnames[:3]) != fixed_value_columns:
            raise ValueError(
                "Expected the first columns to be {}, got {}".format(
                    fixed_value_columns, list(fieldnames[:3])))
        var_columns = fieldnames[3:]

        try:
            for row in csvin:
                decl = row['decl']
                year = row['TIME_PERIOD']

                if fixed_values is None:
                    fixed_values = {x: row[x] for x in fixed_value_columns}
                elif any(row[x] != fixed_values[x]
                         for x in fixed_value_columns):
                    raise ValueError(
                        "Unexpected 'fixed' value: {}".format(row))

                if decl != last_decl:
                    # close all the files to prevent too many open files
                    close_outputs()
                    # NOTE: printed when a new declarant starts (including
                    # the very first one), naming the declarant that begins.
                    print("done", decl)
                    last_decl = decl

                # if necessary open a file; write the header only the first
                # time this (decl, year) is seen in this run
                key = (decl, year)
                if key not in outputs:
                    target = output_path / '{}_{}-{}.csv'.format(
                        output_name, decl, year)
                    resumed = key in started
                    fout = open(target, 'a' if resumed else 'w', newline='')
                    dw = csv.DictWriter(fout, fieldnames=var_columns)
                    # Register before writing so the handle is closed by the
                    # finally-clause even if the header write fails.
                    outputs[key] = fout, dw
                    if not resumed:
                        dw.writeheader()
                        started.add(key)

                # write the row
                outputs[key][1].writerow({k: row[k] for k in var_columns})
        finally:
            # Flush and release whatever is still open, also on errors.
            close_outputs()


if __name__ == "__main__":
    # Script entry point: read the CLI arguments and run the split.
    cli_args = parse_arguments()
    split_bulk_csv(csvfile=cli_args.input_file,
                   output_path=cli_args.output_path)