split_by_country_year.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. #!/usr/bin/env python3
  2. import csv
  3. from pathlib import Path
  4. import argparse
  5. def parse_arguments():
  6. parser = argparse.ArgumentParser(description=__doc__)
  7. parser.add_argument('input_file',
  8. type=Path,
  9. help='PRODCOM csv file to split')
  10. parser.add_argument('output_path',
  11. type=Path,
  12. help='path to save country csv files')
  13. return parser.parse_args()
  14. def split_bulk_csv(csvfile, output_path):
  15. # Category -> open file lookup
  16. outputs = {}
  17. output_name = Path(csvfile).stem
  18. last_decl = None
  19. fixed_values = None
  20. fixed_value_columns = ["DATAFLOW", "LAST UPDATE", "freq"]
  21. with open(csvfile) as fin:
  22. csvin = csv.DictReader(fin)
  23. assert csvin.fieldnames is not None
  24. # These are always the same so don't bother keeping
  25. assert csvin.fieldnames[:3] == fixed_value_columns
  26. var_columns = csvin.fieldnames[3:]
  27. for row in csvin:
  28. decl = row['decl']
  29. year = row['TIME_PERIOD']
  30. if fixed_values is None:
  31. fixed_values = {x: row[x] for x in fixed_value_columns}
  32. else:
  33. if any(row[x] != fixed_values[x] for x in fixed_value_columns):
  34. raise ValueError("Unexpected 'fixed' value: %s", row)
  35. if decl != last_decl:
  36. # close all the files to prevent too many open files
  37. for fout, _ in outputs.values():
  38. fout.close()
  39. print("done", decl)
  40. last_decl = decl
  41. # if necessary open a new file and write the header
  42. if (decl, year) not in outputs:
  43. fout = open(output_path / '{}_{}-{}.csv'.format(output_name, decl, year), 'w')
  44. dw = csv.DictWriter(fout, fieldnames=var_columns)
  45. dw.writeheader()
  46. outputs[decl, year] = fout, dw
  47. # write the row
  48. outputs[decl, year][1].writerow({k: row[k] for k in var_columns})
  49. if __name__ == "__main__":
  50. args = parse_arguments()
  51. split_bulk_csv(args.input_file, args.output_path)