dodo.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. #!/usr/bin/env python3
  2. """
  3. Task definitions for doit.
  4. Run the steps to generate and query our datasets using RDFox.
  5. """
  6. import csv
  7. from pathlib import Path
  8. from os import path
  9. from doit.tools import LongRunning
  10. rdfox_path = "RDFox" # change this if RDFox is not in your path
  11. dir_path = path.dirname(path.realpath(__file__))
  12. print(f"Running {rdfox_path} from {dir_path}")
  13. data_csv = [
  14. 'data/Object_table_for_ontoloy.csv',
  15. 'data/PRC_2017_2016.csv',
  16. 'data/PRD_2016_20200617_185122.csv',
  17. 'data/PRD_2017_20200617_185035.csv',
  18. 'data/PRODCOM2016DATA.csv',
  19. 'data/PRODCOM2017DATA.csv',
  20. 'data/PRODCOM2018DATA.csv',
  21. ]
  22. probs_original_data = [
  23. 'data/probs_original_data.nt.gz'
  24. ]
  25. def task_preprocess():
  26. """Converts 'raw' data into CSV files for RDFox."""
  27. raw_data_path = Path("raw_data")
  28. raw_data_files = [
  29. raw_data_path / Path(f).name for f in data_csv
  30. ]
  31. raw_data_files = [
  32. f for f in raw_data_files if f.exists()
  33. ]
  34. return {
  35. 'file_dep': ['scripts/preprocess.py'] + raw_data_files,
  36. 'targets': data_csv,
  37. 'actions': ['python scripts/preprocess.py'],
  38. }
  39. # List of (data_type, csv_filename) pairs to be converted
  40. DATA_FILES = [
  41. ("prodcom", 'data/PRODCOM2016DATA.csv'),
  42. ("prodcom", 'data/PRODCOM2017DATA.csv'),
  43. ("prodcom", 'data/PRODCOM2018DATA.csv'),
  44. ("prodcom_correspondence", 'data/PRC_2017_2016.csv'),
  45. ("prodcom_list", 'data/PRD_2017_20200617_185035.csv'),
  46. ("prodcom_list", 'data/PRD_2016_20200617_185122.csv'),
  47. ]
  48. DATA_BULK = [
  49. ("prodcom_bulk_sold" , 'bulk_data/sold_production', 'outputs/sold_production'),
  50. ("prodcom_bulk_total", 'bulk_data/total_production', 'outputs/total_production'),
  51. ]
  52. def task_convert_data():
  53. """Reads CSV files, runs all the rules, and converts all of them into RDF."""
  54. for data_type, csv_file in DATA_FILES:
  55. csv_file = Path(csv_file)
  56. target_file = Path("outputs") / (csv_file.stem + ".nt.gz")
  57. yield {
  58. 'name': csv_file.stem,
  59. 'file_dep': [
  60. csv_file,
  61. f"scripts/load_data_{data_type}.rdfox",
  62. f"scripts/map_{data_type}.dlog",
  63. ],
  64. 'targets': [
  65. target_file
  66. ],
  67. 'actions': [
  68. [
  69. "python",
  70. "scripts/convert_data.py",
  71. data_type,
  72. csv_file,
  73. target_file,
  74. ]
  75. ],
  76. }
  77. def task_convert_bulk():
  78. for data_type, csv_dir, output_dir in DATA_BULK:
  79. bulk_data_path = Path(csv_dir)
  80. bulk_data_files = bulk_data_path.glob("*.csv")
  81. for csv_file in bulk_data_files:
  82. target_file = Path(output_dir) / (csv_file.stem + ".nt.gz")
  83. yield {
  84. 'name': csv_file.stem,
  85. 'file_dep': [
  86. csv_file,
  87. "scripts/load_data_prodcom_bulk.rdfox",
  88. f"scripts/map_{data_type}.dlog",
  89. ],
  90. 'targets': [
  91. target_file
  92. ],
  93. 'actions': [
  94. [
  95. "python",
  96. "scripts/convert_data.py",
  97. data_type,
  98. csv_file,
  99. target_file,
  100. ]
  101. ],
  102. }