dodo.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. #!/usr/bin/env python3
  2. """
  3. Task definitions for doit.
  4. Run the steps to generate and query our datasets using RDFox.
  5. """
  6. from pathlib import Path
  7. from os import path
  8. from doit.tools import LongRunning
  9. rdfox_path = "RDFox" # change this if RDFox is not in your path
  10. dir_path = path.dirname(path.realpath(__file__))
  11. print(f"Running {rdfox_path} from {dir_path}")
  12. data_csv = [
  13. 'data/Object_table_for_ontoloy.csv',
  14. 'data/PRC_2017_2016.csv',
  15. 'data/PRD_2016_20200617_185122.csv',
  16. 'data/PRD_2017_20200617_185035.csv',
  17. 'data/PRODCOM2016DATA.csv',
  18. 'data/PRODCOM2017DATA.csv',
  19. 'data/PRODCOM2018DATA.csv',
  20. ]
  21. probs_original_data = [
  22. 'data/probs_original_data.nt.gz'
  23. ]
  24. def task_preprocess():
  25. """Converts 'raw' data into CSV files for RDFox."""
  26. raw_data_path = Path("raw_data")
  27. raw_data_files = [
  28. raw_data_path / Path(f).name for f in data_csv
  29. ]
  30. raw_data_files = [
  31. f for f in raw_data_files if f.exists()
  32. ]
  33. return {
  34. 'file_dep': ['scripts/preprocess.py'] + raw_data_files,
  35. 'targets': data_csv,
  36. 'actions': ['python scripts/preprocess.py'],
  37. }
  38. # List of (data_type, csv_filename) pairs to be converted
  39. DATA_FILES = [
  40. ("prodcom", 'data/PRODCOM2016DATA.csv'),
  41. ("prodcom", 'data/PRODCOM2017DATA.csv'),
  42. ("prodcom", 'data/PRODCOM2018DATA.csv'),
  43. ("prodcom_correspondence", 'data/PRC_2017_2016.csv'),
  44. ("prodcom_list", 'data/PRD_2017_20200617_185035.csv'),
  45. ("prodcom_list", 'data/PRD_2016_20200617_185122.csv'),
  46. ]
  47. def task_convert_data():
  48. """Reads CSV files, runs all the rules, and converts all of them into RDF."""
  49. for data_type, csv_file in DATA_FILES:
  50. csv_file = Path(csv_file)
  51. target_file = Path("outputs") / (csv_file.stem + ".nt.gz")
  52. yield {
  53. 'name': csv_file.stem,
  54. 'file_dep': [
  55. csv_file,
  56. f"scripts/load_data_{data_type}.rdfox",
  57. f"scripts/map_{data_type}.dlog",
  58. ],
  59. 'targets': [
  60. target_file
  61. ],
  62. 'actions': [
  63. [
  64. "python",
  65. "scripts/convert_data.py",
  66. data_type,
  67. csv_file,
  68. target_file,
  69. ]
  70. ],
  71. }
  72. # def task_data_conversion():
  73. # """Reads CSV files, runs all the rules, and converts all of them into RDF."""
  74. # return {
  75. # 'file_dep': ontology_ffs + data_csv + [
  76. # 'scripts/shared/setup-RDFox.rdfox',
  77. # 'scripts/shared/init-conversion.rdfox',
  78. # 'scripts/data-conversion/input.rdfox',
  79. # 'scripts/data-conversion/load_data.rdfox',
  80. # 'scripts/data-conversion/map.dlog',
  81. # 'scripts/data-conversion/master.rdfox',
  82. # 'scripts/data-conversion/master-pipeline.rdfox',
  83. # 'scripts/data-conversion/output.rdfox',
  84. # 'scripts/data-conversion/save_data.rdfox',
  85. # 'scripts/data-conversion/unit_conversion.dlog',
  86. # ],
  87. # 'targets': probs_original_data,
  88. # 'actions':
  89. # [
  90. # f'{rdfox_path} -sandbox-directory {dir_path} sandbox {dir_path} scripts/data-conversion/master'
  91. # ],
  92. # }