Ver Fonte

Tidy up doit tasks.

Rick Lupton há 1 ano atrás
pai
commit
a6046603f8
2 ficheiros alterados com 52 adições e 48 exclusões
  1. 8 0
      DEVELOPING.md
  2. 44 48
      dodo.py

+ 8 - 0
DEVELOPING.md

@@ -0,0 +1,8 @@
+# Converting the data
+
+To convert the data, run `doit run convert_data`.
+
+The results will be in the `outputs/` folder.
+
+To check that the expected values are present, run `pytest`.
+

+ 44 - 48
dodo.py

@@ -24,16 +24,6 @@ data_csv = [
     'data/PRODCOM2016DATA.csv',
     'data/PRODCOM2017DATA.csv',
     'data/PRODCOM2018DATA.csv',
-    'data/metrics_units.csv',
-    'data/unfcc_object_hierarchy.csv',
-    'data/unfcc_process_hierarchy.csv',
-    'data/unfcc_processed_USA_ghg.csv'
-]
-
-PRODCOM_DATA_CSV = [
-    'data/PRODCOM2016DATA.csv',
-    'data/PRODCOM2017DATA.csv',
-    'data/PRODCOM2018DATA.csv',
 ]
 
 probs_original_data = [
@@ -43,61 +33,67 @@ probs_original_data = [
 
 def task_preprocess():
     """Converts 'raw' data into CSV files for RDFox."""
+    raw_data_path = Path("raw_data")
+    raw_data_files = [
+        raw_data_path / Path(f).name for f in data_csv
+    ]
+    raw_data_files = [
+        f for f in raw_data_files if f.exists()
+    ]
     return {
-        'file_dep': [
-            'scripts/preprocess.py',
-
-            # TODO: generate this list by replacing "data/" with "raw_data/" in
-            # the `data_csv` list
-            'raw_data/ct-2018-exports.csv',
-            'raw_data/ct-2018-imports.csv',
-            'raw_data/HSCodeandDescription_2017.csv',
-            'raw_data/Object_table_for_ontoloy.csv',
-            'raw_data/PRD_2016_20200617_185122.csv',
-            'raw_data/PRD_2017_20200617_185035.csv',
-            'raw_data/PRODCOM2016DATA.csv',
-            'raw_data/PRODCOM2017DATA.csv',
-            'raw_data/PRODCOM2018DATA.csv'
-        ],
+        'file_dep': ['scripts/preprocess.py'] + raw_data_files,
         'targets': data_csv,
-        'actions': [
-            'python scripts/preprocess.py'
-        ],
+        'actions': ['python scripts/preprocess.py'],
     }
 
-def task_data_conversion_prodcom():
+
+
+# List of (data_type, csv_filename) pairs to be converted
+DATA_FILES = [
+    ("prodcom", 'data/PRODCOM2016DATA.csv'),
+    ("prodcom", 'data/PRODCOM2017DATA.csv'),
+    ("prodcom", 'data/PRODCOM2018DATA.csv'),
+    ("comtrade", 'data/ct-2018-exports.csv'),
+    ("prodcom_correspondence", 'data/PRC_2017_2016.csv'),
+    ("prodcom_list", 'data/PRD_2017_20200617_185035.csv'),
+]
+
+# TODO: add entries to DATA_FILES for the other data files
+#
+# (1) COMTRADE data ("ct-2018-exports" etc)
+#
+# (2) Classification code definitions "PRD_20XX..."
+#
+# (3) Correspondence tables "PRC_2017_2016.csv"
+#
+
+
+def task_convert_data():
     """Reads CSV files, runs all the rules, and converts all of them into RDF."""
-    for data_file in PRODCOM_DATA_CSV:
-        data_file = Path(data_file)
-        target_file = Path("outputs") / (data_file.stem + ".nt.gz")
+    for data_type, csv_file in DATA_FILES:
+        csv_file = Path(csv_file)
+        target_file = Path("outputs") / (csv_file.stem + ".nt.gz")
         yield {
-            'name': data_file.stem,
+            'name': csv_file.stem,
             'file_dep': [
-                data_file,
-                "scripts/load_data_prodcom.rdfox",
-                "scripts/map_prodcom.dlog",
+                csv_file,
+                f"scripts/load_data_{data_type}.rdfox",
+                f"scripts/map_{data_type}.dlog",
+            ],
+            'targets': [
+                target_file
             ],
-            'targets': [target_file],
             'actions': [
                 [
                     "python",
                     "scripts/convert_data.py",
-                    "prodcom",
-                    data_file,
+                    data_type,
+                    csv_file,
                     target_file,
                 ]
             ],
         }
 
-# TODO: repeat this type of tasks for other data files
-#
-# (1) COMTRADE data ("ct-2018-exports" etc)
-#
-# (2) Classification code definitions "PRD_20XX..."
-#
-# (3) Correspondence tables "PRC_2017_2016.csv"
-#
-# all in parallel (separately)
 
 
 # def task_data_conversion():