há 1 ano atrás · a6046603f8
--- a/DEVELOPING.md
+++ b/DEVELOPING.md
@@ -0,0 +1,8 @@
 
				+# Converting the data
			
 
				+
			
 
				+To convert the data, run `doit run convert_data`.
			
 
				+
			
 
				+The results will be in the `outputs/` folder.
			
 
				+
			
 
				+To test that the expected values are present, run `pytest`.
			
 
				+
			
--- a/dodo.py
+++ b/dodo.py
@@ -24,16 +24,6 @@ data_csv = [
 
				     'data/PRODCOM2016DATA.csv',
			
 
				     'data/PRODCOM2017DATA.csv',
			
 
				     'data/PRODCOM2018DATA.csv',
			
 
				-    'data/metrics_units.csv',
			
 
				-    'data/unfcc_object_hierarchy.csv',
			
 
				-    'data/unfcc_process_hierarchy.csv',
			
 
				-    'data/unfcc_processed_USA_ghg.csv'
			
 
				-]
			
 
				-
			
 
				-PRODCOM_DATA_CSV = [
			
 
				-    'data/PRODCOM2016DATA.csv',
			
 
				-    'data/PRODCOM2017DATA.csv',
			
 
				-    'data/PRODCOM2018DATA.csv',
			
 
				 ]
			
 
				 
			
 
				 probs_original_data = [
			
@@ -43,61 +33,67 @@ probs_original_data = [
 
				 
			
 
				 def task_preprocess():
			
 
				     """Converts 'raw' data into CSV files for RDFox."""
			
 
				+    raw_data_path = Path("raw_data")
			
 
				+    raw_data_files = [
			
 
				+        raw_data_path / Path(f).name for f in data_csv
			
 
				+    ]
			
 
				+    raw_data_files = [
			
 
				+        f for f in raw_data_files if f.exists()
			
 
				+    ]
			
 
				     return {
			
 
				-        'file_dep': [
			
 
				-            'scripts/preprocess.py',
			
 
				-
			
 
				-            # TODO: generate this list by replacing "data/" with "raw_data/" in
			
 
				-            # the `data_csv` list
			
 
				-            'raw_data/ct-2018-exports.csv',
			
 
				-            'raw_data/ct-2018-imports.csv',
			
 
				-            'raw_data/HSCodeandDescription_2017.csv',
			
 
				-            'raw_data/Object_table_for_ontoloy.csv',
			
 
				-            'raw_data/PRD_2016_20200617_185122.csv',
			
 
				-            'raw_data/PRD_2017_20200617_185035.csv',
			
 
				-            'raw_data/PRODCOM2016DATA.csv',
			
 
				-            'raw_data/PRODCOM2017DATA.csv',
			
 
				-            'raw_data/PRODCOM2018DATA.csv'
			
 
				-        ],
			
 
				+        'file_dep': ['scripts/preprocess.py'] + raw_data_files,
			
 
				         'targets': data_csv,
			
 
				-        'actions': [
			
 
				-            'python scripts/preprocess.py'
			
 
				-        ],
			
 
				+        'actions': ['python scripts/preprocess.py'],
			
 
				     }
			
 
				 
			
 
				-def task_data_conversion_prodcom():
			
 
				+
			
 
				+
			
 
				+# List of (data_type, csv_filename) pairs to be converted
			
 
				+DATA_FILES = [
			
 
				+    ("prodcom", 'data/PRODCOM2016DATA.csv'),
			
 
				+    ("prodcom", 'data/PRODCOM2017DATA.csv'),
			
 
				+    ("prodcom", 'data/PRODCOM2018DATA.csv'),
			
 
				+    ("comtrade", 'data/ct-2018-exports.csv'),
			
 
				+    ("prodcom_correspondence", 'data/PRC_2017_2016.csv'),
			
 
				+    ("prodcom_list", 'data/PRD_2017_20200617_185035.csv'),
			
 
				+]
			
 
				+
			
 
				+# TODO: extend the DATA_FILES list above with the remaining data files:
			
 
				+#
			
 
				+# (1) COMTRADE data ("ct-2018-exports" etc)
			
 
				+#
			
 
				+# (2) Classification code definitions "PRD_20XX..."
			
 
				+#
			
 
				+# (3) Correspondence tables "PRC_2017_2016.csv"
			
 
				+#
			
 
				+
			
 
				+
			
 
				+def task_convert_data():
			
 
				     """Reads CSV files, runs all the rules, and converts all of them into RDF."""
			
 
				-    for data_file in PRODCOM_DATA_CSV:
			
 
				-        data_file = Path(data_file)
			
 
				-        target_file = Path("outputs") / (data_file.stem + ".nt.gz")
			
 
				+    for data_type, csv_file in DATA_FILES:
			
 
				+        csv_file = Path(csv_file)
			
 
				+        target_file = Path("outputs") / (csv_file.stem + ".nt.gz")
			
 
				         yield {
			
 
				-            'name': data_file.stem,
			
 
				+            'name': csv_file.stem,
			
 
				             'file_dep': [
			
 
				-                data_file,
			
 
				-                "scripts/load_data_prodcom.rdfox",
			
 
				-                "scripts/map_prodcom.dlog",
			
 
				+                csv_file,
			
 
				+                f"scripts/load_data_{data_type}.rdfox",
			
 
				+                f"scripts/map_{data_type}.dlog",
			
 
				+            ],
			
 
				+            'targets': [
			
 
				+                target_file
			
 
				             ],
			
 
				-            'targets': [target_file],
			
 
				             'actions': [
			
 
				                 [
			
 
				                     "python",
			
 
				                     "scripts/convert_data.py",
			
 
				-                    "prodcom",
			
 
				-                    data_file,
			
 
				+                    data_type,
			
 
				+                    csv_file,
			
 
				                     target_file,
			
 
				                 ]
			
 
				             ],
			
 
				         }
			
 
				 
			
 
				-# TODO: repeat this type of tasks for other data files
			
 
				-#
			
 
				-# (1) COMTRADE data ("ct-2018-exports" etc)
			
 
				-#
			
 
				-# (2) Classification code definitions "PRD_20XX..."
			
 
				-#
			
 
				-# (3) Correspondence tables "PRC_2017_2016.csv"
			
 
				-#
			
 
				-# all in parallel (separately)
			
 
				 
			
 
				 
			
 
				 # def task_data_conversion():