|
@@ -24,16 +24,6 @@ data_csv = [
|
|
|
'data/PRODCOM2016DATA.csv',
|
|
|
'data/PRODCOM2017DATA.csv',
|
|
|
'data/PRODCOM2018DATA.csv',
|
|
|
- 'data/metrics_units.csv',
|
|
|
- 'data/unfcc_object_hierarchy.csv',
|
|
|
- 'data/unfcc_process_hierarchy.csv',
|
|
|
- 'data/unfcc_processed_USA_ghg.csv'
|
|
|
-]
|
|
|
-
|
|
|
-PRODCOM_DATA_CSV = [
|
|
|
- 'data/PRODCOM2016DATA.csv',
|
|
|
- 'data/PRODCOM2017DATA.csv',
|
|
|
- 'data/PRODCOM2018DATA.csv',
|
|
|
]
|
|
|
|
|
|
probs_original_data = [
|
|
@@ -43,61 +33,67 @@ probs_original_data = [
|
|
|
|
|
|
def task_preprocess():
|
|
|
"""Converts 'raw' data into CSV files for RDFox."""
|
|
|
+ raw_data_path = Path("raw_data")
|
|
|
+ raw_data_files = [
|
|
|
+ raw_data_path / Path(f).name for f in data_csv
|
|
|
+ ]
|
|
|
+ raw_data_files = [
|
|
|
+ f for f in raw_data_files if f.exists()
|
|
|
+ ]
|
|
|
return {
|
|
|
- 'file_dep': [
|
|
|
- 'scripts/preprocess.py',
|
|
|
-
|
|
|
- # TODO: generate this list by replacing "data/" with "raw_data/" in
|
|
|
- # the `data_csv` list
|
|
|
- 'raw_data/ct-2018-exports.csv',
|
|
|
- 'raw_data/ct-2018-imports.csv',
|
|
|
- 'raw_data/HSCodeandDescription_2017.csv',
|
|
|
- 'raw_data/Object_table_for_ontoloy.csv',
|
|
|
- 'raw_data/PRD_2016_20200617_185122.csv',
|
|
|
- 'raw_data/PRD_2017_20200617_185035.csv',
|
|
|
- 'raw_data/PRODCOM2016DATA.csv',
|
|
|
- 'raw_data/PRODCOM2017DATA.csv',
|
|
|
- 'raw_data/PRODCOM2018DATA.csv'
|
|
|
- ],
|
|
|
+ 'file_dep': ['scripts/preprocess.py'] + raw_data_files,
|
|
|
'targets': data_csv,
|
|
|
- 'actions': [
|
|
|
- 'python scripts/preprocess.py'
|
|
|
- ],
|
|
|
+ 'actions': ['python scripts/preprocess.py'],
|
|
|
}
|
|
|
|
|
|
-def task_data_conversion_prodcom():
|
|
|
+
|
|
|
+
|
|
|
+# List of (data_type, csv_filename) pairs to be converted
|
|
|
+DATA_FILES = [
|
|
|
+ ("prodcom", 'data/PRODCOM2016DATA.csv'),
|
|
|
+ ("prodcom", 'data/PRODCOM2017DATA.csv'),
|
|
|
+ ("prodcom", 'data/PRODCOM2018DATA.csv'),
|
|
|
+ ("comtrade", 'data/ct-2018-exports.csv'),
|
|
|
+ ("prodcom_correspondence", 'data/PRC_2017_2016.csv'),
|
|
|
+ ("prodcom_list", 'data/PRD_2017_20200617_185035.csv'),
|
|
|
+]
|
|
|
+
|
|
|
+# TODO: add to this list for other data files
|
|
|
+#
|
|
|
+# (1) COMTRADE data ("ct-2018-exports" etc)
|
|
|
+#
|
|
|
+# (2) Classification code definitions "PRD_20XX..."
|
|
|
+#
|
|
|
+# (3) Correspondence tables "PRC_2017_2016.csv"
|
|
|
+#
|
|
|
+
|
|
|
+
|
|
|
+def task_convert_data():
|
|
|
"""Reads CSV files, runs all the rules, and converts all of them into RDF."""
|
|
|
- for data_file in PRODCOM_DATA_CSV:
|
|
|
- data_file = Path(data_file)
|
|
|
- target_file = Path("outputs") / (data_file.stem + ".nt.gz")
|
|
|
+ for data_type, csv_file in DATA_FILES:
|
|
|
+ csv_file = Path(csv_file)
|
|
|
+ target_file = Path("outputs") / (csv_file.stem + ".nt.gz")
|
|
|
yield {
|
|
|
- 'name': data_file.stem,
|
|
|
+ 'name': csv_file.stem,
|
|
|
'file_dep': [
|
|
|
- data_file,
|
|
|
- "scripts/load_data_prodcom.rdfox",
|
|
|
- "scripts/map_prodcom.dlog",
|
|
|
+ csv_file,
|
|
|
+ f"scripts/load_data_{data_type}.rdfox",
|
|
|
+ f"scripts/map_{data_type}.dlog",
|
|
|
+ ],
|
|
|
+ 'targets': [
|
|
|
+ target_file
|
|
|
],
|
|
|
- 'targets': [target_file],
|
|
|
'actions': [
|
|
|
[
|
|
|
"python",
|
|
|
"scripts/convert_data.py",
|
|
|
- "prodcom",
|
|
|
- data_file,
|
|
|
+ data_type,
|
|
|
+ csv_file,
|
|
|
target_file,
|
|
|
]
|
|
|
],
|
|
|
}
|
|
|
|
|
|
-# TODO: repeat this type of tasks for other data files
|
|
|
-#
|
|
|
-# (1) COMTRADE data ("ct-2018-exports" etc)
|
|
|
-#
|
|
|
-# (2) Classification code definitions "PRD_20XX..."
|
|
|
-#
|
|
|
-# (3) Correspondence tables "PRC_2017_2016.csv"
|
|
|
-#
|
|
|
-# all in parallel (separately)
|
|
|
|
|
|
|
|
|
# def task_data_conversion():
|