
Update scripts to convert PRODCOM years separately

Using probs_runner rather than directly running the scripts like we used
to in the probs-docs repository.
Rick Lupton committed 1 year ago
commit 25d2ffbb0c
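
For context: probs_runner drives RDFox from Python, so each datasource bundles its input files, loading script, and mapping rules, and a single call performs the conversion. A minimal sketch of the pattern, using the names that appear in scripts/convert_data.py below (the import path is an assumption based on the probs-runner package):

    from pathlib import Path
    from probs_runner import Datasource, probs_convert_data  # assumed import path

    datasource = Datasource.from_files(
        input_files={"data.csv": Path("data/PRODCOM2016DATA.csv")},
        load_data_script=[Path("scripts/load_data_prodcom.rdfox")],
        rules=[Path("scripts/map_prodcom.dlog")],
    )
    probs_convert_data([datasource], Path("outputs/PRODCOM2016DATA.nt.gz"))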

+ 4 - 0
data/PRODCOM2016DATA_defs.dlog

@@ -0,0 +1,4 @@
+[:CurrentImport, :hasTimePeriod, :TimePeriod_YearOf2016] .
+[:CurrentImport, :partOfDataset, ufpc:PRODCOM2016DATA] .
+[:CurrentImport, :useDataPrefix, ufpcd2016:] .
+[:CurrentImport, :useObjectPrefix, ufpc2016:] .

+ 4 - 0
data/PRODCOM2017DATA_defs.dlog

@@ -0,0 +1,4 @@
+[:CurrentImport, :hasTimePeriod, :TimePeriod_YearOf2017] .
+[:CurrentImport, :partOfDataset, ufpc:PRODCOM2017DATA] .
+[:CurrentImport, :useDataPrefix, ufpcd2017:] .
+[:CurrentImport, :useObjectPrefix, ufpc2017:] .

+ 4 - 0
data/PRODCOM2018DATA_defs.dlog

@@ -0,0 +1,4 @@
+[:CurrentImport, :hasTimePeriod, :TimePeriod_YearOf2018] .
+[:CurrentImport, :partOfDataset, ufpc:PRODCOM2018DATA] .
+[:CurrentImport, :useDataPrefix, ufpcd2018:] .
+[:CurrentImport, :useObjectPrefix, ufpc2018:] .
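
The three _defs.dlog files above differ only in the year. They could equally be generated on the fly, as the comment added to scripts/convert_data.py below suggests; a sketch, assuming a StringIO object is accepted wherever a rules file path is:

    from io import StringIO

    def prodcom_defs(year: int) -> StringIO:
        # Mirrors the contents of data/PRODCOM<year>DATA_defs.dlog
        return StringIO(
            f"[:CurrentImport, :hasTimePeriod, :TimePeriod_YearOf{year}] .\n"
            f"[:CurrentImport, :partOfDataset, ufpc:PRODCOM{year}DATA] .\n"
            f"[:CurrentImport, :useDataPrefix, ufpcd{year}:] .\n"
            f"[:CurrentImport, :useObjectPrefix, ufpc{year}:] .\n"
        )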

data/additional_info.ttl → data/old/additional_info.ttl


+ 63 - 147
dodo.py

@@ -5,6 +5,7 @@ Task definitions for doit.
 Run the steps to generate and query our datasets using RDFox.
 """
 
+from pathlib import Path
 from os import path
 from doit.tools import LongRunning
 
@@ -29,41 +30,15 @@ data_csv = [
     'data/unfcc_processed_USA_ghg.csv'
 ]
 
-ontology_ffs = [
-    'data/probs.fss'
+PRODCOM_DATA_CSV = [
+    'data/PRODCOM2016DATA.csv',
+    'data/PRODCOM2017DATA.csv',
+    'data/PRODCOM2018DATA.csv',
 ]
 
 probs_original_data = [
     'data/probs_original_data.nt.gz'
 ]
-probs_enhanced_data = [
-    'data/probs_enhanced_data.nt.gz'
-]
-
-reasoning_input = ontology_ffs + probs_enhanced_data + [
-    'scripts/shared/setup-RDFox.rdfox',
-    'scripts/shared/init-reasoning.rdfox',
-    'scripts/reasoning/input.rdfox',
-    'scripts/reasoning/process.rdfox',
-    'scripts/reasoning/rules.dlog'
-]
-
-
-def task_ontology_conversion():
-    """Converts the Turtle ontology into Functional-Style OWL."""
-    return {
-        'file_dep': [
-            'ontology/probs.ttl',
-            'scripts/ontology-conversion/master.rdfox',
-            'scripts/shared/setup-RDFox.rdfox',
-            'scripts/ontology-conversion/init.rdfox',
-            'scripts/ontology-conversion/convert.rdfox'
-        ],
-        'targets': ontology_ffs,
-        'actions': [
-            f'{rdfox_path} -sandbox-directory {dir_path} sandbox {dir_path} scripts/ontology-conversion/master'
-        ],
-    }
 
 
 def task_preprocess():
@@ -71,10 +46,11 @@ def task_preprocess():
     return {
         'file_dep': [
             'scripts/preprocess.py',
+
+            # TODO: generate this list by replacing "data/" with "raw_data/" in
+            # the `data_csv` list
             'raw_data/ct-2018-exports.csv',
             'raw_data/ct-2018-imports.csv',
-            'raw_data/EHS_Houseing_stock_2018_byAge.csv',
-            'raw_data/EHS_Houseing_stock_2018_byType.csv',
             'raw_data/HSCodeandDescription_2017.csv',
             'raw_data/Object_table_for_ontoloy.csv',
             'raw_data/PRD_2016_20200617_185122.csv',
@@ -89,119 +65,59 @@ def task_preprocess():
         ],
     }
 
-
-def task_data_conversion():
-    """Reads CSV files, runs all the rules, and converts all of them into RDF."""
-    return {
-        'file_dep': ontology_ffs + data_csv + [
-            'scripts/shared/setup-RDFox.rdfox',
-            'scripts/shared/init-conversion.rdfox',
-            'scripts/data-conversion/input.rdfox',
-            'scripts/data-conversion/load_data.rdfox',
-            'scripts/data-conversion/map.dlog',
-            'scripts/data-conversion/master.rdfox',
-            'scripts/data-conversion/master-pipeline.rdfox',
-            'scripts/data-conversion/output.rdfox',
-            'scripts/data-conversion/save_data.rdfox',
-            'scripts/data-conversion/unit_conversion.dlog',
-        ],
-        'targets': probs_original_data,
-        'actions':
-        [
-            f'{rdfox_path} -sandbox-directory {dir_path} sandbox {dir_path} scripts/data-conversion/master'
-        ],
-    }
-
-
-def task_data_enhancement():
+def task_data_conversion_prodcom():
     """Reads CSV files, runs all the rules, and converts all of them into RDF."""
-    return {
-        'file_dep': ontology_ffs + data_csv + probs_original_data + [
-            'scripts/shared/setup-RDFox.rdfox',
-            'scripts/shared/init-conversion.rdfox',
-            'scripts/data-enhancement/input.rdfox',
-            'scripts/data-enhancement/master.rdfox',
-            'scripts/data-enhancement/master-pipeline.rdfox',
-            'scripts/data-enhancement/process.rdfox',
-            'scripts/data-enhancement/rules.dlog',
-            'scripts/data-enhancement/save_data.rdfox',
-            'scripts/data-enhancement/export_enhanced_data.rq',
-            'scripts/data-enhancement/equivalence_composition/y_compatibility_signature.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_composition_hierarchy.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_connected_bounds.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_connected_members.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_connected_wdf.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_empty_observations.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_equivalence.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_f_.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_f_stats.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_i_.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_i_stats.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_m_.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_m_stats.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_main.rdfox',
-            'scripts/data-enhancement/equivalence_composition/y_missing_measurement.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_ordering.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_parallel_world.dlog',
-            'scripts/data-enhancement/equivalence_composition/y_query_.rdfox',
-            'scripts/data-enhancement/equivalence_composition/y_query_stats.rdfox',
-        ],
-        'targets': probs_enhanced_data,
-        'actions':
-        [
-            f'{rdfox_path} -sandbox-directory {dir_path} sandbox {dir_path} scripts/data-enhancement/master'
-        ],
-    }
-
-
-def task_test_queries():
-    """Reads the RDF file with the data, answers some queries."""
-    return {
-        'file_dep': reasoning_input + [
-            'scripts/test-queries/master.rdfox',
-            'scripts/test-queries/master-pipeline.rdfox',
-            'scripts/test-queries/run_queries.rdfox',
-            'scripts/test-queries/queries/query18.rq',
-            'scripts/test-queries/queries/query18g.rq',
-            'scripts/test-queries/queries/query26.rq',
-            'scripts/test-queries/queries/query32.rq',
-            'scripts/test-queries/queries/query33.rq',
-            'scripts/test-queries/queries/queryAllObs.rq',
-            'scripts/test-queries/queries/queryHMC.rq',
-            'scripts/test-queries/queries/queryHMO.rq',
-            'scripts/test-queries/queries/queryHMObs.rq',
-            'scripts/test-queries/queries/queryROC.rq',
-            'scripts/test-queries/queries/queryWDF.rq',
-        ],
-        'targets': [
-            'output/query18.csv',
-            'output/query18g.csv',
-            'output/query26.csv',
-            'output/query32.csv',
-            'output/query33.csv',
-            'output/queryAllObs.csv',
-            'output/queryHMC.csv',
-            'output/queryHMO.csv',
-            'output/queryHMObs.csv',
-            'output/queryROC.csv',
-            'output/queryWDF.csv',
-        ],
-        'actions': [
-            [rdfox_path, "-sandbox-directory", dir_path, "sandbox", dir_path, "scripts/test-queries/master"]
-        ],
-    }
-
-
-def task_reasoning():
-    """Reads the RDF file with the data and starts the RDFox endpoint."""
-    cmd = [rdfox_path, "-sandbox-directory", dir_path, "sandbox", dir_path, "scripts/reasoning/master"]
-    return {
-        'file_dep': reasoning_input + [
-            'scripts/reasoning/master.rdfox',
-            'scripts/reasoning/master-pipeline.rdfox',
-        ],
-        "uptodate": [False],
-        'actions': [
-            LongRunning(cmd, shell=False)
-        ],
-    }
+    for data_file in PRODCOM_DATA_CSV:
+        data_file = Path(data_file)
+        target_file = Path("outputs") / (data_file.stem + ".nt.gz")
+        yield {
+            'name': data_file.stem,
+            'file_dep': [
+                data_file,
+                "scripts/load_data_prodcom.rdfox",
+                "scripts/map_prodcom.dlog",
+            ],
+            'targets': [target_file],
+            'actions': [
+                [
+                    "python",
+                    "scripts/convert_data.py",
+                    "prodcom",
+                    data_file,
+                    target_file,
+                ]
+            ],
+        }
+
+# TODO: repeat this type of task for other data files
+#
+# (1) COMTRADE data ("ct-2018-exports" etc)
+#
+# (2) Classification code definitions "PRD_20XX..."
+#
+# (3) Correspondence tables "PRC_2017_2016.csv"
+#
+# all in parallel (separately); a sketch follows after this file's diff
+
+
+# def task_data_conversion():
+#     """Reads CSV files, runs all the rules, and converts all of them into RDF."""
+#     return {
+#         'file_dep': ontology_ffs + data_csv + [
+#             'scripts/shared/setup-RDFox.rdfox',
+#             'scripts/shared/init-conversion.rdfox',
+#             'scripts/data-conversion/input.rdfox',
+#             'scripts/data-conversion/load_data.rdfox',
+#             'scripts/data-conversion/map.dlog',
+#             'scripts/data-conversion/master.rdfox',
+#             'scripts/data-conversion/master-pipeline.rdfox',
+#             'scripts/data-conversion/output.rdfox',
+#             'scripts/data-conversion/save_data.rdfox',
+#             'scripts/data-conversion/unit_conversion.dlog',
+#         ],
+#         'targets': probs_original_data,
+#         'actions':
+#         [
+#             f'{rdfox_path} -sandbox-directory {dir_path} sandbox {dir_path} scripts/data-conversion/master'
+#         ],
+#     }

+ 2 - 1
environment.yml

@@ -7,4 +7,5 @@ dependencies:
   - rdflib=5.*
   - pytest=7.*
   - pip:
-      - https://github.com/ukfires/probs-runner/archive/3c114a1567c9829fad7fc7e7a0a30ff362be4a7f.zip
+      - https://github.com/ukfires/probs-runner/archive/87e98e3043b243b88e5f0a0658df2167bc4bbd06.zip
+      # - probs-ontology=2.*

+ 22 - 18
scripts/convert_data.py

@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-"""Convert UNFCCC DI API csv files to PRObs Observations.
+"""Convert data csv files to PRObs Observations.
 
 """
 
@@ -16,11 +16,10 @@ setup_logging_to_console()
 
 
 CODE_DIR = Path(__file__).parent
-SCRIPT_SOURCE_DIR = CODE_DIR / "../vendor/probs-ontology"
-
 
 def parse_arguments():
     parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("type", help="data type of convert")
     parser.add_argument('input_file',
                         type=Path,
                         help='UNFCCC API csv file to convert')
@@ -30,33 +29,38 @@ def parse_arguments():
     return parser.parse_args()
 
 
-@log_to_file("build/log_convert_data_unfccc.txt")
-def convert_data_unfccc(data_csv, output_path):
+@log_to_file("build/log_convert_data.txt")
+def convert_data_prodcom(data_csv, output_path):
     # Process each country's CSV file, using the shared set of loading rules
     # and shared process hierarchy definitions.
-    annex = data_csv.parent.stem  # annexI or non-annexI
-    category_datasource = Datasource.from_files([
-        Path(f"outputs/data_classifications_{annex}.nt.gz")
-    ])
+    #
+    # data_csv is e.g. "PRODCOM2016DATA.csv"
+    #
+    # Assume that there is a file named e.g. "PRODCOM2016DATA_defs.dlog" with
+    # definitions of the prefixes and time period and region to use.
     datasource = Datasource.from_files(
         input_files={
-            "data.csv.gz": data_csv,
+            "data.csv": data_csv,
             "metrics_units.csv": CODE_DIR / "metrics_units.csv",
         },
         load_data_script=[
-            CODE_DIR / "load_data.rdfox",
+            CODE_DIR / "load_data_prodcom.rdfox",
         ],
         rules=[
-            CODE_DIR / f"_def_{annex}.dlog",
-            CODE_DIR / "map_generic.dlog",
-            CODE_DIR / "map_probs.dlog",
+            # This could be done within the script by passing a StringIO object
+            # with the rules in it, generated on the fly from arguments.
+            data_csv.parent / (data_csv.stem + "_defs.dlog"),
+            CODE_DIR / "map_prodcom.dlog",
         ],
     )
-    probs_convert_data([category_datasource, datasource],
-                       output_path,
-                       script_source_dir=SCRIPT_SOURCE_DIR)
+    probs_convert_data([datasource], output_path)
 
 
 if __name__ == "__main__":
     args = parse_arguments()
-    convert_data_unfccc(args.input_file, args.output_file)
+    # TODO: extend this for comtrade etc
+    if args.type == "prodcom":
+        convert_data_prodcom(args.input_file, args.output_file)
+    else:
+        print("Unknown data type to convert: %r" % args.type)
+        sys.exit(1)
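
The __main__ block anticipates further data types. A minimal sketch of a table-driven dispatch that keeps the error handling in one place; convert_data_comtrade is hypothetical and does not exist yet:

    CONVERTERS = {
        "prodcom": convert_data_prodcom,
        # "comtrade": convert_data_comtrade,  # not yet written
    }

    if __name__ == "__main__":
        args = parse_arguments()
        converter = CONVERTERS.get(args.type)
        if converter is None:
            print("Unknown data type to convert: %r" % args.type)
            sys.exit(1)
        converter(args.input_file, args.output_file)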

+ 29 - 0
scripts/load_data_prodcom.rdfox

@@ -0,0 +1,29 @@
+######################################################
+###                      PRODCOM Data              ###
+######################################################
+
+dsource register "PRODCOM_DATA"                              \
+    type    delimitedFile                                       \
+    file    "$(dir.datasource)/data.csv"              \
+    header  true                                                \
+    quote   '"'
+
+tupletable create ufrd:PRODCOM_DATA                          \
+    dataSourceName  "PRODCOM_DATA"                           \
+    "columns"       4                                           \
+    "1"             "{ID}"                                      \
+    "1.datatype"    "string"                                    \
+    "2"             "{PRCCODE}"                                 \
+    "2.datatype"    "string"                                    \
+    "3"             "{PRODQNT}"                                 \
+    "3.datatype"    "xsd:decimal"                               \
+    "3.if-empty"    "absent"                                    \
+    "4"             "{QNTUNIT}"                                 \
+    "4.datatype"    "string"	                                \
+    "4.if-empty"    "default"	                                \
+    "4.default"     "unknown"
+
+    # 1 ID generated for unique observations
+    # 2 The ClassificationCode, related to 1 by `objectDirectlyDefinedBy`
+    # 3 MeasurementValue (of the observation's measurement) - the measurement is implicit here. If blank, the value was withheld (but is not necessarily 0)
+    # 4 Unit of measure
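
For reference, the datasource above expects a four-column CSV with a header row. A hypothetical illustration (values invented), showing an empty PRODQNT that becomes "absent" and an empty QNTUNIT that defaults to "unknown":

    ID,PRCCODE,PRODQNT,QNTUNIT
    obs-2016-0001,10111107,12345.0,kg
    obs-2016-0002,10111109,,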

+ 26 - 0
scripts/map_prodcom.dlog

@@ -0,0 +1,26 @@
+:DirectObservation[?ID] ,
+[?ID, :objectDirectlyDefinedBy, ?PRCCode] ,
+# TODO Generalise the region (not just the UK)
+[?ID, :hasRegion, gnd:2635167] ,
+[?ID, :hasTimePeriod, ?TimePeriod] ,
+[?ID, :hasRole, :SoldProduction] ,
+[?ID, :partOfDataset, ?Dataset] ,
+[?ID, :bound, :ExactBound] ,
+ufu:NG(?ID, ufu:unit, ?UnitID)
+        :- ufrd:PRODCOM_DATA(?IDstring, ?PRCCODEstring, ?PRODQNT, ?QNTUNIT),
+
+        [:CurrentImport, :hasTimePeriod, ?TimePeriod],
+        [:CurrentImport, :partOfDataset, ?Dataset],
+        [:CurrentImport, :useDataPrefix, ?DataPrefix],  # STR(ufpcd2016:)
+        [:CurrentImport, :useObjectPrefix, ?ObjectPrefix],  # STR(ufpc2016:)
+
+        # TODO Fix the prefix with the year
+        BIND(IRI(CONCAT(STR(?DataPrefix), "Observation-", SHA256(?IDstring))) AS ?ID) ,
+        BIND(IRI(CONCAT(STR(?ObjectPrefix), "Object-", SHA256(?PRCCODEstring))) AS ?PRCCode) ,
+        BIND(IRI(CONCAT(STR(:), "Unit-", SHA256(?QNTUNIT))) AS ?UnitID) .
+
+# if ?PRODQNT is not "absent"
+ufu:NG(?ID, ufu:measurement, ?PRODQNT)
+        :- ufrd:PRODCOM_DATA(?IDstring, ?PRCCODEstring, ?PRODQNT, ?QNTUNIT), FILTER(BOUND(?PRODQNT)),
+        [:CurrentImport, :useDataPrefix, ?DataPrefix],  # STR(ufpcd2016:)
+        BIND(IRI(CONCAT(STR(?DataPrefix), "Observation-", SHA256(?IDstring))) AS ?ID) .
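
The BIND expressions mint stable IRIs by hashing row contents. The equivalent construction in Python, for reference; SPARQL's SHA256 returns a lowercase hex digest, and the example prefix expansion is invented:

    import hashlib

    def mint_iri(prefix: str, label: str, key: str) -> str:
        # CONCAT(STR(?prefix), label, SHA256(key)), as in map_prodcom.dlog
        return prefix + label + hashlib.sha256(key.encode("utf-8")).hexdigest()

    # e.g. the observation IRI for a row with ID "obs-2016-0001":
    obs_iri = mint_iri("http://example.org/ufpcd2016#", "Observation-", "obs-2016-0001")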

data/metrics_units.csv → scripts/metrics_units.csv