Browse Source

Add RDFox scripts/rules to load PRODCOM bulk data

Only the first 5000 lines of the data file are included in this commit.
Rick Lupton 9 months ago
parent
commit
d5c3f42535

File diff suppressed because it is too large
+ 5001 - 0
raw_data/ds-056120_linear.csv


+ 0 - 0
raw_data/ds-056120_linear_defs.dlog


+ 1 - 1
scripts/convert_data.py

@@ -53,7 +53,7 @@ def convert_data_prodcom(data_type, data_csv, output_path):
         map_file : CODE_DIR / map_file,
         defs_file : data_csv.parent / defs_file,
     }
-    if data_type == "prodcom":
+    if data_type in ("prodcom", "prodcom_new"):
        load_data.append(load_units)
        input_files[units_file] = CODE_DIR / units_file
     datasource = Datasource.from_files(

+ 29 - 0
scripts/load_data_prodcom_new.rdfox

@@ -0,0 +1,29 @@
+# Loads the PRODCOM bulk-data CSV into RDFox: registers the file as a data
+# source here, then exposes its columns as the tuple table
+# ufrd:PRODCOM_DATA_NEW that the mapping rules read from.
+# NOTE(review): the `ufrd:` prefix is used below but is not declared in this
+# script -- presumably it is declared by a script run earlier; confirm.
+PREFIX ufpc:                <http://w3id.org/probs-lab/data/prodcom/>
+PREFIX ufpcd:               <http://w3id.org/probs-lab/data/prodcom_data/>
+PREFIX gnd:                 <https://sws.geonames.org/>
+
+######################################################
+###                      PRODCOM Data              ###
+######################################################
+
+# Data source: the delimited file "data.csv" inside $(dir.datasource). The
+# file has a header row, and fields are quoted with double quotes.
+dsource register "PRODCOM_DATA_NEW"                             \
+    type    delimitedFile                                       \
+    file    "$(dir.datasource)/data.csv"                        \
+    header  true                                                \
+    quote   '"'
+
+# Tuple table over the "PRODCOM_DATA_NEW" data source with six columns, each
+# referring to a CSV field by its header name. Every column is typed as a
+# string, including OBS_VALUE, so numeric conversion is left to the rules
+# that consume this table. The column order here is the argument order used
+# by ufrd:PRODCOM_DATA_NEW(...) atoms:
+#   decl, TIME_PERIOD, prccode, indicators, OBS_VALUE, OBS_FLAG
+tupletable create ufrd:PRODCOM_DATA_NEW                         \
+    dataSourceName  "PRODCOM_DATA_NEW"                          \
+    "columns"       6                                           \
+    "1"             "{decl}"                                    \
+    "1.datatype"    "string"                                    \
+    "2"             "{TIME_PERIOD}"                             \
+    "2.datatype"    "string"                                    \
+    "3"             "{prccode}"                                 \
+    "3.datatype"    "string"                                    \
+    "4"             "{indicators}"                              \
+    "4.datatype"    "string"                                    \
+    "5"             "{OBS_VALUE}"                               \
+    "5.datatype"    "string"     	                            \
+    "6"             "{OBS_FLAG}"                                \
+    "6.datatype"    "string"

+ 53 - 0
scripts/map_prodcom_new.dlog

@@ -0,0 +1,53 @@
+# Basic info from "PRODQNT" rows.
+#
+# For every row of ufrd:PRODCOM_DATA_NEW whose indicator is "PRODQNT", derive
+# a :DirectObservation together with its object, region, time period, role
+# (:SoldProduction), dataset (ufpc:PRODCOM) and bound (:ExactBound).
+#
+# The observation IRI is  <ufpcd:>{Year}/Observation-{hash}  where {hash} is
+# the SHA256 of Decl, Year, PRCCODE and the indicator concatenated without
+# separators. The other rules in this file rebuild the same IRI using the
+# literal "PRODQNT", so any change to this construction must be mirrored
+# there.
+#
+# NOTE(review): the region IRI is built as "TODO-GN-CODE-" + Decl under the
+# gnd: prefix -- this looks like a placeholder rather than a real GeoNames
+# identifier; confirm before relying on :hasRegion.
+
+:DirectObservation[?ID] ,
+[?ID, :objectDirectlyDefinedBy, ?Object] ,
+[?ID, :hasRegion, ?Region] ,
+[?ID, :hasTimePeriod, ?TimePeriod] ,
+[?ID, :hasRole, :SoldProduction] ,
+[?ID, :partOfDataset, ufpc:PRODCOM] ,
+[?ID, :bound, :ExactBound]
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "PRODQNT"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, ?Indicators)))) AS ?ID),
+        BIND(IRI(CONCAT(STR(ufpc:), "Object-", ?PRCCODE)) AS ?Object) ,
+        BIND(IRI(CONCAT(STR(gnd:), "TODO-GN-CODE-", ?Decl)) AS ?Region),
+        BIND(IRI(CONCAT(STR(:), "TimePeriod_YearOf", ?Year)) AS ?TimePeriod) .
+
+
+# Additional info from other rows about units and flags
+
+# Quantity value: for each "PRODQNT" row whose observation exists, cast
+# ?ObsValue to xsd:decimal and assert it as
+# ufu:NG(?ID, ufu:measurementUnit, ?Measurement).
+# NOTE(review): the predicate is named "measurementUnit" although the value
+# is the numeric measurement -- presumably "measurement expressed in the
+# source unit", to be paired with the ufu:unit fact; confirm.
+ufu:NG(?ID, ufu:measurementUnit, ?Measurement)
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "PRODQNT"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, ?Indicators)))) AS ?ID) ,
+        :DirectObservation[?ID] ,

+        # Skip the measurement when the observation carries the confidential
+        # flag ":C": in that case a zero value really means "missing".
+        NOT [?ID, :measurementFlag, ":C"],

+        BIND(xsd:decimal(?ObsValue) AS ?Measurement).
+
+
+# Unit: for each "QNTUNIT" row, rebuild the IRI of the "PRODQNT" observation
+# with the same Decl, Year and PRCCODE (note the literal "PRODQNT" in the
+# hash input) and, if that observation exists, assert its unit. The unit IRI
+# is "Unit-" followed by the SHA256 of the unit string held in ?ObsValue,
+# under the default (:) prefix.
+ufu:NG(?ID, ufu:unit, ?UnitID)
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "QNTUNIT"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, "PRODQNT")))) AS ?ID) ,
+        :DirectObservation[?ID] ,
+        BIND(IRI(CONCAT(STR(:), "Unit-", SHA256(?ObsValue))) AS ?UnitID) .
+
+
+# Quantity flag: for each "PQNTFLAG" row, rebuild the IRI of the "PRODQNT"
+# observation with the same Decl, Year and PRCCODE and, if that observation
+# exists, record the flag on it. The flag text is taken from ?ObsValue (the
+# OBS_VALUE column of the PQNTFLAG row), not from ?ObsFlag.
+[?ID, :measurementFlag, ?ObsValue]
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "PQNTFLAG"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, "PRODQNT")))) AS ?ID) ,
+        :DirectObservation[?ID] .
+
+
+# EU prefix: "http://data.europa.eu/qw1/prodcom2021/"
+