Переглянути джерело

Add RDFox scripts/rules to load PRODCOM bulk data

Only the first 5000 lines of the data file are included in this commit.
Rick Lupton 10 місяців тому
батько
коміт
d5c3f42535

Різницю між файлами не показано, бо вона завелика
+ 5001 - 0
raw_data/ds-056120_linear.csv


+ 0 - 0
raw_data/ds-056120_linear_defs.dlog


+ 1 - 1
scripts/convert_data.py

@@ -53,7 +53,7 @@ def convert_data_prodcom(data_type, data_csv, output_path):
         map_file : CODE_DIR / map_file,
         map_file : CODE_DIR / map_file,
         defs_file : data_csv.parent / defs_file,
         defs_file : data_csv.parent / defs_file,
     }
     }
-    if data_type == "prodcom":
+    if data_type in ("prodcom", "prodcom_new"):
        load_data.append(load_units)
        load_data.append(load_units)
        input_files[units_file] = CODE_DIR / units_file
        input_files[units_file] = CODE_DIR / units_file
     datasource = Datasource.from_files(
     datasource = Datasource.from_files(

+ 29 - 0
scripts/load_data_prodcom_new.rdfox

@@ -0,0 +1,29 @@
+PREFIX ufpc:                <http://w3id.org/probs-lab/data/prodcom/>
+PREFIX ufpcd:               <http://w3id.org/probs-lab/data/prodcom_data/>
+PREFIX gnd:                 <https://sws.geonames.org/>
+
+######################################################
+###                      PRODCOM Data              ###
+######################################################
+
+dsource register "PRODCOM_DATA_NEW"                             \
+    type    delimitedFile                                       \
+    file    "$(dir.datasource)/data.csv"                        \
+    header  true                                                \
+    quote   '"'
+
+tupletable create ufrd:PRODCOM_DATA_NEW                         \
+    dataSourceName  "PRODCOM_DATA_NEW"                          \
+    "columns"       6                                           \
+    "1"             "{decl}"                                    \
+    "1.datatype"    "string"                                    \
+    "2"             "{TIME_PERIOD}"                             \
+    "2.datatype"    "string"                                    \
+    "3"             "{prccode}"                                 \
+    "3.datatype"    "string"                                    \
+    "4"             "{indicators}"                              \
+    "4.datatype"    "string"                                    \
+    "5"             "{OBS_VALUE}"                               \
+    "5.datatype"    "string"     	                            \
+    "6"             "{OBS_FLAG}"                                \
+    "6.datatype"    "string"

+ 53 - 0
scripts/map_prodcom_new.dlog

@@ -0,0 +1,53 @@
+# Basic info from "PRODQNT"
+
+:DirectObservation[?ID] ,
+[?ID, :objectDirectlyDefinedBy, ?Object] ,
+[?ID, :hasRegion, ?Region] ,
+[?ID, :hasTimePeriod, ?TimePeriod] ,
+[?ID, :hasRole, :SoldProduction] ,
+[?ID, :partOfDataset, ufpc:PRODCOM] ,
+[?ID, :bound, :ExactBound]
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "PRODQNT"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, ?Indicators)))) AS ?ID),
+        BIND(IRI(CONCAT(STR(ufpc:), "Object-", ?PRCCODE)) AS ?Object) ,
+        BIND(IRI(CONCAT(STR(gnd:), "TODO-GN-CODE-", ?Decl)) AS ?Region),
+        BIND(IRI(CONCAT(STR(:), "TimePeriod_YearOf", ?Year)) AS ?TimePeriod) .
+
+
+# Additional info from other rows about units and flags
+
+ufu:NG(?ID, ufu:measurementUnit, ?Measurement)
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "PRODQNT"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, ?Indicators)))) AS ?ID) ,
+        :DirectObservation[?ID] ,
+
+        # Don't use the measurement value when there is a "confidential" flag,
+        # this means that a zero value really means "missing"
+        NOT [?ID, :measurementFlag, ":C"],
+
+        BIND(xsd:decimal(?ObsValue) AS ?Measurement).
+
+
+ufu:NG(?ID, ufu:unit, ?UnitID)
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "QNTUNIT"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, "PRODQNT")))) AS ?ID) ,
+        :DirectObservation[?ID] ,
+        BIND(IRI(CONCAT(STR(:), "Unit-", SHA256(?ObsValue))) AS ?UnitID) .
+
+
+[?ID, :measurementFlag, ?ObsValue]
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "PQNTFLAG"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, "PRODQNT")))) AS ?ID) ,
+        :DirectObservation[?ID] .
+
+
+# EU prefix: "http://data.europa.eu/qw1/prodcom2021/"
+