Prechádzať zdrojové kódy

Add RDFox scripts/rules to load PRODCOM bulk data

Only the first 5000 lines of the data file are included in this commit.
Rick Lupton 10 mesiacov pred
rodič
commit
d5c3f42535

Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 5001 - 0
raw_data/ds-056120_linear.csv


+ 0 - 0
raw_data/ds-056120_linear_defs.dlog


+ 1 - 1
scripts/convert_data.py

@@ -53,7 +53,7 @@ def convert_data_prodcom(data_type, data_csv, output_path):
         map_file : CODE_DIR / map_file,
         defs_file : data_csv.parent / defs_file,
     }
-    if data_type == "prodcom":
+    if data_type in ("prodcom", "prodcom_new"):
        load_data.append(load_units)
        input_files[units_file] = CODE_DIR / units_file
     datasource = Datasource.from_files(

+ 29 - 0
scripts/load_data_prodcom_new.rdfox

@@ -0,0 +1,29 @@
+PREFIX ufpc:                <http://w3id.org/probs-lab/data/prodcom/>
+PREFIX ufpcd:               <http://w3id.org/probs-lab/data/prodcom_data/>
+PREFIX gnd:                 <https://sws.geonames.org/>
+
+######################################################
+###                      PRODCOM Data              ###
+######################################################
+
+# Register the bulk PRODCOM CSV as a delimited-file data source named
+# "PRODCOM_DATA_NEW". The file is read from "$(dir.datasource)/data.csv", its
+# first row is treated as a header, and '"' is the quote character.
+dsource register "PRODCOM_DATA_NEW"                             \
+    type    delimitedFile                                       \
+    file    "$(dir.datasource)/data.csv"                        \
+    header  true                                                \
+    quote   '"'
+
+# Expose the "PRODCOM_DATA_NEW" data source as the six-column tuple table
+# ufrd:PRODCOM_DATA_NEW. Columns, in order: decl, TIME_PERIOD, prccode,
+# indicators, OBS_VALUE, OBS_FLAG. Every column is loaded as a string; any
+# numeric conversion is left to the mapping rules that read this table.
+# NOTE(review): the "ufrd:" prefix is not declared among the PREFIX lines at
+# the top of this script -- presumably it is declared by the caller; confirm.
+tupletable create ufrd:PRODCOM_DATA_NEW                         \
+    dataSourceName  "PRODCOM_DATA_NEW"                          \
+    "columns"       6                                           \
+    "1"             "{decl}"                                    \
+    "1.datatype"    "string"                                    \
+    "2"             "{TIME_PERIOD}"                             \
+    "2.datatype"    "string"                                    \
+    "3"             "{prccode}"                                 \
+    "3.datatype"    "string"                                    \
+    "4"             "{indicators}"                              \
+    "4.datatype"    "string"                                    \
+    "5"             "{OBS_VALUE}"                               \
+    "5.datatype"    "string"     	                            \
+    "6"             "{OBS_FLAG}"                                \
+    "6.datatype"    "string"

+ 53 - 0
scripts/map_prodcom_new.dlog

@@ -0,0 +1,53 @@
+# Basic info from "PRODQNT" rows.
+#
+# Each row of ufrd:PRODCOM_DATA_NEW whose indicator is "PRODQNT" yields one
+# :DirectObservation. Its IRI is
+#   ufpcd:<Year>/Observation-<SHA256(Decl + Year + PRCCODE + Indicators)>
+# The observation is linked to:
+#   - the object    ufpc:Object-<PRCCODE>
+#   - the region    gnd:TODO-GN-CODE-<Decl>
+#   - the period    :TimePeriod_YearOf<Year>
+#   - the fixed values :SoldProduction (role), ufpc:PRODCOM (dataset) and
+#     :ExactBound (bound).
+# ?ObsValue and ?ObsFlag are not used here; the rules further down attach the
+# measurement, unit and flag to the same ?ID.
+# NOTE(review): the region IRI still contains the "TODO-GN-CODE-" placeholder,
+# so it presumably does not yet resolve to a real GeoNames identifier.
+
+:DirectObservation[?ID] ,
+[?ID, :objectDirectlyDefinedBy, ?Object] ,
+[?ID, :hasRegion, ?Region] ,
+[?ID, :hasTimePeriod, ?TimePeriod] ,
+[?ID, :hasRole, :SoldProduction] ,
+[?ID, :partOfDataset, ufpc:PRODCOM] ,
+[?ID, :bound, :ExactBound]
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "PRODQNT"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, ?Indicators)))) AS ?ID),
+        BIND(IRI(CONCAT(STR(ufpc:), "Object-", ?PRCCODE)) AS ?Object) ,
+        BIND(IRI(CONCAT(STR(gnd:), "TODO-GN-CODE-", ?Decl)) AS ?Region),
+        BIND(IRI(CONCAT(STR(:), "TimePeriod_YearOf", ?Year)) AS ?TimePeriod) .
+
+
+# Additional info from other rows about units and flags.
+#
+# Measurement value: for each "PRODQNT" row, recompute the observation IRI with
+# the same expression as the :DirectObservation rule, require that the
+# observation exists, and store xsd:decimal(?ObsValue) via the ufu:NG atom.
+# NOTE(review): the predicate is named ufu:measurementUnit although the value
+# bound here is the numeric measurement -- confirm this is the intended name.
+# NOTE(review): the negation below matches the literal ":C" (with a leading
+# colon), while the :measurementFlag rule stores the raw ?ObsValue of the
+# "PQNTFLAG" row -- confirm the CSV really encodes the flag as ":C".
+
+ufu:NG(?ID, ufu:measurementUnit, ?Measurement)
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "PRODQNT"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, ?Indicators)))) AS ?ID) ,
+        :DirectObservation[?ID] ,
+
+        # Skip the measurement value when the "confidential" flag is set:
+        # in that case a zero value really means "missing".
+        NOT [?ID, :measurementFlag, ":C"],
+
+        BIND(xsd:decimal(?ObsValue) AS ?Measurement).
+
+
+# Unit: each "QNTUNIT" row carries the unit in ?ObsValue. The observation IRI
+# is rebuilt with the literal "PRODQNT" in place of ?Indicators so that it
+# equals the ID derived from the "PRODQNT" row with the same Decl, Year and
+# PRCCODE. The unit is attached only if that observation exists. The unit IRI
+# is :Unit-<SHA256(?ObsValue)>.
+ufu:NG(?ID, ufu:unit, ?UnitID)
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "QNTUNIT"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, "PRODQNT")))) AS ?ID) ,
+        :DirectObservation[?ID] ,
+        BIND(IRI(CONCAT(STR(:), "Unit-", SHA256(?ObsValue))) AS ?UnitID) .
+
+
+# Flag: each "PQNTFLAG" row carries the flag in ?ObsValue (the ?ObsFlag column
+# is not used). The raw string is attached as :measurementFlag to the
+# observation whose IRI is rebuilt with the literal "PRODQNT", provided that
+# observation exists.
+[?ID, :measurementFlag, ?ObsValue]
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "PQNTFLAG"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, "PRODQNT")))) AS ?ID) ,
+        :DirectObservation[?ID] .
+
+
+# EU prefix: "http://data.europa.eu/qw1/prodcom2021/"
+