Преглед изворни кода

Add RDFox scripts/rules to load PRODCOM bulk data

Only the first 5000 lines of the data file are included in this commit.
Rick Lupton пре 10 месеци
родитељ
комит
d5c3f42535

Разлика између датотека није приказана због своје велике величине
+ 5001 - 0
raw_data/ds-056120_linear.csv


+ 0 - 0
raw_data/ds-056120_linear_defs.dlog


+ 1 - 1
scripts/convert_data.py

@@ -53,7 +53,7 @@ def convert_data_prodcom(data_type, data_csv, output_path):
         map_file : CODE_DIR / map_file,
         defs_file : data_csv.parent / defs_file,
     }
-    if data_type == "prodcom":
+    if data_type in ("prodcom", "prodcom_new"):
        load_data.append(load_units)
        input_files[units_file] = CODE_DIR / units_file
     datasource = Datasource.from_files(

+ 29 - 0
scripts/load_data_prodcom_new.rdfox

@@ -0,0 +1,29 @@
+PREFIX ufpc:                <http://w3id.org/probs-lab/data/prodcom/>
+PREFIX ufpcd:               <http://w3id.org/probs-lab/data/prodcom_data/>
+PREFIX gnd:                 <https://sws.geonames.org/>
+
+######################################################
+###                      PRODCOM Data              ###
+######################################################
+
+# Register the delimited-file data source holding the PRODCOM bulk data.
+# The data is read from "data.csv" inside $(dir.datasource); the first row is
+# declared to be a header row, and the double quote is the quoting character.
+dsource register "PRODCOM_DATA_NEW"                             \
+    type    delimitedFile                                       \
+    file    "$(dir.datasource)/data.csv"                        \
+    header  true                                                \
+    quote   '"'
+
+# Expose the "PRODCOM_DATA_NEW" data source as a six-column tuple table.
+# Each column takes its value from the named field of the data source and is
+# kept as a plain string; numeric conversion is left to the mapping rules.
+#   1: decl   2: TIME_PERIOD   3: prccode
+#   4: indicators   5: OBS_VALUE   6: OBS_FLAG
+# NOTE(review): the "ufrd:" prefix is not declared among the PREFIX lines at
+# the top of this script; presumably it is declared by a script that runs
+# before this one -- confirm.
+tupletable create ufrd:PRODCOM_DATA_NEW                         \
+    dataSourceName  "PRODCOM_DATA_NEW"                          \
+    "columns"       6                                           \
+    "1"             "{decl}"                                    \
+    "1.datatype"    "string"                                    \
+    "2"             "{TIME_PERIOD}"                             \
+    "2.datatype"    "string"                                    \
+    "3"             "{prccode}"                                 \
+    "3.datatype"    "string"                                    \
+    "4"             "{indicators}"                              \
+    "4.datatype"    "string"                                    \
+    "5"             "{OBS_VALUE}"                               \
+    "5.datatype"    "string"                                    \
+    "6"             "{OBS_FLAG}"                                \
+    "6.datatype"    "string"

+ 53 - 0
scripts/map_prodcom_new.dlog

@@ -0,0 +1,53 @@
+# Basic info from "PRODQNT"
+#
+# For every row of ufrd:PRODCOM_DATA_NEW whose indicator is "PRODQNT", derive
+# one :DirectObservation together with its object, region, time period, role
+# (:SoldProduction), dataset (ufpc:PRODCOM) and bound (:ExactBound).
+#
+# The observation IRI is "<ufpcd:><Year>/Observation-<hash>", where <hash> is
+# the SHA256 of Decl, Year, PRCCODE and the indicator concatenated without
+# separators. The other rules in this file rebuild the same IRI, so the
+# construction must be kept identical everywhere.
+
+:DirectObservation[?ID] ,
+[?ID, :objectDirectlyDefinedBy, ?Object] ,
+[?ID, :hasRegion, ?Region] ,
+[?ID, :hasTimePeriod, ?TimePeriod] ,
+[?ID, :hasRole, :SoldProduction] ,
+[?ID, :partOfDataset, ufpc:PRODCOM] ,
+[?ID, :bound, :ExactBound]
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "PRODQNT"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, ?Indicators)))) AS ?ID),
+        BIND(IRI(CONCAT(STR(ufpc:), "Object-", ?PRCCODE)) AS ?Object) ,
+        # NOTE(review): the region IRI carries a literal "TODO-GN-CODE-" marker
+        # followed by the raw Decl value; this looks like a placeholder rather
+        # than a resolved GeoNames identifier -- confirm.
+        BIND(IRI(CONCAT(STR(gnd:), "TODO-GN-CODE-", ?Decl)) AS ?Region),
+        BIND(IRI(CONCAT(STR(:), "TimePeriod_YearOf", ?Year)) AS ?TimePeriod) .
+
+
+# Additional info from other rows about units and flags
+
+# Attach the numeric value of each "PRODQNT" row to its observation.
+# The ID is rebuilt exactly as in the rule that creates the observation, and
+# the string ?ObsValue is converted to xsd:decimal before it is stored under
+# ufu:measurementUnit in ufu:NG.
+# NOTE(review): ufu:NG is not defined in this file; presumably it is a tuple
+# table / named graph set up elsewhere -- confirm.
+ufu:NG(?ID, ufu:measurementUnit, ?Measurement)
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "PRODQNT"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, ?Indicators)))) AS ?ID) ,
+        :DirectObservation[?ID] ,
+
+        # Skip the value when the observation carries the "confidential" flag
+        # (the string ":C"): in that case a zero value really means "missing".
+        # The flag is matched as a plain string literal, the same form in which
+        # the :measurementFlag rule in this file stores it.
+        NOT [?ID, :measurementFlag, ":C"],
+
+        BIND(xsd:decimal(?ObsValue) AS ?Measurement).
+
+
+# Attach the unit from each "QNTUNIT" row to the matching observation.
+# The hash is built with the literal "PRODQNT" in place of this row's own
+# indicator, so ?ID is the IRI of the observation derived from the "PRODQNT"
+# row with the same Decl, Year and PRCCODE; the rule only fires if that
+# observation exists. The unit IRI is ":Unit-" followed by the SHA256 of the
+# unit string found in ?ObsValue.
+ufu:NG(?ID, ufu:unit, ?UnitID)
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "QNTUNIT"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, "PRODQNT")))) AS ?ID) ,
+        :DirectObservation[?ID] ,
+        BIND(IRI(CONCAT(STR(:), "Unit-", SHA256(?ObsValue))) AS ?UnitID) .
+
+
+# Attach the flag from each "PQNTFLAG" row to the matching observation.
+# As for units, the hash uses the literal "PRODQNT" so that ?ID refers to the
+# observation derived from the corresponding "PRODQNT" row, which must exist.
+# The flag stored is the row's ?ObsValue string.
+# NOTE(review): the ?ObsFlag column is bound but not used by this rule;
+# presumably the flag text lives in OBS_VALUE for "PQNTFLAG" rows -- confirm.
+[?ID, :measurementFlag, ?ObsValue]
+        :-
+        ufrd:PRODCOM_DATA_NEW(?Decl, ?Year, ?PRCCODE, ?Indicators, ?ObsValue, ?ObsFlag),
+        FILTER(?Indicators = "PQNTFLAG"),
+        BIND(IRI(CONCAT(STR(ufpcd:), ?Year, "/Observation-", SHA256(CONCAT(?Decl, ?Year, ?PRCCODE, "PRODQNT")))) AS ?ID) ,
+        :DirectObservation[?ID] .
+
+
+# EU prefix: "http://data.europa.eu/qw1/prodcom2021/"
+