#!/usr/bin/env python3
"""
Task definitions for doit.
Run the steps to generate and query our datasets using RDFox.
"""

import csv
from pathlib import Path
from os import path

from doit.tools import LongRunning

rdfox_path = "RDFox"  # change this if RDFox is not on your PATH
dir_path = path.dirname(path.realpath(__file__))
print(f"Running {rdfox_path} from {dir_path}")
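
# Typical usage (assuming doit is installed and this file is saved as dodo.py,
# the default task file name that doit looks for):
#   doit list          # show the available tasks
#   doit preprocess    # run a single task
#   doit               # run everything, skipping targets that are up to date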
data_csv = [
    'data/Object_table_for_ontoloy.csv',
    'data/PRC_2017_2016.csv',
    'data/PRD_2016_20200617_185122.csv',
    'data/PRD_2017_20200617_185035.csv',
    'data/PRODCOM2016DATA.csv',
    'data/PRODCOM2017DATA.csv',
    'data/PRODCOM2018DATA.csv',
]

probs_original_data = [
    'data/probs_original_data.nt.gz'
]


def task_preprocess():
    """Converts 'raw' data into CSV files for RDFox."""
    raw_data_path = Path("raw_data")
    raw_data_files = [
        raw_data_path / Path(f).name for f in data_csv
    ]
    # Only depend on the raw files that are actually present
    raw_data_files = [
        f for f in raw_data_files if f.exists()
    ]
    return {
        'file_dep': ['scripts/preprocess.py'] + raw_data_files,
        'targets': data_csv,
        'actions': ['python scripts/preprocess.py'],
    }


# List of (data_type, csv_filename) pairs to be converted
DATA_FILES = [
    ("prodcom", 'data/PRODCOM2016DATA.csv'),
    ("prodcom", 'data/PRODCOM2017DATA.csv'),
    ("prodcom", 'data/PRODCOM2018DATA.csv'),
    ("prodcom_correspondence", 'data/PRC_2017_2016.csv'),
    ("prodcom_list", 'data/PRD_2017_20200617_185035.csv'),
    ("prodcom_list", 'data/PRD_2016_20200617_185122.csv'),
]

# List of (data_type, input_directory, output_directory) triples for bulk conversion
DATA_BULK = [
    ("prodcom_bulk_sold", 'bulk_data/sold_production', 'outputs/sold_production'),
    ("prodcom_bulk_total", 'bulk_data/total_production', 'outputs/total_production'),
]
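
# Each data_type above selects the Datalog mapping scripts/map_<data_type>.dlog
# used by the tasks below; the per-file tasks also use a matching
# scripts/load_data_<data_type>.rdfox load script, while the bulk tasks share
# scripts/load_data_prodcom_bulk.rdfox. They are declared as file dependencies
# so conversions re-run when the scripts change.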


def task_convert_data():
    """Reads CSV files, runs all the rules, and converts all of them into RDF."""
    for data_type, csv_file in DATA_FILES:
        csv_file = Path(csv_file)
        target_file = Path("outputs") / (csv_file.stem + ".nt.gz")
        yield {
            'name': csv_file.stem,
            'file_dep': [
                csv_file,
                f"scripts/load_data_{data_type}.rdfox",
                f"scripts/map_{data_type}.dlog",
            ],
            'targets': [
                target_file,
            ],
            'actions': [
                [
                    "python",
                    "scripts/convert_data.py",
                    data_type,
                    csv_file,
                    target_file,
                ]
            ],
        }
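
# doit names each generated sub-task after the CSV file stem, so a single file
# can be (re)converted on its own, e.g.:
#   doit convert_data:PRODCOM2016DATA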


def task_convert_bulk():
    """Converts each CSV file found in the bulk data directories into RDF."""
    for data_type, csv_dir, output_dir in DATA_BULK:
        bulk_data_path = Path(csv_dir)
        bulk_data_files = bulk_data_path.glob("*.csv")
        for csv_file in bulk_data_files:
            target_file = Path(output_dir) / (csv_file.stem + ".nt.gz")
            yield {
                'name': csv_file.stem,
                'file_dep': [
                    csv_file,
                    "scripts/load_data_prodcom_bulk.rdfox",
                    f"scripts/map_{data_type}.dlog",
                ],
                'targets': [
                    target_file,
                ],
                'actions': [
                    [
                        "python",
                        "scripts/convert_data.py",
                        data_type,
                        csv_file,
                        target_file,
                    ]
                ],
            }