#!/usr/bin/env python3
"""
Task definitions for doit.
Run the steps to generate and query our datasets using RDFox.
"""
from os import path
from pathlib import Path

from doit.tools import LongRunning
# Name of the RDFox executable; change this if RDFox is not in your path.
rdfox_path = "RDFox"

# Directory that contains this file, with symlinks resolved by realpath().
dir_path = path.dirname(path.realpath(__file__))

# Emitted every time this module is loaded.
print(f"Running {rdfox_path} from {dir_path}")
# CSV files declared as the targets of task_preprocess(); their base names
# are also used there to locate the matching files under raw_data/.
# NOTE(review): "ontoloy" looks like a typo for "ontology", but the string is
# a file path and must match the name on disk -- confirm before renaming.
data_csv = [
    'data/Object_table_for_ontoloy.csv',
    'data/PRC_2017_2016.csv',
    'data/PRD_2016_20200617_185122.csv',
    'data/PRD_2017_20200617_185035.csv',
    'data/PRODCOM2016DATA.csv',
    'data/PRODCOM2017DATA.csv',
    'data/PRODCOM2018DATA.csv',
]

# NOTE(review): in the visible part of this file this list is referenced only
# from the commented-out task_data_conversion() -- confirm it is still needed.
probs_original_data = [
    'data/probs_original_data.nt.gz',
]
def task_preprocess():
    """Converts 'raw' data into CSV files for RDFox.

    Returns:
        A doit task dictionary whose single action runs
        ``scripts/preprocess.py``. Its ``file_dep`` is the script itself
        plus every file under ``raw_data/`` that shares a base name with an
        entry of ``data_csv`` and exists when this function is called; its
        ``targets`` are the ``data_csv`` files.
    """
    raw_data_path = Path("raw_data")
    # Only raw files that are present on disk are declared as dependencies;
    # a raw file that is missing is simply left out of file_dep.
    raw_data_files = [
        raw_file
        for raw_file in (raw_data_path / Path(f).name for f in data_csv)
        if raw_file.exists()
    ]
    return {
        'file_dep': ['scripts/preprocess.py'] + raw_data_files,
        'targets': data_csv,
        'actions': ['python scripts/preprocess.py'],
    }
# List of (data_type, csv_filename) pairs to be converted.
# task_convert_data() uses data_type to pick the matching
# scripts/load_data_<data_type>.rdfox and scripts/map_<data_type>.dlog files.
DATA_FILES = [
    ("prodcom", 'data/PRODCOM2016DATA.csv'),
    ("prodcom", 'data/PRODCOM2017DATA.csv'),
    ("prodcom", 'data/PRODCOM2018DATA.csv'),
    ("prodcom_correspondence", 'data/PRC_2017_2016.csv'),
    ("prodcom_list", 'data/PRD_2017_20200617_185035.csv'),
    ("prodcom_list", 'data/PRD_2016_20200617_185122.csv'),
]
def task_convert_data():
    """Reads CSV files, runs all the rules, and converts all of them into RDF.

    Yields:
        One doit sub-task dictionary per ``(data_type, csv_filename)`` entry
        of ``DATA_FILES``. Each sub-task is named after the CSV file's stem,
        depends on the CSV file plus ``scripts/load_data_<data_type>.rdfox``
        and ``scripts/map_<data_type>.dlog``, targets
        ``outputs/<stem>.nt.gz``, and runs ``scripts/convert_data.py`` with
        the data type, the CSV path and the target path as arguments.
    """
    for data_type, csv_filename in DATA_FILES:
        csv_file = Path(csv_filename)
        target_file = Path("outputs") / (csv_file.stem + ".nt.gz")
        yield {
            'name': csv_file.stem,
            'file_dep': [
                csv_file,
                f"scripts/load_data_{data_type}.rdfox",
                f"scripts/map_{data_type}.dlog",
            ],
            'targets': [
                target_file,
            ],
            'actions': [
                # Argument-list form of a command action (no shell string).
                [
                    "python",
                    "scripts/convert_data.py",
                    data_type,
                    csv_file,
                    target_file,
                ],
            ],
        }
# def task_data_conversion():
#     """Reads CSV files, runs all the rules, and converts all of them into RDF."""
#     return {
#         'file_dep': ontology_ffs + data_csv + [
#             'scripts/shared/setup-RDFox.rdfox',
#             'scripts/shared/init-conversion.rdfox',
#             'scripts/data-conversion/input.rdfox',
#             'scripts/data-conversion/load_data.rdfox',
#             'scripts/data-conversion/map.dlog',
#             'scripts/data-conversion/master.rdfox',
#             'scripts/data-conversion/master-pipeline.rdfox',
#             'scripts/data-conversion/output.rdfox',
#             'scripts/data-conversion/save_data.rdfox',
#             'scripts/data-conversion/unit_conversion.dlog',
#         ],
#         'targets': probs_original_data,
#         'actions':
#         [
#             f'{rdfox_path} -sandbox-directory {dir_path} sandbox {dir_path} scripts/data-conversion/master'
#         ],
#     }