123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129 |
- #!/usr/bin/env python
- # coding=utf-8
- """
- @author: yannansu
- @created at: 23.09.21 16:07
- This module loads the subject activity log `<data_path>/<sub>/<sub>.yaml` and reads the data files it lists.
- Example usage:
- LD = LoadData('test', data_path='pilot_data')
- activ_df = LD.read_activity()
- df = LD.read_data()
- """
- import numpy as np
- import pandas as pd
- import datetime
- import os
- import yaml
- from data_analysis.yml2dict import yml2dict
class LoadData:
    """
    Select and load the recorded sessions of one subject.

    The activity log `<data_path>/<sub>/<sub>.yaml` is filtered with the
    selectors given at construction time (`read_activity`); the trial data
    of every remaining session is then collected into one dataframe
    (`read_data`).
    """

    def __init__(self, sub, data_path=None,
                 sel_cfg=None, sel_par=None,
                 sel_ses=None, rm_ses=None, start_date=None,
                 sel_ses_idx=None):
        """
        Store the subject name and the session selectors.

        :param sub: subject name
        :param data_path: data repository, default: 'data'
        :param sel_cfg: selected config keyword(s): a single string or an
            iterable of strings, matched as regular expressions against
            the `cfg_file` column
        :param sel_par: selected param keyword(s): a single string or an
            iterable of strings, matched as regular expressions against
            the `par_file` column
        :param sel_ses: selected session keyword, matched as a substring
            of the session labels (the index of the activity log)
        :param rm_ses: session label(s) to remove from the activity log
        :param start_date: selected starting date, e.g. '20210000';
            compared against the session labels
        :param sel_ses_idx: selected session index, i.e. the position of
            the session to keep within each `par_file` group
        """
        self.sub = sub
        self.data_path = 'data' if data_path is None else data_path
        self.sel_cfg = sel_cfg
        self.sel_par = sel_par
        self.sel_ses = sel_ses
        self.rm_ses = rm_ses
        self.start_date = start_date
        self.sel_ses_idx = sel_ses_idx

    @staticmethod
    def _keyword_pattern(keywords):
        """
        Build a regex alternation from one keyword or several.

        :param keywords: a single string or an iterable of strings
        :return: the pattern string passed to `str.contains`
        """
        # A bare string must stay whole: '|'.join('abc') would give 'a|b|c'
        # and match nearly every row.
        if isinstance(keywords, str):
            return keywords
        return '|'.join(keywords)

    def read_activity(self):
        """
        Read subject activity log with given selectors.

        :return: a dataframe listing a summary of the selected sessions
        """
        # Load the YAML activity log; after transposing, each row is one session
        log_path = os.path.join(self.data_path, self.sub, self.sub + '.yaml')
        activ_df = pd.DataFrame(yml2dict(log_path)).T

        # Select finished blocks
        activ_df = activ_df[activ_df.status == 'completed']

        # Filter by config / param keywords; na=False turns rows with a
        # missing file name into non-matches instead of an invalid NaN mask
        if self.sel_cfg is not None:
            cfg_pattern = self._keyword_pattern(self.sel_cfg)
            activ_df = activ_df[activ_df.cfg_file.str.contains(cfg_pattern, na=False)]
        if self.sel_par is not None:
            par_pattern = self._keyword_pattern(self.sel_par)
            activ_df = activ_df[activ_df.par_file.str.contains(par_pattern, na=False)]

        # Restrict specific sessions by date or time
        if self.sel_ses is not None:
            activ_df = activ_df.filter(like=self.sel_ses, axis=0)
        if self.rm_ses is not None:
            activ_df = activ_df.drop(self.rm_ses, axis=0)
        if self.start_date is not None:
            activ_df = activ_df[activ_df.index >= self.start_date]
        if self.sel_ses_idx is not None:
            activ_df = activ_df.groupby('par_file').nth(self.sel_ses_idx)

        # NOTE(review): the original indentation was lost; reset_index is
        # assumed to run for every call, not only when sel_ses_idx is set -- confirm.
        activ_df = activ_df.reset_index()
        # An unnamed index comes back as an 'index' column; discard it
        if 'index' in activ_df.columns:
            activ_df = activ_df.drop(columns=['index'])
        return activ_df

    def read_data(self, save_csv=None):
        """
        Read data from selected sessions.

        :param save_csv: '*.csv', save data in csv '<data_path>/<sub>/*.csv' if not None
        :return: a dataframe of trial-based data of all selected sessions
        :raises ValueError: if no session is left after applying the selectors
        """
        # Read activity log
        activ_df = self.read_activity()

        # Separate trials of every data file and save them to dataframes
        df_list = []
        for block_idx, yml in zip(activ_df.block_idx, activ_df.data_file):
            # Keep only the per-trial entries: keys like 'trial<number>'
            trials = {k: v for k, v in yml2dict(yml).items()
                      if k.startswith('trial') and any(c.isdigit() for c in k)}
            yml_data = pd.DataFrame(trials).T
            yml_data['block_index'] = block_idx
            yml_data['sub'] = self.sub
            df_list.append(yml_data.reset_index(drop=True))

        # pd.concat([]) fails with an opaque message; say what actually happened
        if not df_list:
            raise ValueError(
                "No completed session matches the selectors for subject %r" % (self.sub,))

        df = pd.concat(df_list, ignore_index=True)
        if save_csv is not None:
            df.to_csv(os.path.join(self.data_path, self.sub, save_csv))
        return df
- """
- # Test
- LD = LoadData('test', data_path='pilot_data')
- activ_df = LD.read_activity()
- df = LD.read_data()
- """
- # LoadData('s1', data_path='pilot2_data', sel_par='config/pilot2_param.yaml', start_date='20211207').read_data()
|