#
# MIT License
#
# Copyright (c) 2019 Keisuke Sehara
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
import sys as _sys
import json as _js
import re as _re
from pathlib import Path as _Path
import collections as _cl

DATASETS_METADATA_FILE = 'datasets_metadata.json'
HOW_TO_USE = f"""
------
This 'helper.py' is written to work at the **root directory of the dataset**.

1. Please make sure that the directory structure of the dataset remains unchanged
   (individual data files may be missing, though).
2. Please place this file inside the root directory (where you can find
   '{DATASETS_METADATA_FILE}').
3. Change the current directory to the root directory of the dataset.
4. From a Python session, run `import helper`.
"""

DATE_PATTERN = _re.compile(r'[0-9]{4}-[0-9]{2}-[0-9]{2}$')

rootdir = _Path(__file__).parent

def __read_root_metadata(rootdir):
    rootdir = _Path(rootdir)
    if not rootdir.is_dir():
        raise RuntimeError(f"not a directory: {rootdir}")
    metadata_file = rootdir / DATASETS_METADATA_FILE
    if not metadata_file.is_file():
        raise RuntimeError(f"not a file: {metadata_file}")
    with open(metadata_file, 'r') as src:
        return _js.load(src, object_hook=_cl.OrderedDict)

# (single leading underscore: this name is also called from methods of the
# classes below, where a '__'-prefixed module name would be mangled)
def _errormsg(msg):
    print(f"***{msg} {HOW_TO_USE}", file=_sys.stderr)

root_metadata = None
try:
    root_metadata = __read_root_metadata(rootdir)
except RuntimeError as e:
    _errormsg(f"failed to read from '{DATASETS_METADATA_FILE}' ({e})")

def describe_datasets(indent=2):
    if root_metadata is None:
        return _errormsg("metadata has not been initialized properly.")
    if isinstance(indent, int):
        indent = ' '*indent
    if len(root_metadata) > 0:
        print("Available datasets")
        for ds_name, ds_desc in root_metadata.items():
            print(f"--------------------\n\ndataset '{ds_name}':")
            desc = ds_desc.get('description', None)
            if desc:
                print(f"{indent*1}(description)")
                print(f"{indent*2}{desc}")
            domains = ds_desc.get("domains", {})
            if len(domains) > 0:
                print(f"{indent*1}(domains)")
                for key, dom_desc in domains.items():
                    suffix = dom_desc.get('suffix', '')
                    if len(suffix.strip()) == 0:
                        suffix = 'no suffix'
                    desc = dom_desc.get('description', '(no description)')
                    print(f"{indent*2}- domain '{key}' ({suffix})")
                    print(f"{indent*3}{desc}")
            else:
                print(f"{indent*1}(no available domains)")
    else:
        print("***no datasets available in this directory!", file=_sys.stderr)

# a pathspec pairs a 'context' (dict of dataset/subject/date/domain values)
# with the corresponding filesystem path
pathspec = _cl.namedtuple('pathspec', ('context', 'path'))

class context:
    _parameters = ('dataset', 'subject', 'date', 'domain', 'file')
    _retrievable = ('datasets', 'subjects', 'dates', 'domains', 'files')

    def __init__(self):
        self.__cached = {}

    def __getattr__(self, name):
        if name in self._parameters:
            if name == 'file':
                raise NameError("use 'files' to retrieve file paths")
            return parameter(self, name)
        elif name in self._retrievable:
            return self.retrieve(name)
        else:
            raise AttributeError(name)

    def get_datasets(self, as_spec=True, recalculate=False):
        if ('datasets' not in self.__cached.keys()) or (recalculate == True):
            dss = []
            for ds_name in root_metadata.keys():
                spec = pathspec(dict(dataset=ds_name), rootdir / ds_name)
                if spec.path.is_dir():
                    if self.__validate__('dataset', ds_name):
                        dss.append(spec)
            self.__cached['datasets'] = dss
            self.__cached['dataset_names'] = [item.context['dataset'] for item in dss]
        if as_spec == True:
            return tuple(self.__cached['datasets'])
        else:
            return tuple(self.__cached['dataset_names'])

    def get_subjects(self, as_spec=True, recalculate=False):
        if ('subjects' not in self.__cached.keys()) or (recalculate == True):
            subs = []
            for ds in self.get_datasets(as_spec=True, recalculate=recalculate):
                for child in ds.path.iterdir():
                    if not child.is_dir():
                        continue
                    if self.__validate__('subject', child.name):
                        cxt = ds.context.copy()
                        cxt['subject'] = child.name
                        spec = pathspec(cxt, child)
                        subs.append(spec)
            self.__cached['subjects'] = subs
            self.__cached['subject_names'] = sorted(set(item.context['subject'] for item in subs))
        if as_spec == True:
            return tuple(self.__cached['subjects'])
        else:
            return tuple(self.__cached['subject_names'])

    def get_dates(self, as_spec=True, recalculate=False):
        if ('dates' not in self.__cached.keys()) or (recalculate == True):
            dates = []
            for sub in self.get_subjects(as_spec=True, recalculate=recalculate):
                for child in sub.path.iterdir():
                    if not child.is_dir():
                        continue
                    is_session = DATE_PATTERN.search(child.name)
                    if not is_session:
                        continue
                    date = is_session.group(0)
                    if self.__validate__('date', date):
                        cxt = sub.context.copy()
                        cxt['date'] = date
                        spec = pathspec(cxt, child)
                        dates.append(spec)
            self.__cached['dates'] = dates
            self.__cached['date_values'] = sorted(set(item.context['date'] for item in dates))
        if as_spec == True:
            return tuple(self.__cached['dates'])
        else:
            return tuple(self.__cached['date_values'])

    def get_domains(self, as_spec=True, recalculate=False):
        if ('domains' not in self.__cached.keys()) or (recalculate == True):
            doms = []
            for date in self.get_dates(as_spec=True, recalculate=recalculate):
                for child in date.path.iterdir():
                    if not child.is_dir():
                        continue
                    dom = child.name
                    if self.__validate__('domain', dom):
                        cxt = date.context.copy()
                        cxt['domain'] = dom
                        spec = pathspec(cxt, child)
                        doms.append(spec)
            self.__cached['domains'] = doms
            self.__cached['domain_names'] = sorted(set(item.context['domain'] for item in doms))
        if as_spec == True:
            return tuple(self.__cached['domains'])
        else:
            return tuple(self.__cached['domain_names'])

    def get_files(self, as_spec=True, recalculate=False):
        if ('files' not in self.__cached.keys()) or (recalculate == True):
            files = []
            for dom in self.get_domains(as_spec=True, recalculate=recalculate):
                for child in dom.path.iterdir():
                    spec = pathspec(dom.context.copy(), child)
                    files.append(spec)
            self.__cached['files'] = files
            self.__cached['file_paths'] = sorted(str(item.path) for item in files)
        if as_spec == True:
            return tuple(self.__cached['files'])
        else:
            return tuple(self.__cached['file_paths'])

    def retrieve(self, param, recalculate=False):
        if root_metadata is None:
            return _errormsg("metadata has not been initialized properly.")
        options = dict(as_spec=False, recalculate=recalculate)
        if param == 'datasets':
            return self.get_datasets(**options)
        elif param == 'subjects':
            return self.get_subjects(**options)
        elif param == 'dates':
            return self.get_dates(**options)
        elif param == 'domains':
            return self.get_domains(**options)
        elif param == 'files':
            return self.get_files(**options)
        else:
            raise ValueError(f"unknown object type for retrieval: {param}")

    def __validate__(self, param, value):
        raise NotImplementedError(f"{self.__class__.__name__}.__validate__")

class _datasets(context):
    """manages file retrieval from datasets."""
    def __init__(self):
        super().__init__()

    def __repr__(self):
        return '<any>'

    def __validate__(self, param, value):
        return True

class parameter:
    """manages contexts."""
    def __init__(self, parent, name):
        self.__parent = parent
        self.__name = name

    def __getattr__(self, name):
        if name == 'name':
            return self.__name
        raise AttributeError(name)

    def __repr__(self):
        parent = repr(self.__parent)
        return f"{parent}.{self.__name}"

    def __cond__(self, op, name):
        if not isinstance(name, str):
            raise ValueError(f"cannot compare to {name.__class__} (expected a string)")
        return conditional(op, self, name)

    def __eq__(self, name):
        return self.__cond__('eq', name)

    def __ne__(self, name):
        return self.__cond__('ne', name)

    def __gt__(self, name):
        return self.__cond__('gt', name)

    def __lt__(self, name):
        return self.__cond__('lt', name)

    def __ge__(self, name):
        return self.__cond__('ge', name)

    def __le__(self, name):
        return self.__cond__('le', name)

class conditional(context):
    """manages conditions in contexts."""
    _opcodes = dict(eq='==',
                    ne='!=',
                    gt='>',
                    ge='>=',
                    lt='<',
                    le='<=')
    _ops = {
        'eq': (lambda _x, _v: _x == _v),
        'ne': (lambda _x, _v: _x != _v),
        'gt': (lambda _x, _v: _x > _v),
        'ge': (lambda _x, _v: _x >= _v),
        'lt': (lambda _x, _v: _x < _v),
        'le': (lambda _x, _v: _x <= _v)
    }

    def __init__(self, op, param, value):
        super().__init__()
        self.__op = op
        self.__param = param
        self.__value = value

    def __getattr__(self, name):
        if name == 'opcode':
            opcode = self._opcodes.get(self.__op, None)
            if opcode:
                return opcode
            else:
                raise ValueError(f'unknown operation: {self.__op}')
        else:
            return super().__getattr__(name)

    def __join__(self, op, other):
        if not isinstance(other, context):
            raise ValueError(f"cannot join {other.__class__} (expected conditional or joined)")
        return joined(op, self, other)

    def __add__(self, other):
        return self.__join__('add', other)

    def __mul__(self, other):
        return self.__join__('mul', other)

    def __repr__(self):
        return f"({self.__param} {self.opcode} {repr(self.__value)})"

    def __validate__(self, param, value):
        if param != self.__param.name:
            return True
        op = self._ops.get(self.__op, None)
        if op:
            return op(value, self.__value)
        else:
            raise ValueError(f'unknown operation: {self.__op}')

class joined(context):
    """joins two contexts."""
    def __init__(self, op, set1, set2):
        super().__init__()
        self.__op = op
        self.__set1 = set1
        self.__set2 = set2

    def __getattr__(self, name):
        if name == 'opcode':
            op = self.__op
            if op == 'mul':
                return '*'
            elif op == 'add':
                return '+'
            else:
                raise ValueError(f'unknown operation: {op}')
        else:
            return super().__getattr__(name)

    def __repr__(self):
        return f"({self.__set1} {self.opcode} {self.__set2})"

    def __validate__(self, param, value):
        cond1 = self.__set1.__validate__(param, value)
        cond2 = self.__set2.__validate__(param, value)
        op = self.__op
        if op == 'mul':
            return (cond1 and cond2)
        elif op == 'add':
            return (cond1 or cond2)
        else:
            raise ValueError(f'unknown operation: {op}')

### start script upon import
if __name__ != '__main__':
    if root_metadata is not None:
        describe_datasets()
    datasets = _datasets()
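
# ----------------------------------------------------------------------
# Usage sketch (illustrative only; not executed on import, and the subject
# name and date below are hypothetical). Conditions built from
# `datasets.<parameter>` can be combined with `*` (AND) and `+` (OR), and
# datasets/subjects/dates/domains/files can then be retrieved from the
# resulting context:
#
#     import helper
#
#     ctx = (helper.datasets.subject == 'subject01') * \
#           (helper.datasets.date >= '2019-01-01')
#     for spec in ctx.get_files():
#         print(spec.context['dataset'], spec.context['domain'], spec.path)
#
#     helper.datasets.domains   # all domain names found in the dataset tree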