mrrcreator.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
import logging
import time
from typing import Dict, Optional, Tuple
from uuid import UUID

from dataladmetadatamodel.connector import Connector
from dataladmetadatamodel.metadata import ExtractorConfiguration, Metadata
from dataladmetadatamodel.metadatarootrecord import MetadataRootRecord

from tools.metadata_creator.filetreecreator import create_file_tree
from tools.metadata_creator.utils import get_dataset_id, get_dataset_version, \
    read_datasets
  10. mdc_logger = logging.getLogger("metadata_creator")
  11. def create_metadata_root_record(mapper_family,
  12. realm,
  13. dataset_id: UUID,
  14. dataset_version: str,
  15. dataset_path: str,
  16. relative_path: str,
  17. parameter_set_count: int
  18. ) -> Optional[MetadataRootRecord]:
  19. file_tree = create_file_tree(
  20. mapper_family,
  21. realm,
  22. dataset_path,
  23. parameter_set_count
  24. )
  25. metadata = Metadata()
  26. for count in range(parameter_set_count):
  27. parameters = {
  28. "ds_parameter_0": f"value_0.{count}",
  29. "ds_parameter_1": f"value_1.{count}",
  30. }
  31. metadata.add_extractor_run(
  32. time.time(),
  33. "dataset-core-extractor",
  34. "datasetcreator.py",
  35. "support@datalad.org",
  36. ExtractorConfiguration("1.2.3", parameters),
  37. {
  38. "info": f"dataset-level test metadata for parameter set #{count}",
  39. "path": relative_path
  40. }
  41. )
  42. mrr = MetadataRootRecord(
  43. mapper_family,
  44. realm,
  45. dataset_id,
  46. dataset_version,
  47. Connector.from_object(metadata),
  48. Connector.from_object(file_tree)
  49. )
  50. mrr.save()
  51. mrr.dataset_level_metadata.purge()
  52. mrr.file_tree.purge()
  53. return mrr
  54. def get_dataset_id_version(path: str) -> Tuple[Optional[UUID], Optional[str]]:
  55. dataset_id = get_dataset_id(path)
  56. if dataset_id is None:
  57. mdc_logger.error(f"cannot determine id of dataset at {path}")
  58. dataset_version = get_dataset_version(path)
  59. if dataset_version is None:
  60. mdc_logger.error(f"cannot determine version of dataset at {path}")
  61. return dataset_id, dataset_version
  62. def create_mrrs_from_dataset(mapper: str,
  63. realm: str,
  64. root_path: str,
  65. parameter_set_count: int
  66. ) -> Dict[Tuple[UUID, str, str], MetadataRootRecord]:
  67. result = dict()
  68. for relative_path, entry in read_datasets(root_path):
  69. dataset_path = entry.path
  70. dataset_id, dataset_version = get_dataset_id_version(dataset_path)
  71. if dataset_id is None or dataset_version is None:
  72. mdc_logger.info(f"ignoring dataset at {dataset_version} because version or id could not be read")
  73. continue
  74. mrr = create_metadata_root_record(
  75. mapper,
  76. realm,
  77. dataset_id,
  78. dataset_version,
  79. dataset_path,
  80. relative_path,
  81. parameter_set_count)
  82. result[(dataset_id, dataset_version, relative_path)] = mrr
  83. return result