metadata.py 10 KB


  1. import copy
  2. import json
  3. import time
  4. from typing import (
  5. Dict,
  6. Generator,
  7. Iterable,
  8. List,
  9. Optional,
  10. Tuple
  11. )
  12. from dataladmetadatamodel import (
  13. JSONObject,
  14. check_serialized_version,
  15. version_string
  16. )
  17. from dataladmetadatamodel.mappableobject import MappableObject
  18. from dataladmetadatamodel.mapper.reference import Reference
  class ParameterDict(dict):
      """
      A dictionary of extractor parameters that can be hashed.

      The hash is derived from the sorted items, so two instances
      that compare equal also hash equal, independent of insertion
      order. Lists and dictionaries nested inside the values are
      converted to tuples before hashing; hashing the raw items
      would raise TypeError for such values.
      """

      @staticmethod
      def _freeze(value):
          """Return a hashable stand-in for a possibly nested value."""
          if isinstance(value, dict):
              # Keys are unique within a dict, so ordering the pairs is
              # decided by the keys alone; frozen values are never
              # compared with "<".
              return tuple(sorted(
                  (key, ParameterDict._freeze(item))
                  for key, item in value.items()))
          if isinstance(value, (list, tuple)):
              return tuple(ParameterDict._freeze(item) for item in value)
          return value

      def __hash__(self):
          # For a flat dict with hashable values this equals
          # hash(tuple(sorted(self.items()))).
          return hash(self._freeze(self))
  class ExtractorConfiguration:
      """
      Holds a single configuration for an extractor.
      Ensures that a version number is at least given.

      Two configurations are equal when their version and their
      parameters are equal; the hash is built from the same two
      attributes.
      """

      def __init__(self,
                   version: str,
                   parameter: Dict[str, JSONObject]):
          self.version = version
          # Copy into a ParameterDict so the parameters can take
          # part in __hash__.
          self.parameter = ParameterDict(parameter)

      def to_json_obj(self) -> JSONObject:
          """Return this configuration as a JSON-serializable dict."""
          return {
              "@": dict(
                  type="ExtractorConfiguration",
                  version=version_string
              ),
              "version": self.version,
              "parameter": self.parameter
          }

      def to_json_str(self) -> str:
          """Return the JSON text of to_json_obj()."""
          return json.dumps(self.to_json_obj())

      def __eq__(self, other):
          # Let Python fall back to its default handling for foreign
          # operands instead of failing on a missing attribute.
          if not isinstance(other, ExtractorConfiguration):
              return NotImplemented
          return (
              self.version == other.version
              and self.parameter == other.parameter
          )

      def __hash__(self):
          return hash((self.version, self.parameter))

      @classmethod
      def from_json_obj(cls, obj: JSONObject) -> "ExtractorConfiguration":
          """
          Create a configuration from the structure that
          to_json_obj() emits. The type tag is asserted and the
          serialized version is passed to check_serialized_version().
          """
          assert obj["@"]["type"] == "ExtractorConfiguration"
          check_serialized_version(obj)
          return cls(
              obj["version"],
              obj["parameter"]
          )

      @classmethod
      def from_json_str(cls, json_str: str) -> "ExtractorConfiguration":
          """Create a configuration from JSON text."""
          return cls.from_json_obj(json.loads(json_str))
  class MetadataInstance:
      """
      A single metadata instance. It is associated
      with provenance information, i.e. time stamp,
      author, author_email, with a configuration, i.e.
      parameters, and with a source that points to
      the metadata itself.

      Instances compare equal when all five attributes are
      equal. No __hash__ is defined next to __eq__, so
      instances are not hashable.
      """

      def __init__(self,
                   time_stamp,
                   author_name,
                   author_email,
                   configuration: ExtractorConfiguration,
                   metadata_content: JSONObject):
          self.time_stamp = time_stamp
          self.author_name = author_name
          self.author_email = author_email
          self.configuration = configuration
          self.metadata_content = metadata_content

      def to_json_obj(self) -> JSONObject:
          """
          Return this instance as a JSON-serializable dict.
          Note that the author name is stored under the
          key "author".
          """
          return {
              "@": dict(
                  type="MetadataInstance",
                  version=version_string
              ),
              "time_stamp": self.time_stamp,
              "author": self.author_name,
              "author_email": self.author_email,
              "configuration": self.configuration.to_json_obj(),
              "metadata_content": self.metadata_content
          }

      def to_json_str(self) -> str:
          """Return the JSON text of to_json_obj()."""
          return json.dumps(self.to_json_obj())

      def __eq__(self, other):
          # Let Python fall back to its default handling for foreign
          # operands instead of failing on a missing attribute.
          if not isinstance(other, MetadataInstance):
              return NotImplemented
          return (
              self.time_stamp == other.time_stamp
              and self.author_name == other.author_name
              and self.author_email == other.author_email
              and self.configuration == other.configuration
              and self.metadata_content == other.metadata_content
          )

      @classmethod
      def from_json_obj(cls, obj: JSONObject) -> "MetadataInstance":
          """
          Create an instance from the structure that
          to_json_obj() emits. The type tag is asserted and the
          serialized version is passed to check_serialized_version().
          """
          assert obj["@"]["type"] == "MetadataInstance"
          check_serialized_version(obj)
          return cls(
              obj["time_stamp"],
              obj["author"],
              obj["author_email"],
              ExtractorConfiguration.from_json_obj(obj["configuration"]),
              obj["metadata_content"]
          )

      @classmethod
      def from_json_str(cls, json_str: str) -> "MetadataInstance":
          """Create an instance from JSON text."""
          return cls.from_json_obj(json.loads(json_str))
  class MetadataInstanceSet:
      """
      A set of metadata instances, i.e. extractor
      run information records. Each instance is
      identified by its configuration, i.e. an
      instance of ExtractorConfiguration.

      Configurations are kept in the list "parameter_set";
      the position of a configuration in that list is the
      key of the associated instance.
      """

      def __init__(self,
                   initial_metadata_instances: Optional[Iterable[MetadataInstance]] = None):
          self.parameter_set = list()
          self._instances = dict()
          for metadata_instance in initial_metadata_instances or []:
              self.add_metadata_instance(metadata_instance)

      def __iter__(self):
          yield from self._instances.values()

      def add_metadata_instance(self, metadata_instance: MetadataInstance):
          """
          Add an instance to the set. An instance that was stored
          earlier for an equal configuration is replaced.
          """
          configuration = metadata_instance.configuration
          if configuration not in self.parameter_set:
              self.parameter_set.append(configuration)
          instance_key = self.parameter_set.index(configuration)
          self._instances[instance_key] = metadata_instance

      @property
      def instances(self) -> Dict:
          # This is the internal mapping itself, not a copy.
          return self._instances

      @property
      def configurations(self) -> List[ExtractorConfiguration]:
          # A shallow copy, so callers cannot reorder the internal list.
          return self.parameter_set[:]

      def get_instance_for_configuration_index(self, index: int):
          """Return the instance stored under the given configuration index."""
          return self._instances[index]

      def get_instance_for_configuration(self, configuration: ExtractorConfiguration):
          """
          Return the instance stored for the given configuration.
          Raises ValueError if the configuration is not in the set.
          """
          return self._instances[self.parameter_set.index(configuration)]

      def to_json_obj(self) -> JSONObject:
          """Return this set as a JSON-serializable dict."""
          return {
              "@": dict(
                  type="MetadataInstanceSet",
                  version=version_string
              ),
              "parameter_set": [
                  configuration.to_json_obj()
                  for configuration in self.parameter_set
              ],
              "instance_set": {
                  instance_key: instance.to_json_obj()
                  for instance_key, instance in self._instances.items()
              }
          }

      def to_json_str(self) -> str:
          """Return the JSON text of to_json_obj()."""
          return json.dumps(self.to_json_obj())

      @classmethod
      def from_json_obj(cls, obj: JSONObject) -> "MetadataInstanceSet":
          """
          Create a set from the structure that to_json_obj() emits.
          The type tag is asserted and the serialized version is
          passed to check_serialized_version().
          """
          assert obj["@"]["type"] == "MetadataInstanceSet"
          check_serialized_version(obj)
          metadata_instance_set = cls()
          metadata_instance_set.parameter_set = [
              ExtractorConfiguration.from_json_obj(json_obj)
              for json_obj in obj["parameter_set"]
          ]
          # JSON object keys are strings, convert them back to the
          # integer indices that are used as instance keys.
          metadata_instance_set._instances = {
              int(configuration_id): MetadataInstance.from_json_obj(json_obj)
              for configuration_id, json_obj in obj["instance_set"].items()
          }
          return metadata_instance_set

      @classmethod
      def from_json_str(cls, json_str: str) -> "MetadataInstanceSet":
          """Create a set from JSON text."""
          return cls.from_json_obj(json.loads(json_str))

      def __eq__(self, other: "MetadataInstanceSet"):
          if not isinstance(other, MetadataInstanceSet):
              return NotImplemented
          # ExtractorConfiguration defines no ordering, so the
          # configurations cannot be sorted. Compare both lists
          # independent of order, using equality only.
          unmatched = list(other.parameter_set)
          for configuration in self.parameter_set:
              try:
                  unmatched.remove(configuration)
              except ValueError:
                  return False
          return not unmatched and self._instances == other._instances
  class Metadata(MappableObject):
      """
      Holds entries for all metadata of a single object.
      Metadata is identified on the first level by its
      format-name, i.e. the extractor-name. For each
      extractor there is a set of configurations and
      associated metadata, i.e. objects that contain
      the extractor result, aka the real metadata.
      """

      def __init__(self,
                   realm: Optional[str] = None,
                   reference: Optional[Reference] = None):
          assert isinstance(realm, (type(None), str))
          assert isinstance(reference, (type(None), Reference))
          super().__init__(realm, reference)
          # Maps an extractor name to the runs recorded for it.
          self.instance_sets: Dict[str, MetadataInstanceSet] = dict()

      def __eq__(self, other):
          # Let Python fall back to its default handling for foreign
          # operands instead of failing on a missing attribute.
          if not isinstance(other, Metadata):
              return NotImplemented
          return self.instance_sets == other.instance_sets

      @staticmethod
      def get_empty_instance(realm: Optional[str] = None,
                             reference: Optional[Reference] = None):
          """Return a new Metadata object without any extractor runs."""
          return Metadata(realm, reference)

      def purge_impl(self):
          # Drop all extractor runs held by this object.
          self.instance_sets = dict()

      def modifiable_sub_objects_impl(self) -> Iterable[MappableObject]:
          return []

      @property
      def extractors(self) -> Generator[str, None, None]:
          """Yield the names of all extractors that have runs recorded."""
          yield from self.instance_sets.keys()

      @property
      def extractor_runs(self) -> Generator[Tuple[str, MetadataInstanceSet], None, None]:
          """Yield (extractor name, instance set) pairs."""
          yield from self.instance_sets.items()

      def extractor_runs_for_extractor(self, extractor_name: str) -> MetadataInstanceSet:
          """
          Return the instance set of the given extractor.
          Raises KeyError if no run was recorded for it.
          """
          return self.instance_sets[extractor_name]

      def add_extractor_run(self,
                            time_stamp: Optional[float],
                            extractor_name: str,
                            author_name: str,
                            author_email: str,
                            configuration: ExtractorConfiguration,
                            metadata_content: JSONObject):
          """
          Record a run of the given extractor. If time_stamp is
          None, the current time is used.
          """
          self.touch()
          instance_set = self.instance_sets.get(extractor_name)
          if instance_set is None:
              # First run for this extractor. The new set is stored
              # only after the instance was added to it.
              instance_set = MetadataInstanceSet()
          instance_set.add_metadata_instance(
              MetadataInstance(
                  time_stamp if time_stamp is not None else time.time(),
                  author_name,
                  author_email,
                  configuration,
                  metadata_content))
          self.instance_sets[extractor_name] = instance_set

      def to_json(self) -> str:
          """Return the JSON text that represents all instance sets."""
          return json.dumps({
              "@": dict(
                  type="Metadata",
                  version=version_string
              ),
              "instance_sets": {
                  format_name: instance_set.to_json_obj()
                  for format_name, instance_set in self.instance_sets.items()
              }
          })

      def init_from_json(self, json_str) -> None:
          """
          Add the instance sets found in the given JSON text to
          this object. Sets that are stored under the same name
          are replaced, all others are kept.
          """
          obj = json.loads(json_str)
          check_serialized_version(obj)
          assert obj["@"]["type"] == "Metadata"
          for format_name, instance_set_json_obj in obj["instance_sets"].items():
              self.instance_sets[format_name] = \
                  MetadataInstanceSet.from_json_obj(instance_set_json_obj)

      @classmethod
      def from_json(cls, json_str: str) -> "Metadata":
          """
          Create a Metadata object from JSON text. The object is
          flagged as mapped and as unsaved.
          """
          metadata = cls(None)
          metadata.init_from_json(json_str)
          metadata.mapped = True
          metadata.set_unsaved()
          return metadata

      def deepcopy_impl(self,
                        new_mapper_family: Optional[str] = None,
                        new_destination: Optional[str] = None,
                        **kwargs) -> "Metadata":
          """
          Return a new Metadata object that holds deep copies of
          all instance sets. The copy is written out to
          new_destination and purged before it is returned.
          new_mapper_family and further keyword arguments are
          not used here.
          """
          copied_metadata = Metadata()
          for extractor_name, instance_set in self.instance_sets.items():
              copied_metadata.instance_sets[extractor_name] = copy.deepcopy(instance_set)
          copied_metadata.write_out(new_destination)
          copied_metadata.purge()
          return copied_metadata