metadata.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319
  1. import copy
  2. import json
  3. import time
  4. from typing import (
  5. Dict,
  6. Generator,
  7. Iterable,
  8. List,
  9. Optional,
  10. Tuple
  11. )
  12. from dataladmetadatamodel import (
  13. JSONObject,
  14. check_serialized_version,
  15. version_string
  16. )
  17. from dataladmetadatamodel.mappableobject import MappableObject
  18. from dataladmetadatamodel.mapper.reference import Reference
  19. class ParameterDict(dict):
  20. def __hash__(self):
  21. return hash(tuple(sorted(self.items())))
  22. class ExtractorConfiguration:
  23. """
  24. Holds a single configuration for an extractor.
  25. Ensures that a version number is at least given.
  26. """
  27. def __init__(self,
  28. version: str,
  29. parameter: Dict[str, JSONObject]):
  30. self.version = version
  31. self.parameter = ParameterDict(parameter)
  32. def to_json_obj(self) -> JSONObject:
  33. return {
  34. "@": dict(
  35. type="ExtractorConfiguration",
  36. version=version_string
  37. ),
  38. "version": self.version,
  39. "parameter": self.parameter
  40. }
  41. def to_json_str(self) -> str:
  42. return json.dumps(self.to_json_obj())
  43. def __eq__(self, other):
  44. return (
  45. self.version == other.version
  46. and self.parameter == other.parameter
  47. )
  48. def __hash__(self):
  49. return hash((self.version, self.parameter))
  50. @classmethod
  51. def from_json_obj(cls, obj: JSONObject) -> "ExtractorConfiguration":
  52. assert obj["@"]["type"] == "ExtractorConfiguration"
  53. check_serialized_version(obj)
  54. return cls(
  55. obj["version"],
  56. obj["parameter"]
  57. )
  58. @classmethod
  59. def from_json_str(cls, json_str: str) -> "ExtractorConfiguration":
  60. return cls.from_json_obj(json.loads(json_str))
  61. class MetadataInstance:
  62. """
  63. A single metadata instance. It is associated
  64. with provenance information, i.e. time stamp,
  65. author, author_email, with a configuration, i.e.
  66. parameters, and with a source that points to
  67. the metadata itself.
  68. """
  69. def __init__(self,
  70. time_stamp,
  71. author_name,
  72. author_email,
  73. configuration: ExtractorConfiguration,
  74. metadata_content: JSONObject):
  75. self.time_stamp = time_stamp
  76. self.author_name = author_name
  77. self.author_email = author_email
  78. self.configuration = configuration
  79. self.metadata_content = metadata_content
  80. def to_json_obj(self) -> JSONObject:
  81. return {
  82. "@": dict(
  83. type="MetadataInstance",
  84. version=version_string
  85. ),
  86. "time_stamp": self.time_stamp,
  87. "author": self.author_name,
  88. "author_email": self.author_email,
  89. "configuration": self.configuration.to_json_obj(),
  90. "metadata_content": self.metadata_content
  91. }
  92. def to_json_str(self) -> str:
  93. return json.dumps(self.to_json_obj())
  94. def __eq__(self, other):
  95. return (
  96. self.time_stamp == other.time_stamp
  97. and self.author_name == other.author_name
  98. and self.author_email == other.author_email
  99. and self.configuration == other.configuration
  100. and self.metadata_content == other.metadata_content
  101. )
  102. @classmethod
  103. def from_json_obj(cls, obj: JSONObject) -> "MetadataInstance":
  104. assert obj["@"]["type"] == "MetadataInstance"
  105. check_serialized_version(obj)
  106. return cls(
  107. obj["time_stamp"],
  108. obj["author"],
  109. obj["author_email"],
  110. ExtractorConfiguration.from_json_obj(obj["configuration"]),
  111. obj["metadata_content"]
  112. )
  113. @classmethod
  114. def from_json_str(cls, json_str: str) -> "MetadataInstance":
  115. return cls.from_json_obj(json.loads(json_str))
  116. class MetadataInstanceSet:
  117. """
  118. A set of metadata instances, i.e. extractor
  119. run information records. Each instance is
  120. identified by its configuration, i.e. an
  121. instance of ExtractorConfiguration.
  122. """
  123. def __init__(self,
  124. initial_metadata_instances: Optional[Iterable[MetadataInstance]] = None):
  125. self.parameter_set = list()
  126. self.instances = dict()
  127. for metadata_instance in initial_metadata_instances or []:
  128. self.add_metadata_instance(metadata_instance)
  129. def __iter__(self):
  130. yield from self.instances.values()
  131. def add_metadata_instance(self, metadata_instance: MetadataInstance):
  132. if metadata_instance.configuration not in self.parameter_set:
  133. self.parameter_set.append(metadata_instance.configuration)
  134. instance_key = self.parameter_set.index(metadata_instance.configuration)
  135. self.instances[instance_key] = metadata_instance
  136. def get_instances(self) -> Generator[MetadataInstance, None, None]:
  137. yield from self.instances.values()
  138. def get_configurations(self) -> List[ExtractorConfiguration]:
  139. return self.parameter_set[:]
  140. def get_instance_for_configuration_index(self, index: int):
  141. return self.instances[index]
  142. def get_instance_for_configuration(self, configuration: ExtractorConfiguration):
  143. return self.instances[self.parameter_set.index(configuration)]
  144. def to_json_obj(self) -> JSONObject:
  145. return {
  146. "@": dict(
  147. type="MetadataInstanceSet",
  148. version=version_string
  149. ),
  150. "parameter_set": [
  151. configuration.to_json_obj()
  152. for configuration in self.parameter_set
  153. ],
  154. "instance_set": {
  155. instance_key: instance.to_json_obj()
  156. for instance_key, instance in self.instances.items()
  157. }
  158. }
  159. def to_json_str(self) -> str:
  160. return json.dumps(self.to_json_obj())
  161. @classmethod
  162. def from_json_obj(cls, obj: JSONObject) -> "MetadataInstanceSet":
  163. assert obj["@"]["type"] == "MetadataInstanceSet"
  164. check_serialized_version(obj)
  165. metadata_instance_set = cls()
  166. metadata_instance_set.parameter_set = [
  167. ExtractorConfiguration.from_json_obj(json_obj)
  168. for json_obj in obj["parameter_set"]
  169. ]
  170. metadata_instance_set.instances = {
  171. int(configuration_id): MetadataInstance.from_json_obj(json_obj)
  172. for configuration_id, json_obj in obj["instance_set"].items()
  173. }
  174. return metadata_instance_set
  175. @classmethod
  176. def from_json_str(cls, json_str: str) -> "MetadataInstanceSet":
  177. return cls.from_json_obj(json.loads(json_str))
  178. def __eq__(self, other: "MetadataInstanceSet"):
  179. return sorted(self.parameter_set) == sorted(other.parameter_set) \
  180. and self.instances == other.instances
  181. class Metadata(MappableObject):
  182. """
  183. Holds entries for all metadata of a single object.
  184. Metadata is identified on the first level by its
  185. format-name, i.e. the extractor-name. For each
  186. extractor there is a set of configurations and
  187. associated metadata, i.e. objects that contain
  188. the extractor result, aka the real metadata.
  189. """
  190. def __init__(self,
  191. reference: Optional[Reference] = None):
  192. assert isinstance(reference, (type(None), Reference))
  193. super().__init__(reference)
  194. self.instance_sets: Dict[str, MetadataInstanceSet] = dict()
  195. def __eq__(self, other):
  196. return self.instance_sets == other.instance_sets
  197. @staticmethod
  198. def get_empty_instance(reference: Optional[Reference] = None):
  199. return Metadata(reference)
  200. def purge_impl(self):
  201. self.instance_sets = dict()
  202. def get_modifiable_sub_objects_impl(self) -> Iterable[MappableObject]:
  203. return []
  204. def extractors(self) -> Generator[str, None, None]:
  205. yield from self.instance_sets.keys()
  206. def extractor_runs(self) -> Generator[Tuple[str, MetadataInstanceSet], None, None]:
  207. yield from self.instance_sets.items()
  208. def extractor_runs_for_extractor(self, extractor_name: str) -> MetadataInstanceSet:
  209. return self.instance_sets[extractor_name]
  210. def add_extractor_run(self,
  211. time_stamp: Optional[float],
  212. extractor_name: str,
  213. author_name: str,
  214. author_email: str,
  215. configuration: ExtractorConfiguration,
  216. metadata_content: JSONObject):
  217. self.touch()
  218. instance_set = self.instance_sets.get(
  219. extractor_name,
  220. MetadataInstanceSet())
  221. instance_set.add_metadata_instance(
  222. MetadataInstance(
  223. time_stamp if time_stamp is not None else time.time(),
  224. author_name,
  225. author_email,
  226. configuration,
  227. metadata_content))
  228. self.instance_sets[extractor_name] = instance_set
  229. def to_json(self) -> str:
  230. return json.dumps({
  231. "@": dict(
  232. type="Metadata",
  233. version=version_string
  234. ),
  235. "instance_sets": {
  236. format_name: instance_set.to_json_obj()
  237. for format_name, instance_set in self.instance_sets.items()
  238. }
  239. })
  240. def init_from_json(self, json_str) -> None:
  241. obj = json.loads(json_str)
  242. check_serialized_version(obj)
  243. assert obj["@"]["type"] == "Metadata"
  244. for format_name, instance_set_json_obj in obj["instance_sets"].items():
  245. self.instance_sets[format_name] = \
  246. MetadataInstanceSet.from_json_obj(instance_set_json_obj)
  247. @classmethod
  248. def from_json(cls, json_str: str) -> "Metadata":
  249. metadata = cls(None)
  250. metadata.init_from_json(json_str)
  251. metadata.mapped = True
  252. metadata.set_unsaved()
  253. return metadata
  254. def deepcopy_impl(self,
  255. new_mapper_family: Optional[str] = None,
  256. new_destination: Optional[str] = None,
  257. **kwargs) -> "Metadata":
  258. copied_metadata = Metadata()
  259. for extractor_name, instance_set in self.instance_sets.items():
  260. copied_metadata.instance_sets[extractor_name] = copy.deepcopy(instance_set)
  261. copied_metadata.write_out(new_destination)
  262. copied_metadata.purge()
  263. return copied_metadata