Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

test_filetree.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. import subprocess
  2. import tempfile
  3. import time
  4. import unittest
  5. from pathlib import Path
  6. from typing import cast
  7. from unittest.mock import patch
  8. from dataladmetadatamodel.filetree import FileTree
  9. from dataladmetadatamodel.metadata import (
  10. ExtractorConfiguration,
  11. Metadata,
  12. MetadataInstance,
  13. )
  14. from dataladmetadatamodel.mapper.gitmapper.gitblobcache import hash_blob
  15. from dataladmetadatamodel.mapper.gitmapper.objectreference import flush_object_references
  16. from dataladmetadatamodel.metadatapath import MetadataPath
  17. from dataladmetadatamodel.tests.utils import (
  18. assert_file_trees_equal,
  19. create_file_tree_with_metadata,
  20. get_location,
  21. )
  22. default_paths = [
  23. MetadataPath("/a/b/c"),
  24. MetadataPath("a/b/a"),
  25. MetadataPath("b"),
  26. MetadataPath("c/d/e"),
  27. MetadataPath("a/x"),
  28. ]
  29. class TestFileTree(unittest.TestCase):
  30. def test_add_metadata(self):
  31. file_tree = create_file_tree_with_metadata(default_paths, [
  32. Metadata()
  33. for _ in default_paths])
  34. returned_entries = tuple(file_tree.get_paths_recursive())
  35. returned_paths = [entry[0] for entry in returned_entries]
  36. self.assertEqual(sorted(default_paths), sorted(returned_paths))
  37. for returned_path, returned_metadata in [
  38. (entry[0], entry[1])
  39. for entry in returned_entries]:
  40. self.assertEqual(returned_metadata, Metadata())
  41. def test_add_extractor_run(self):
  42. file_tree = create_file_tree_with_metadata(default_paths, [
  43. Metadata()
  44. for _ in default_paths])
  45. author_name = "Karl-Test"
  46. author_email = author_name + "@test.com"
  47. extractor_name = "test_extractor"
  48. extractor_configuration = ExtractorConfiguration(
  49. "extractor_version_1",
  50. {"key1": "value1"})
  51. metadata_content = {"key0": "this is metadata"}
  52. file_tree.add_extractor_run(
  53. default_paths[0],
  54. 1.2,
  55. extractor_name,
  56. author_name,
  57. author_email,
  58. extractor_configuration,
  59. metadata_content)
  60. metadata = file_tree.get_metadata(default_paths[0])
  61. self.assertIsNotNone(metadata)
  62. stored_metadata = metadata.extractor_runs_for_extractor(extractor_name)
  63. self.assertEqual(
  64. stored_metadata.parameter_set,
  65. [extractor_configuration])
  66. self.assertEqual(
  67. stored_metadata._instances[0],
  68. MetadataInstance(
  69. 1.2,
  70. author_name,
  71. author_email,
  72. extractor_configuration,
  73. metadata_content))
  74. class TestReferenceCreation(unittest.TestCase):
  75. def test_object_reference_creation(self):
  76. file_tree = create_file_tree_with_metadata(
  77. default_paths,
  78. [
  79. Metadata()
  80. for _ in default_paths
  81. ]
  82. )
  83. with \
  84. patch("dataladmetadatamodel.mapper.gitmapper."
  85. "metadatamapper.git_save_str") as save_str, \
  86. patch("dataladmetadatamodel.mapper.gitmapper."
  87. "mtreenodemapper.git_save_tree_node") as save_tree_node, \
  88. patch("dataladmetadatamodel.mapper.gitmapper."
  89. "metadatarootrecordmapper.git_save_json") as save_json, \
  90. patch("dataladmetadatamodel.mapper.gitmapper."
  91. "gitblobcache.git_save_file_list") as file_list_save, \
  92. patch("dataladmetadatamodel.mtreeproxy."
  93. "add_tree_reference") as add_tree_ref:
  94. save_str.return_value = get_location(1)
  95. save_tree_node.return_value = get_location(2)
  96. save_json.return_value = get_location(3)
  97. file_list_save.side_effect = lambda r, l: [
  98. hash_blob(open(e, "rb").read())
  99. for e in l
  100. ]
  101. file_tree.write_out("/tmp/t1")
  102. # We expect one call for the dataset-tree itself
  103. # and one call for each file-tree, one of which
  104. # is anchored at each dataset path
  105. add_tree_ref.assert_called_once()
  106. class TestMapping(unittest.TestCase):
  107. def test_adding(self):
  108. # check file tree adding is working
  109. with tempfile.TemporaryDirectory() as metadata_store:
  110. subprocess.run(["git", "init", metadata_store])
  111. file_tree = create_file_tree_with_metadata(
  112. default_paths,
  113. [Metadata() for _ in default_paths])
  114. reference = file_tree.write_out(metadata_store)
  115. file_tree = cast(FileTree, FileTree(
  116. realm=metadata_store,
  117. reference=reference).read_in())
  118. additional_paths = [MetadataPath(f"x/y.{n}") for n in range(10)]
  119. for additional_path in additional_paths:
  120. file_tree.add_metadata(additional_path, Metadata())
  121. reference = file_tree.write_out()
  122. file_tree = FileTree(realm=metadata_store, reference=reference).read_in()
  123. read_paths = [pair[0] for pair in file_tree.get_paths_recursive()]
  124. for path in default_paths + additional_paths:
  125. self.assertIn(path, read_paths)
  126. def test_adding_to_massive_tree(self):
  127. # check file tree adding is working
  128. with tempfile.TemporaryDirectory() as metadata_store:
  129. subprocess.run(["git", "init", metadata_store])
  130. file_tree = FileTree()
  131. start_time = time.time()
  132. for first_part in range(10):
  133. for second_part in range(10):
  134. for third_part in range(10):
  135. metadata_path = MetadataPath(f"{first_part:03}/"
  136. f"{second_part:03}/"
  137. f"{third_part:03}")
  138. file_tree.add_metadata(metadata_path, Metadata())
  139. initialisation_duration = time.time() - start_time
  140. print(f"Initialised: {initialisation_duration:4f}")
  141. start_time = time.time()
  142. reference = file_tree.write_out(metadata_store)
  143. write_out_duration = time.time() - start_time
  144. print(f"Written out: {write_out_duration:4f}")
  145. start_time = time.time()
  146. file_tree = FileTree(realm=metadata_store, reference=reference).read_in()
  147. read_in_duration = time.time() - start_time
  148. print(f"Read in: {read_in_duration:4f}")
  149. start_time = time.time()
  150. file_tree.add_metadata(MetadataPath("5/5/xxx"), Metadata())
  151. add_duration = time.time() - start_time
  152. print(f"Added single entry: {add_duration:4f}")
  153. start_time = time.time()
  154. file_tree.write_out()
  155. write_out_2nd_duration = time.time() - start_time
  156. print(f"Written out single entry: {write_out_2nd_duration:4f}")
  157. def test_shallow_file_tree_mapping(self):
  158. # assert that file trees content is not mapped by default
  159. with tempfile.TemporaryDirectory() as metadata_store:
  160. subprocess.run(["git", "init", metadata_store])
  161. paths = [
  162. MetadataPath("a"),
  163. MetadataPath("b")]
  164. file_tree = FileTree()
  165. for path in paths:
  166. metadata = Metadata()
  167. file_tree.add_metadata(path, metadata)
  168. file_tree.unget_metadata(metadata, metadata_store)
  169. reference = file_tree.write_out(metadata_store)
  170. flush_object_references(Path(metadata_store))
  171. new_file_tree = FileTree(realm=metadata_store, reference=reference).read_in()
  172. self.assertFalse(new_file_tree.mtree.child_nodes["a"].mapped)
  173. class TestDeepCopy(unittest.TestCase):
  174. def test_copy_from_memory(self):
  175. with \
  176. tempfile.TemporaryDirectory() as original_dir, \
  177. tempfile.TemporaryDirectory() as copy_dir:
  178. subprocess.run(["git", "init", original_dir])
  179. subprocess.run(["git", "init", copy_dir])
  180. file_tree = FileTree()
  181. for path in ["a/b/c/d", "a/b/d", "a/x"]:
  182. file_tree.add_metadata(
  183. MetadataPath(path),
  184. Metadata())
  185. file_tree_copy = file_tree.deepcopy(new_destination=copy_dir)
  186. assert_file_trees_equal(self, file_tree, file_tree_copy, True)
  187. def test_copy_from_backend(self):
  188. with \
  189. tempfile.TemporaryDirectory() as original_dir, \
  190. tempfile.TemporaryDirectory() as copy_dir:
  191. subprocess.run(["git", "init", original_dir])
  192. subprocess.run(["git", "init", copy_dir])
  193. paths = [
  194. MetadataPath("a/b/c/d"),
  195. MetadataPath("a/b/d"),
  196. MetadataPath("a/x")]
  197. file_tree = FileTree()
  198. for path in paths:
  199. metadata = Metadata()
  200. file_tree.add_metadata(path, metadata)
  201. file_tree.unget_metadata(metadata, original_dir)
  202. file_tree.write_out(original_dir)
  203. file_tree_copy = file_tree.deepcopy(new_destination=copy_dir)
  204. file_tree_copy.read_in()
  205. assert_file_trees_equal(self, file_tree, file_tree_copy, True)
  206. if __name__ == '__main__':
  207. unittest.main()