Преглед изворни кода

NF: add DatasetTree and its mapper

Christian Mönch пре 3 година
родитељ
комит
6429186ba8

+ 3 - 0
.gitignore

@@ -0,0 +1,3 @@
+
+# Python execution artefacts
+**/__pycache__

+ 62 - 0
model/datasettree.py

@@ -0,0 +1,62 @@
+import enum
+
+from .connector import ConnectedObject
+from .mapper import get_mapper
+from .metadatarootrecord import MetadataRootRecord
+from .treenode import TreeNode
+from .mapper.reference import Reference
+
+
+class NodeType(enum.Enum):
+    """
+    Kinds of nodes in a dataset tree.
+
+    DatasetTree.node_type() in this module only ever returns
+    DIRECTORY or DATASET; INTERNAL is not produced by the
+    code shown here.
+    """
+    # Node without a value that has child nodes
+    DIRECTORY = enum.auto()
+    # Node that carries a value (a metadata root record)
+    DATASET = enum.auto()
+    # NOTE(review): currently unused in this module -- confirm intent
+    INTERNAL = enum.auto()
+
+
+class DatasetTree(ConnectedObject, TreeNode):
+    """
+    Tree whose nodes represent directories and datasets.
+
+    A node that represents a dataset carries a MetadataRootRecord
+    as its value; a node that only groups other nodes carries no
+    value.
+    """
+    def __init__(self,
+                 mapper_family: str,
+                 realm: str):
+
+        super().__init__()
+        self.mapper_family = mapper_family
+        self.realm = realm
+
+    def node_type(self) -> NodeType:
+        """
+        Return NodeType.DATASET if this node carries a value,
+        NodeType.DIRECTORY otherwise. A leaf node without a
+        value is considered invalid and trips the assertion.
+        """
+        if self.value is not None:
+            return NodeType.DATASET
+
+        # A node without a value is only meaningful
+        # if it leads to further nodes.
+        assert not self.is_leaf_node()
+        return NodeType.DIRECTORY
+
+    def add_directory(self, name):
+        """
+        Add a new TreeNode without a value under the given name.
+        """
+        self.add_node(name, TreeNode())
+
+    def add_dataset(self, path, metadata_root_record: MetadataRootRecord):
+        """
+        Attach metadata_root_record to the node at path. If no
+        node exists at path, the node hierarchy leading to it
+        is created; otherwise the value of the existing node
+        is replaced.
+        """
+        dataset_node = self.get_node_at_path(path)
+        if dataset_node is None:
+            self.add_node_hierarchy(path, TreeNode(metadata_root_record))
+        else:
+            dataset_node.value = metadata_root_record
+
+    def get_metadata_root_record(self, path: str):
+        """
+        Return the value stored at path, or None if there is
+        no node at path or the node carries no value.
+        """
+        dataset_node = self.get_node_at_path(path)
+        if dataset_node is None:
+            # Unknown path: report "no record" instead of
+            # failing with an AttributeError on None.
+            return None
+        return dataset_node.value
+
+    def save(self, force_write: bool = False) -> Reference:
+        """
+        Persists the dataset tree. First save the metadata
+        root records of all nodes returned by
+        get_paths_recursive(False). Then persist the
+        dataset-tree itself, with the class mapper.
+        """
+        for _, _, dataset_node in self.get_paths_recursive(False):
+            # Nodes without a record, e.g. nodes created by
+            # add_directory(), have nothing to persist.
+            if dataset_node.value is not None:
+                dataset_node.value.save(force_write)
+
+        return Reference(
+            self.mapper_family,
+            "DatasetTree",
+            get_mapper(self.mapper_family, "DatasetTree")(self.realm).unmap(self))

+ 5 - 2
model/filetree.py

@@ -16,8 +16,11 @@ class FileTree(ConnectedObject, TreeNode):
         self.mapper_family = mapper_family
         self.realm = realm
 
-    def add_directory(self, path):
-        self.add_node(path, TreeNode())
+    def add_directory(self, name):
+        """ Add a TreeNode constructed without arguments under `name` """
+        self.add_node(name, TreeNode())
+
+    def add_file(self, path):
+        """
+        Add a TreeNode constructed without arguments at `path`,
+        using add_node_hierarchy.
+        """
+        self.add_node_hierarchy(path, TreeNode())
 
     def add_metadata(self, path: str, metadata: Optional[Metadata] = None):
         self.add_node_hierarchy(path, TreeNode(value=Connector.from_object(metadata)))

+ 2 - 0
model/mapper/gitmapper/__init__.py

@@ -1,4 +1,5 @@
 
+from .datasettreemapper import DatasetTreeGitMapper
 from .filetreemapper import FileTreeGitMapper
 from .metadatamapper import MetadataGitMapper
 from .metadatarootrecordmapper import MetadataRootRecordGitMapper
@@ -9,6 +10,7 @@ from .versionlistmapper import VersionListGitMapper
 
 
 GIT_MAPPER_FAMILY = {
+    "DatasetTree": DatasetTreeGitMapper,
     "FileTree": FileTreeGitMapper,
     "Metadata": MetadataGitMapper,
     "MetadataRootRecord": MetadataRootRecordGitMapper,

+ 83 - 0
model/mapper/gitmapper/datasettreemapper.py

@@ -0,0 +1,83 @@
+
+from .gittools import git_ls_tree_recursive, git_save_tree
+from .metadatarootrecordmapper import MetadataRootRecordGitMapper
+from ..basemapper import BaseMapper
+from ..reference import Reference
+
+
+DATALAD_ROOT_RECORD_NAME = ".datalad_mrr"
+
+
+class DatasetTreeGitMapper(BaseMapper):
+    """
+    Maps a DatasetTree to and from a git tree object.
+
+    Every tree node is stored as a git tree. The metadata root
+    record of a node is stored as a blob with the name
+    DATALAD_ROOT_RECORD_NAME inside the git tree of that node.
+    """
+
+    def _unmap_metadata_root_record(self, metadata_root_record) -> str:
+        """ Save a metadata root record, return its location """
+        return MetadataRootRecordGitMapper(self.realm).unmap(metadata_root_record)
+
+    def _save_dataset_node(self, node: "TreeNode") -> str:
+        """
+        Save a single node below the root as one git tree and
+        return the location of that tree. The tree contains the
+        metadata root record of the node, if the node has one,
+        and one sub-tree per child node.
+        """
+        from model.metadatarootrecord import MetadataRootRecord
+
+        dir_entries = []
+        if node.is_leaf_node() or node.value is not None:
+
+            # Write a MetadataRootRecord object and add it as directory
+            # entry with the name DATALAD_ROOT_RECORD_NAME. Leaf nodes
+            # are required to carry a record.
+            assert isinstance(node.value, MetadataRootRecord)
+            mrr_location = self._unmap_metadata_root_record(node.value)
+            dir_entries.append(("100644", "blob", mrr_location, DATALAD_ROOT_RECORD_NAME))
+
+        # The child nodes go into the same git tree as the record.
+        # Emitting a second tree entry with the name of this node
+        # would create two entries with identical names in the
+        # parent tree, and map() expects the record of a dataset
+        # at "<dataset path>/<DATALAD_ROOT_RECORD_NAME>".
+        for name, child_node in node.child_nodes.items():
+            dir_entries.append(("040000", "tree", self._save_dataset_node(child_node), name))
+
+        return git_save_tree(self.realm, dir_entries)
+
+    def _save_dataset_tree(self, node: "TreeNode") -> str:
+        """
+        Save all child nodes of node, i.e. of the root of the
+        dataset tree, and return the location of the git tree
+        that holds them. A value of node itself is not written.
+        """
+        dir_entries = [
+            ("040000", "tree", self._save_dataset_node(child_node), name)
+            for name, child_node in node.child_nodes.items()
+        ]
+        return git_save_tree(self.realm, dir_entries)
+
+    def _map_metadata_root_record(self, location: str) -> "MetadataRootRecord":
+        """ Load the metadata root record stored at location """
+        return MetadataRootRecordGitMapper(self.realm).map(
+            Reference("git", "MetadataRootRecord", location)
+        )
+
+    def map(self, ref: Reference) -> "DatasetTree":
+        """
+        Load the DatasetTree that is stored in the git tree
+        at ref.location.
+        """
+        from model.datasettree import DatasetTree
+        from model.treenode import TreeNode
+
+        dataset_tree = DatasetTree("git", self.realm)
+
+        # List all leaf-nodes. Those should only end with the datalad
+        # root record-name. Add the hierarchy except the leaf-node,
+        # read the metadata root record from the leaf-node, and
+        # add it as value to the hierarchy.
+        for line in git_ls_tree_recursive(self.realm, ref.location):
+
+            # An ls-tree line consists of mode, type, object, and path.
+            # Split at most three times, so that whitespace inside
+            # the path does not break the line apart.
+            _, _, location, path = line.rstrip("\r\n").split(maxsplit=3)
+            path_element = path.split("/")
+            assert path_element[-1] == DATALAD_ROOT_RECORD_NAME
+            metadata_root_record = self._map_metadata_root_record(location)
+
+            dataset_path = "/".join(path_element[:-1])
+            dataset_tree.add_node_hierarchy(
+                dataset_path,
+                TreeNode(metadata_root_record)
+            )
+        return dataset_tree
+
+    def unmap(self, obj) -> str:
+        """
+        Save DatasetTree as git tree with ".datalad_mrr"
+        nodes for each MetadataRootRecord.
+        """
+        from model.datasettree import DatasetTree
+
+        assert isinstance(obj, DatasetTree)
+        return self._save_dataset_tree(obj)

+ 6 - 3
model/mapper/gitmapper/filetreemapper.py

@@ -7,8 +7,9 @@ from ..reference import Reference
 
 class FileTreeGitMapper(BaseMapper):
 
-    def _save(self, node: "TreeNode") -> str:
+    def _save_file_tree(self, node: "TreeNode") -> str:
         from model.connector import Connector
+
         dir_entries = []
         for name, child_node in node.child_nodes.items():
             if child_node.is_leaf_node():
@@ -17,13 +18,14 @@ class FileTreeGitMapper(BaseMapper):
                 location = git_save_str(self.realm, child_node.value.reference.to_json())
                 dir_entries.append(("100644", "blob", location, name))
             else:
-                dir_entries.append(("040000", "tree", self._save(child_node), name))
+                dir_entries.append(("040000", "tree", self._save_file_tree(child_node), name))
         return git_save_tree(self.realm, dir_entries)
 
     def map(self, ref: Reference) -> "FileTree":
         from model.connector import Connector
         from model.filetree import FileTree
         from model.treenode import TreeNode
+
         file_tree = FileTree("git", self.realm)
         for line in git_ls_tree_recursive(self.realm, ref.location):
             _, _, location, path = line.split()
@@ -36,5 +38,6 @@ class FileTreeGitMapper(BaseMapper):
     def unmap(self, obj) -> str:
         """ Save FileTree as git file tree """
         from model.filetree import FileTree
+
         assert isinstance(obj, FileTree)
-        return self._save(obj)
+        return self._save_file_tree(obj)

+ 9 - 7
model/mapper/gitmapper/metadatarootrecordmapper.py

@@ -1,4 +1,5 @@
 from typing import Any
+from uuid import UUID
 
 from .gittools import git_load_json, git_save_json
 from ..basemapper import BaseMapper
@@ -18,12 +19,13 @@ class MetadataRootRecordGitMapper(BaseMapper):
         return MetadataRootRecord(
             "git",
             self.realm,
-            json_object["dataset_identifier"],
+            UUID(json_object["dataset_identifier"]),
+            json_object["dataset_version"],
             Connector.from_reference(
-                Reference.from_json(json_object["dataset_level_metadata"])
+                Reference.from_json_obj(json_object["dataset_level_metadata"])
             ),
             Connector.from_reference(
-                Reference.from_json(json_object["file_level_metadata"])
+                Reference.from_json_obj(json_object["file_level_metadata"])
             )
         )
 
@@ -31,15 +33,15 @@ class MetadataRootRecordGitMapper(BaseMapper):
         from model.metadatarootrecord import MetadataRootRecord
         assert isinstance(obj, MetadataRootRecord)
         json_object = {
-            "dataset_identifier": obj.dataset_identifier,
+            "dataset_identifier": str(obj.dataset_identifier),
+            "dataset_version": str(obj.dataset_version),
             "dataset_level_metadata": obj.dataset_level_metadata.save(
                 "git",
                 self.realm
-            ).to_json(),
+            ).to_json_obj(),
             "file_level_metadata": obj.file_tree.save(
                 "git",
                 self.realm
-            ).to_json()
+            ).to_json_obj()
         }
         return git_save_json(self.realm, json_object)
-

+ 14 - 6
model/mapper/reference.py

@@ -20,8 +20,11 @@ class Reference:
             f"class_name='{self.class_name}', "
             f"location={repr(self.location)})")
 
-    def to_json(self):  # TODO: rename to to_json_str
-        return json.dumps({
+    def to_json_str(self):
+        """ Return the result of to_json_obj(), serialized with json.dumps """
+        return json.dumps(self.to_json_obj())
+
+    def to_json_obj(self):
+        return {
             "@": dict(
                 type="Reference",
                 version="1.0",
@@ -29,14 +32,19 @@ class Reference:
             **dict(
                 mapper_family=self.mapper_family,
                 class_name=self.class_name,
-                location=self.location)})
+                location=self.location)
+        }
+
+    @classmethod
+    def from_json_str(cls, json_str: str) -> "Reference":
+        """ Parse json_str with json.loads and pass the result to from_json_obj """
+        return cls.from_json_obj(json.loads(json_str))
 
     @classmethod
-    def from_json(cls, json_str: str) -> "Reference":   # TODO: rename to from_json_str
-        obj = json.loads(json_str)
+    def from_json_obj(cls, obj) -> "Reference":
         assert obj["@"]["type"] == "Reference"
         assert obj["@"]["version"] == "1.0"
         return cls(
             obj["mapper_family"],
             obj["class_name"],
-            obj["location"])
+            obj["location"]
+        )

+ 7 - 2
model/metadatarootrecord.py

@@ -10,12 +10,14 @@ class MetadataRootRecord(ConnectedObject):
                  mapper_family,
                  realm,
                  dataset_identifier: UUID,
+                 dataset_version: str,
                  dataset_level_metadata: Connector,
                  file_tree: Connector):
 
         self.mapper_family = mapper_family
         self.realm = realm
         self.dataset_identifier = dataset_identifier
+        self.dataset_version = dataset_version
         self.dataset_level_metadata = dataset_level_metadata
         self.file_tree = file_tree
 
@@ -26,8 +28,7 @@ class MetadataRootRecord(ConnectedObject):
         it saves the properties of the UUIDSet and the top-half
         of the connectors with the appropriate class mapper.
         """
-        self.file_tree.save(self.mapper_family, self.realm, force_write)
-        self.dataset_level_metadata.save(self.mapper_family, self.realm, force_write)
+        self.save_connected_components(force_write)
         return Reference(
             self.mapper_family,
             "MetadataRootRecord",
@@ -35,6 +36,10 @@ class MetadataRootRecord(ConnectedObject):
                 self.mapper_family,
                 "MetadataRootRecord")(self.realm).unmap(self))
 
+    def save_connected_components(self, force_write: bool = False):
+        """
+        Save the file tree connector and the dataset level
+        metadata connector, using the mapper family and realm
+        of this record. force_write is passed on to both calls.
+        """
+        self.file_tree.save(self.mapper_family, self.realm, force_write)
+        self.dataset_level_metadata.save(self.mapper_family, self.realm, force_write)
+
     def set_file_tree(self, file_tree: ConnectedObject):
         self.file_tree = Connector.from_object(file_tree)