123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- import hashlib
- from pathlib import Path
- from tempfile import TemporaryDirectory
- from typing import (
- Dict,
- List,
- Union,
- )
- from .gitbackend.subprocess import git_save_file_list
def hash_blob(blob: Union[str, bytes]) -> str:
    """
    Compute the hex SHA-1 digest of ``blob`` framed as a git blob object.

    The hashed data is the header ``blob <length>`` followed by a NUL
    byte and then the content itself. A ``str`` is encoded with the
    default ``str.encode()`` codec (UTF-8) before hashing.

    :param blob: content to hash, either text or raw bytes
    :return: hexadecimal SHA-1 digest of the framed content
    """
    raw = blob.encode() if isinstance(blob, str) else blob
    content = bytearray(raw)

    digest = hashlib.sha1()
    # Header and content are fed separately; the resulting digest is the
    # same as hashing their concatenation.
    digest.update(b"blob %d\x00" % len(content))
    digest.update(content)
    return digest.hexdigest()
class GitBlobCache:
    """
    Collect blobs in memory and write them to git in batches.

    Blobs are queued with :meth:`cache_blob`, which returns the expected
    git blob hash right away. Once ``maxsize`` blobs are pending (or when
    :meth:`flush` is called explicitly) the queued blobs are written to
    files in a private temporary directory and handed to
    ``git_save_file_list`` in a single call.

    Attributes:

    - ``realm``: passed unchanged to ``git_save_file_list``
      (presumably identifies the target repository -- confirm against
      the git backend).
    - ``maxsize``: number of pending blobs that triggers an automatic flush.
    - ``cached_objects``: pending ``(blob, expected_hash)`` pairs.
    - ``flushed_objects``: maps every flushed blob to the hash reported
      by git.
    """

    def __init__(self,
                 realm: str,
                 maxsize: int = 2000):
        """
        Create an empty cache together with its temporary directory.

        :param realm: realm this cache writes to; must be a ``str``
        :param maxsize: number of pending blobs that triggers a flush
        :raises AssertionError: if ``realm`` is not a ``str``
        """
        self.realm = realm
        self.maxsize = maxsize
        # Each entry is a (blob, expected_hash) pair, see cache_blob().
        self.cached_objects: List[tuple] = list()
        self.flushed_objects: Dict[Union[str, bytes], str] = dict()
        self.temporary_directory = TemporaryDirectory()
        self.temp_dir_path = Path(self.temporary_directory.name)

        # Assert after member initialisation, so in case of an
        # exception the destructor does still work
        assert isinstance(realm, str)

    def __del__(self):
        """
        Remove the temporary directory and complain about unflushed blobs.

        The attributes are looked up defensively because ``__del__`` also
        runs for instances whose ``__init__`` did not complete.

        :raises RuntimeError: if blobs are still pending. Python does not
            propagate exceptions raised in ``__del__``; it only reports
            them, so this acts as a loud warning.
        """
        pending = getattr(self, "cached_objects", None)
        temporary_directory = getattr(self, "temporary_directory", None)

        # Clean up first so the directory is removed even when the
        # unflushed-cache error below is raised.
        if temporary_directory is not None:
            temporary_directory.cleanup()
        if pending:
            raise RuntimeError("deleting a non-flushed JSON object cache")

    def cache_blob(self,
                   realm: str,
                   blob: Union[str, bytes]) -> str:
        """
        Queue ``blob`` for writing and return its expected git hash.

        If the cache is already full, it is flushed before the new blob
        is queued.

        :param realm: must equal the realm of this cache instance
        :param blob: content to store
        :return: the git blob hash computed locally by ``hash_blob``
        :raises AssertionError: if ``realm`` differs from ``self.realm``
        """
        assert realm == self.realm, \
            "realm of cached object and realm of cache instance differ"

        # ">=" instead of "==": keep flushing even if the number of
        # pending blobs ever exceeds maxsize (e.g. maxsize == 0).
        if len(self.cached_objects) >= self.maxsize:
            self.flush()

        expected_hash = hash_blob(blob)
        self.cached_objects.append((blob, expected_hash))
        return expected_hash

    def flush(self) -> None:
        """
        Write all cached objects to a git repository and
        associate the objects with their hash in
        self.flushed_objects.

        Writing is done by creating files in a
        temporary directory and calling git_save_file_list
        with the list of files. Every hash reported back is
        compared with the hash computed when the blob was cached.

        :return: None
        :raises AssertionError: if the number of returned hashes differs
            from the number of files, or a returned hash differs from
            the expected one
        """
        if not self.cached_objects:
            # Nothing pending, no need to invoke git at all.
            return

        file_list = []
        for index, (blob, _) in enumerate(self.cached_objects):
            temp_file_path = self.temp_dir_path / str(index)
            # Write raw bytes: text mode would reject bytes blobs and
            # could alter str blobs (locale encoding, newline
            # translation), which would change the hash. str.encode()
            # matches the encoding used by hash_blob().
            data = blob.encode() if isinstance(blob, str) else bytes(blob)
            temp_file_path.write_bytes(data)
            file_list.append(str(temp_file_path))

        hash_values = git_save_file_list(self.realm, file_list)
        assert len(hash_values) == len(file_list), \
            f"hash value list length ({len(hash_values)}) and file list length " \
            f"({len(file_list)}) differ.\n{hash_values}\n{file_list}"

        hash_dict = {}
        for (blob, expected_hash), hash_value in zip(self.cached_objects,
                                                     hash_values):
            assert hash_value == expected_hash, \
                f"hash reported by git ({hash_value}) and expected hash " \
                f"({expected_hash}) differ"
            hash_dict[blob] = hash_value

        self.flushed_objects.update(hash_dict)
        self.cached_objects = []
|