gitblobcache.py 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. import hashlib
  2. from pathlib import Path
  3. from tempfile import TemporaryDirectory
  4. from typing import (
  5. Dict,
  6. List,
  7. Union,
  8. )
  9. from .gitbackend.subprocess import git_save_file_list
  10. def hash_blob(blob: Union[str, bytes]) -> str:
  11. if isinstance(blob, str):
  12. blob = bytearray(blob.encode())
  13. else:
  14. blob = bytearray(blob)
  15. object_to_hash = f"blob {len(blob)}".encode() + b"\x00" + blob
  16. return hashlib.sha1(object_to_hash).hexdigest()
  17. class GitBlobCache:
  18. def __init__(self,
  19. realm: str,
  20. maxsize: int = 2000):
  21. self.realm = realm
  22. self.maxsize = maxsize
  23. self.cached_objects: List[Union[str, bytes]] = list()
  24. self.flushed_objects: Dict[Union[str, bytes], str] = dict()
  25. self.temporary_directory = TemporaryDirectory()
  26. self.temp_dir_path = Path(self.temporary_directory.name)
  27. # Assert after member initialisation, so in case of an
  28. # exception the destructor does still work
  29. assert isinstance(realm, str)
  30. def __del__(self):
  31. if len(self.cached_objects) > 0:
  32. raise RuntimeError("deleting a non-flushed JSON object cache")
  33. self.temporary_directory.cleanup()
  34. def cache_blob(self,
  35. realm: str,
  36. blob: Union[str, bytes]):
  37. assert realm == self.realm, \
  38. "realm of cached object and realm of cache instance differ"
  39. if len(self.cached_objects) == self.maxsize:
  40. self.flush()
  41. expected_hash = hash_blob(blob)
  42. self.cached_objects.append((blob, expected_hash))
  43. return expected_hash
  44. def flush(self):
  45. """
  46. Write all cached objects to a git repository,
  47. associate the objects with their hash in
  48. self.cached_objects.
  49. Writing is done by creating files in a
  50. temporary directory and calling git hash-object
  51. with the list of files.
  52. :return: None
  53. """
  54. def check_hash(hash: str, expected_hash: str):
  55. assert hash == expected_hash
  56. file_list = []
  57. for index, (blob, expected_hash) in enumerate(self.cached_objects):
  58. temp_file_path = self.temp_dir_path / str(index)
  59. with temp_file_path.open("tw") as f:
  60. f.write(blob)
  61. file_list.append(str(temp_file_path))
  62. hash_values = git_save_file_list(self.realm, file_list)
  63. assert len(hash_values) == len(file_list), \
  64. f"hash value list length ({len(hash_values)}) and file list length " \
  65. f"({len(file_list)}) differ.\n{hash_values}\n{file_list}"
  66. hash_dict = {
  67. blob: hash_values[index]
  68. for index, (blob, expected_hash) in enumerate(self.cached_objects)
  69. if check_hash(hash_values[index], expected_hash)
  70. }
  71. self.flushed_objects.update(hash_dict)
  72. self.cached_objects = []