Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

lsdir.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. """DataLad GUI ls-dir helper"""
  2. __docformat__ = 'restructuredtext'
  3. import stat
  4. import logging
  5. from pathlib import Path
  6. from datalad.interface.base import Interface
  7. from datalad.interface.base import build_doc
  8. from datalad.support.param import Parameter
  9. from datalad.support.exceptions import CapturedException
  10. from datalad.interface.utils import eval_results
  11. from datalad.interface.results import get_status_dict
  12. from datalad.runner import (
  13. GitRunner,
  14. StdOutCapture,
  15. CommandError,
  16. )
  17. from datalad.dataset.gitrepo import GitRepo
  18. lgr = logging.getLogger('datalad.ext.gooey.lsdir')
  19. @build_doc
  20. class GooeyLsDir(Interface):
  21. """Internal helper for datalad-gooey"""
  22. _params_ = dict(
  23. path=Parameter(
  24. args=("path", ),
  25. doc="""""",
  26. )
  27. )
  28. @staticmethod
  29. @eval_results
  30. def __call__(path: Path or str):
  31. # This needs to be keep simple and as fast as anyhow possible.
  32. # anything that is not absolutely crucial to have should have
  33. # an inexpensive switch to turn it off (or be off by default.
  34. # This command is an internal helper of gooey, it has no ambition
  35. # to generalize, although the components it uses internally
  36. # might have applicability in a broader scope.
  37. # - this takes a single path as a mandatory argument
  38. # - this path must be a directory, if it exists
  39. # - this directory can be inside or outside of a dataset
  40. # - a result is returned for each item inside that is considered
  41. # "relevant" for gooey (ie. no content inside `.git` or `.git` itself
  42. # etc.
  43. path = Path(path)
  44. if not path.is_absolute():
  45. # make absolute
  46. # this is not a datasetmethod, we do not have to take the
  47. # "relative-to-dsroot" case into account
  48. path = Path.cwd() / path
  49. # for each item we report
  50. # - type (symlink, file, directory, dataset)
  51. # - state (untracked, clean, ...)
  52. for r in _list(path):
  53. r.update(action='gooey-lsdir')
  54. if 'status' not in r:
  55. r.update(status='ok')
  56. if r.get('type') == 'directory':
  57. # a directory could still be an untracked dataset,
  58. # run the cheapest possible standard test to tell them apart.
  59. try:
  60. is_repo = GitRepo.is_valid(r['path'])
  61. except PermissionError as e:
  62. ce = CapturedException(e)
  63. # could be read-protected
  64. r['status'] = 'error'
  65. r['exception'] = ce
  66. r['message'] = 'Permissions denied'
  67. yield r
  68. continue
  69. r['type'] = 'dataset' if is_repo else 'directory'
  70. yield r
  71. def _list(path: Path):
  72. try:
  73. yield from _lsfiles(path)
  74. # TODO dedicated exception?
  75. except CommandError as e:
  76. # not in a dataset
  77. ce = CapturedException(e)
  78. lgr.debug(
  79. 'git-ls-files failed, falling back on manual inspection: %s',
  80. ce)
  81. # TODO apply standard filtering of results
  82. yield from _iterdir(path)
  83. except PermissionError as e:
  84. yield get_status_dict(
  85. path=str(path),
  86. status='error',
  87. exception=CapturedException(e),
  88. )
  89. def _lsfiles(path: Path):
  90. from datalad.support.gitrepo import GitRepo
  91. import re
  92. # just to be able use _get_content_info_line_helper
  93. # without a GitRepo instance
  94. class _Dummy:
  95. def __init__(self, path):
  96. self.pathobj = path
  97. # stolen from GitRepo.get_content_info()
  98. props_re = re.compile(
  99. r'(?P<type>[0-9]+) (?P<sha>.*) (.*)\t(?P<fname>.*)$')
  100. # we use a plain runner to avoid the overhead of a GitRepo instance
  101. runner = GitRunner()
  102. ret = runner.run(
  103. ['git', 'ls-files',
  104. # we want them all
  105. '--cached', '--deleted', '--modified', '--others',
  106. # we want the type info
  107. '--stage',
  108. # given that we only want the immediate directory content
  109. # there is little point in exploring the content of subdir.
  110. # however, we still want to be able to list directories
  111. # that are wholly untracked, but still have content
  112. '--directory',
  113. # don't show the stuff that a user didn't want to see
  114. '--exclude-standard',
  115. # to satisfy the needs of _get_content_info_line_helper()
  116. '-z'],
  117. protocol=StdOutCapture,
  118. # run in the directory we want info on
  119. # and do not pass further path constraints
  120. # work around https://github.com/datalad/datalad/issues/7040
  121. cwd=str(path),
  122. )
  123. info = dict()
  124. GitRepo._get_content_info_line_helper(
  125. _Dummy(path),
  126. None,
  127. info,
  128. ret['stdout'].split('\0'),
  129. props_re,
  130. )
  131. subdirs_reported = set()
  132. entirely_untracked_dir = False
  133. for p, props in info.items():
  134. rpath_parts = p.relative_to(path).parts
  135. if len(rpath_parts) > 1:
  136. # subdirectory content: regret the time it took to process
  137. # it (ls-files cannot be prevented to list it)
  138. if rpath_parts[0] in subdirs_reported:
  139. # we had the pleasure already, nothing else todo
  140. continue
  141. yield dict(
  142. path=path / rpath_parts[0],
  143. type='directory',
  144. )
  145. # and ignore now
  146. subdirs_reported.add(rpath_parts[0])
  147. continue
  148. # we should never get a report on the parent dir we are listing.
  149. # this only happens, when it is itself entirely untracked.
  150. # setting this flag catches this condition (there will be no other
  151. # result), and enable mitigation
  152. entirely_untracked_dir = p == path
  153. if not entirely_untracked_dir:
  154. yield dict(
  155. path=str(p),
  156. type=props['type'],
  157. )
  158. if entirely_untracked_dir:
  159. # fall back on _iterdir() for wholly untracked directories
  160. yield from _iterdir(path)
  161. def _iterdir(path: Path):
  162. # anything reported from here will be state=untracked
  163. # figure out the type, as far as we need it
  164. # right now we do not detect a subdir to be a dataset
  165. # vs a directory, only directories
  166. for c in path.iterdir():
  167. if c.name == '.git':
  168. # we do not report on this special name
  169. continue
  170. # c could disappear while this is running. Example: temp files managed
  171. # by other processes.
  172. try:
  173. cmode = c.lstat().st_mode
  174. except FileNotFoundError as e:
  175. CapturedException(e)
  176. continue
  177. if stat.S_ISLNK(cmode):
  178. ctype = 'symlink'
  179. elif stat.S_ISDIR(cmode):
  180. ctype = 'directory'
  181. else:
  182. # the rest is a file
  183. # there could be fifos and sockets, etc.
  184. # but we do not recognize them here
  185. ctype = 'file'
  186. props = dict(
  187. path=str(c),
  188. type=ctype,
  189. )
  190. if type != 'directory':
  191. props['state'] = 'untracked'
  192. yield props