# terminology.py
"""
Handles (deferred) loading of terminology data and provides access to it
for odML documents.
"""
  5. import datetime
  6. import os
  7. import sys
  8. import tempfile
  9. import threading
  10. try:
  11. import urllib.request as urllib2
  12. except ImportError:
  13. import urllib2
  14. from hashlib import md5
  15. from .tools.parser_utils import ParserException
  16. from .tools.xmlparser import XMLReader
  17. REPOSITORY_BASE = 'http://portal.g-node.org/odml/terminologies'
  18. REPOSITORY = '/'.join([REPOSITORY_BASE, 'v1.1', 'terminologies.xml'])
  19. CACHE_AGE = datetime.timedelta(days=1)
  20. def cache_load(url):
  21. """
  22. Load the url and store it in a temporary cache directory
  23. subsequent requests for this url will use the cached version
  24. """
  25. filename = '.'.join([md5(url.encode()).hexdigest(), os.path.basename(url)])
  26. cache_dir = os.path.join(tempfile.gettempdir(), "odml.cache")
  27. if not os.path.exists(cache_dir):
  28. try:
  29. os.makedirs(cache_dir)
  30. except OSError: # might happen due to concurrency
  31. if not os.path.exists(cache_dir):
  32. raise
  33. cache_file = os.path.join(cache_dir, filename)
  34. if not os.path.exists(cache_file) \
  35. or datetime.datetime.fromtimestamp(os.path.getmtime(cache_file)) < \
  36. datetime.datetime.now() - CACHE_AGE:
  37. try:
  38. data = urllib2.urlopen(url).read()
  39. if sys.version_info.major > 2:
  40. data = data.decode("utf-8")
  41. except Exception as e:
  42. print("failed loading '%s': %s" % (url, e))
  43. return
  44. fp = open(cache_file, "w")
  45. fp.write(str(data))
  46. fp.close()
  47. return open(cache_file)
  48. class Terminologies(dict):
  49. loading = {}
  50. def load(self, url):
  51. """
  52. Load and cache a terminology-url
  53. Returns the odml-document for the url
  54. """
  55. if url in self:
  56. return self[url]
  57. if url in self.loading:
  58. self.loading[url].join()
  59. self.loading.pop(url, None)
  60. return self.load(url)
  61. return self._load(url)
  62. def _load(self, url):
  63. # TODO also cache the data locally on disk
  64. # if url.startswith("http"): return None
  65. fp = cache_load(url)
  66. if fp is None:
  67. print("did not successfully load '%s'" % url)
  68. return
  69. try:
  70. term = XMLReader(filename=url, ignore_errors=True).from_file(fp)
  71. term.finalize()
  72. except ParserException as e:
  73. print("Failed to load %s due to parser errors" % url)
  74. print(' "%s"' % e)
  75. term = None
  76. self[url] = term
  77. return term
  78. def deferred_load(self, url):
  79. """
  80. Start a thread to load the terminology in background
  81. """
  82. if url in self or url in self.loading:
  83. return
  84. self.loading[url] = threading.Thread(target=self._load, args=(url,))
  85. self.loading[url].start()
  86. terminologies = Terminologies()
  87. load = terminologies.load
  88. deferred_load = terminologies.deferred_load
  89. if __name__ == "__main__":
  90. f = cache_load(REPOSITORY)