# terminology.py
"""
Handles (deferred) loading of terminology data and provides access to it
for odML documents.
"""
  5. import os
  6. import tempfile
  7. import datetime
  8. import odml.tools.xmlparser
  9. from hashlib import md5
  10. try:
  11. import urllib.request as urllib2
  12. except ImportError:
  13. import urllib2
  14. import threading
  15. CACHE_AGE = datetime.timedelta(days=1)
  16. def cache_load(url):
  17. """
  18. load the url and store it in a temporary cache directory
  19. subsequent requests for this url will use the cached version
  20. """
  21. filename = '.'.join([md5(url.encode()).hexdigest(), os.path.basename(url)])
  22. cache_dir = os.path.join(tempfile.gettempdir(), "odml.cache")
  23. if not os.path.exists(cache_dir):
  24. try:
  25. os.makedirs(cache_dir)
  26. except OSError: # might happen due to concurrency
  27. if not os.path.exists(cache_dir):
  28. raise
  29. cache_file = os.path.join(cache_dir, filename)
  30. if not os.path.exists(cache_file) \
  31. or datetime.datetime.fromtimestamp(os.path.getmtime(cache_file)) < \
  32. datetime.datetime.now() - CACHE_AGE:
  33. try:
  34. data = urllib2.urlopen(url).read()
  35. except Exception as e:
  36. print("failed loading '%s': %s" % (url, e))
  37. return
  38. fp = open(cache_file, "w")
  39. fp.write(str(data))
  40. fp.close()
  41. return open(cache_file)
  42. class Terminologies(dict):
  43. loading = {}
  44. def load(self, url):
  45. """
  46. load and cache a terminology-url
  47. returns the odml-document for the url
  48. """
  49. if url in self:
  50. return self[url]
  51. if url in self.loading:
  52. self.loading[url].join()
  53. self.loading.pop(url, None)
  54. return self.load(url)
  55. return self._load(url)
  56. def _load(self, url):
  57. # TODO also cache the data locally on disk
  58. # if url.startswith("http"): return None
  59. fp = cache_load(url)
  60. if fp is None:
  61. print("did not successfully load '%s'" % url)
  62. return
  63. try:
  64. term = odml.tools.xmlparser.XMLReader(filename=url, ignore_errors=True).fromFile(fp)
  65. term.finalize()
  66. except odml.tools.xmlparser.ParserException as e:
  67. print("Failed to load %s due to parser errors" % url)
  68. print(' "%s"' % e.message)
  69. term = None
  70. self[url] = term
  71. return term
  72. def deferred_load(self, url):
  73. """
  74. start a thread to load the terminology in background
  75. """
  76. if url in self or url in self.loading:
  77. return
  78. self.loading[url] = threading.Thread(target=self._load, args=(url,))
  79. self.loading[url].start()
  80. terminologies = Terminologies()
  81. load = terminologies.load
  82. deferred_load = terminologies.deferred_load
  83. if __name__ == "__main__":
  84. f = cache_load('http://portal.g-node.org/odml/terminologies/v1.0/analysis/analysis.xml')