Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

xmlparser.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. #!/usr/bin/env python
  2. """
  3. The XML parsing module.
  4. Parses odML files. Can be invoked standalone:
  5. python -m odml.tools.xmlparser file.odml
  6. """
  7. import csv
  8. import sys
  9. from lxml import etree as ET
  10. from lxml.builder import E
  11. # this is needed for py2exe to include lxml completely
  12. from lxml import _elementpath as _dummy
  13. from os.path import basename
  14. try:
  15. from StringIO import StringIO
  16. except ImportError:
  17. from io import StringIO
  18. from .. import format
  19. from ..info import FORMAT_VERSION
  20. from .parser_utils import InvalidVersionException, ParserException
  21. try:
  22. unicode = unicode
  23. except NameError:
  24. unicode = str
  25. def to_csv(val):
  26. # Make sure all individual values do not contain
  27. # leading or trailing whitespaces.
  28. unicode_values = list(map(unicode.strip, map(unicode, val)))
  29. stream = StringIO()
  30. writer = csv.writer(stream, dialect="excel")
  31. writer.writerow(unicode_values)
  32. # Strip any csv.writer added carriage return line feeds
  33. # and double quotes before saving.
  34. csv_string = stream.getvalue().strip().strip('"')
  35. if len(unicode_values) > 1:
  36. csv_string = "[" + csv_string + "]"
  37. return csv_string
  38. def from_csv(value_string):
  39. if not value_string:
  40. return []
  41. if value_string[0] == "[" and value_string[-1] == "]":
  42. value_string = value_string[1:-1]
  43. else:
  44. # This is a single string entry, any comma contained
  45. # is part of the value and must not be used to
  46. # split up the string.
  47. return [value_string]
  48. if not value_string:
  49. return []
  50. stream = StringIO(value_string)
  51. stream.seek(0)
  52. reader = csv.reader(stream, dialect="excel")
  53. return list(reader)[0]
  54. class XMLWriter:
  55. """
  56. Creates XML nodes storing the information of an odML Document
  57. """
  58. header = """<?xml version="1.0" encoding="UTF-8"?>
  59. <?xml-stylesheet type="text/xsl" href="odmlTerms.xsl"?>
  60. <?xml-stylesheet type="text/xsl" href="odml.xsl"?>
  61. """
  62. def __init__(self, odml_document):
  63. self.doc = odml_document
  64. @staticmethod
  65. def save_element(e):
  66. """
  67. returns an xml node for the odML object e
  68. """
  69. fmt = e.format()
  70. cur = E(fmt.name)
  71. # generate attributes
  72. if isinstance(fmt, format.Document.__class__):
  73. cur.attrib['version'] = FORMAT_VERSION
  74. # generate elements
  75. for k in fmt.arguments_keys:
  76. if not hasattr(e, fmt.map(k)):
  77. continue
  78. val = getattr(e, fmt.map(k))
  79. if val is None:
  80. continue
  81. if isinstance(fmt, format.Property.__class__) and k == "value":
  82. # Custom odML tuples require special handling for save loading from file.
  83. if e.dtype and e.dtype.endswith("-tuple") and len(val) > 0:
  84. ele = E(k, "(%s)" % ";".join(val[0]))
  85. else:
  86. ele = E(k, to_csv(val))
  87. cur.append(ele)
  88. else:
  89. if isinstance(val, list):
  90. for v in val:
  91. if v is None:
  92. continue
  93. ele = XMLWriter.save_element(v)
  94. cur.append(ele)
  95. else:
  96. if sys.version_info < (3,):
  97. ele = E(k, unicode(val))
  98. else:
  99. ele = E(k, str(val))
  100. cur.append(ele)
  101. return cur
  102. def __str__(self):
  103. return ET.tounicode(self.save_element(self.doc), pretty_print=True)
  104. def __unicode__(self):
  105. return ET.tounicode(self.save_element(self.doc), pretty_print=True)
  106. def write_file(self, filename):
  107. # calculate the data before opening the file in case we get any
  108. # exception
  109. if sys.version_info < (3,):
  110. data = unicode(self).encode('utf-8')
  111. else:
  112. data = str(self)
  113. with open(filename, "w") as file:
  114. file.write(self.header)
  115. file.write(data)
  116. def load(filename):
  117. """
  118. shortcut function for XMLReader().from_file(filename)
  119. """
  120. return XMLReader().from_file(filename)
  121. class XMLReader(object):
  122. """
  123. A reader to parse xml-files or strings into odml data structures
  124. Usage:
  125. >>> doc = XMLReader().from_file("file.odml")
  126. """
  127. def __init__(self, ignore_errors=False, show_warnings=True, filename=None):
  128. """
  129. :param ignore_errors: To allow loading and fixing of invalid odml files
  130. encountered errors can be converted to warnings
  131. instead. Such a document can only be saved when
  132. all errors have been addressed though.
  133. :param show_warnings: Toggle whether to print warnings to the command line.
  134. Any warnings can be accessed via the Reader's class
  135. warnings attribute after parsing is done.
  136. :param filename: Path to an odml file.
  137. """
  138. self.parser = ET.XMLParser(remove_comments=True)
  139. self.tags = dict([(obj.name, obj) for obj in format.__all__])
  140. self.ignore_errors = ignore_errors
  141. self.show_warnings = show_warnings
  142. self.filename = filename
  143. self.warnings = []
  144. @staticmethod
  145. def _handle_version(root):
  146. """
  147. Check if the odML version of a handed in parsed lxml.etree is supported
  148. by the current library and raise an Exception otherwise.
  149. :param root: Root node of a parsed lxml.etree. The root tag has to
  150. contain a supported odML version number, otherwise it is not
  151. accepted as a valid odML file.
  152. """
  153. if root.tag != 'odML':
  154. raise ParserException("Expecting <odML> tag but got <%s>.\n" % root.tag)
  155. elif 'version' not in root.attrib:
  156. raise ParserException("Could not find format version attribute "
  157. "in <odML> tag.\n")
  158. elif root.attrib['version'] != FORMAT_VERSION:
  159. msg = ("Cannot parse odML document with format version '%s'. \n"
  160. "\tUse the 'tools.VersionConverter' to import previous odML formats."
  161. % root.attrib['version'])
  162. raise InvalidVersionException(msg)
  163. def from_file(self, xml_file):
  164. """
  165. parse the datastream from a file like object *xml_file*
  166. and return an odml data structure
  167. """
  168. try:
  169. root = ET.parse(xml_file, self.parser).getroot()
  170. if hasattr(xml_file, "close"):
  171. xml_file.close()
  172. except ET.XMLSyntaxError as e:
  173. raise ParserException(e.msg)
  174. self._handle_version(root)
  175. doc = self.parse_element(root)
  176. # Provide original file name via the in memory document
  177. if isinstance(xml_file, unicode):
  178. doc._origin_file_name = basename(xml_file)
  179. return doc
  180. def from_string(self, string):
  181. try:
  182. root = ET.XML(string, self.parser)
  183. except ET.XMLSyntaxError as e:
  184. raise ParserException(e.msg)
  185. self._handle_version(root)
  186. return self.parse_element(root)
  187. def check_mandatory_arguments(self, data, ArgClass, tag_name, node):
  188. for k, v in ArgClass.arguments:
  189. if v != 0 and not ArgClass.map(k) in data:
  190. self.error("missing element <%s> within <%s> tag" %
  191. (k, tag_name) + repr(data), node)
  192. def is_valid_argument(self, tag_name, ArgClass, parent_node, child=None):
  193. if tag_name not in ArgClass.arguments_keys:
  194. self.error("Invalid element <%s> inside <%s> tag" %
  195. (tag_name, parent_node.tag),
  196. parent_node if child is None else child)
  197. def error(self, msg, elem):
  198. if elem is not None:
  199. msg += " (line %d)" % elem.sourceline
  200. if self.ignore_errors:
  201. return self.warn(msg, elem)
  202. raise ParserException(msg)
  203. def warn(self, msg, elem):
  204. if elem is not None:
  205. msg = "warning[%s:%d:<%s>]: %s\n" % (
  206. self.filename, elem.sourceline, elem.tag, msg)
  207. else:
  208. msg = "warning: %s\n" % msg
  209. self.warnings.append(msg)
  210. if self.show_warnings:
  211. sys.stderr.write(msg)
  212. def parse_element(self, node):
  213. if node.tag not in self.tags:
  214. self.error("Invalid element <%s>" % node.tag, node)
  215. return None # won't be able to parse this one
  216. return getattr(self, "parse_" + node.tag)(node, self.tags[node.tag])
  217. def parse_tag(self, root, fmt, insert_children=True):
  218. """
  219. Parse an odml node based on the format description *fmt*
  220. and instantiate the corresponding object.
  221. :param root: lxml.etree node containing an odML object or object tree.
  222. :param fmt: odML class corresponding to the content of the root node.
  223. :param insert_children: Bool value. When True, child elements of the root node
  224. will be parsed to their odML equivalents and appended to
  225. the odML document. When False, child elements of the
  226. root node will be ignored.
  227. """
  228. arguments = {}
  229. extra_args = {}
  230. children = []
  231. for k, v in root.attrib.iteritems():
  232. k = k.lower()
  233. # 'version' is currently the only supported XML attribute.
  234. if k == 'version' and root.tag == 'odML':
  235. continue
  236. # We currently do not support XML attributes.
  237. self.error("Attribute not supported, ignoring '%s=%s'" % (k, v), root)
  238. for node in root:
  239. node.tag = node.tag.lower()
  240. self.is_valid_argument(node.tag, fmt, root, node)
  241. if node.tag in fmt.arguments_keys:
  242. # this is a heuristic, but works for now
  243. if node.tag in self.tags and node.tag in fmt.map_keys:
  244. sub_obj = self.parse_element(node)
  245. if sub_obj is not None:
  246. extra_args[fmt.map(node.tag)] = sub_obj
  247. children.append(sub_obj)
  248. else:
  249. tag = fmt.map(node.tag)
  250. if tag in arguments:
  251. self.warn("Element <%s> is given multiple times in "
  252. "<%s> tag" % (node.tag, root.tag), node)
  253. # Special handling of values;
  254. curr_text = node.text.strip() if node.text else None
  255. if tag == "values" and curr_text:
  256. content = from_csv(node.text)
  257. arguments[tag] = content
  258. else:
  259. arguments[tag] = curr_text
  260. else:
  261. self.error("Invalid element <%s> in odML document section <%s>"
  262. % (node.tag, root.tag), node)
  263. if sys.version_info > (3,):
  264. check_args = dict(list(arguments.items()) + list(extra_args.items()))
  265. else:
  266. check_args = dict(arguments.items() + extra_args.items())
  267. self.check_mandatory_arguments(check_args, fmt, root.tag, root)
  268. # Instantiate the current odML object with the parsed attributes.
  269. obj = fmt.create(**arguments)
  270. if insert_children:
  271. for child in children:
  272. obj.append(child)
  273. return obj
  274. def parse_odML(self, root, fmt):
  275. doc = self.parse_tag(root, fmt)
  276. return doc
  277. def parse_section(self, root, fmt):
  278. return self.parse_tag(root, fmt)
  279. def parse_property(self, root, fmt):
  280. return self.parse_tag(root, fmt, insert_children=False)
  281. if __name__ == '__main__':
  282. from optparse import OptionParser
  283. import odml.tools.dumper as dumper
  284. parser = OptionParser()
  285. (options, args) = parser.parse_args()
  286. if len(args) < 1:
  287. parser.print_help()
  288. else:
  289. dumper.dumpDoc(load(args[0]))