xmlparser.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. #!/usr/bin/env python
  2. """
  3. The XML parsing module.
  4. Parses odML files. Can be invoked standalone:
  5. python -m odml.tools.xmlparser file.odml
  6. """
  7. #TODO make this module a parser class, allow arguments (e.g. skip_errors=1 to parse even broken documents)
  8. from odml import format
  9. from lxml import etree as ET
  10. from lxml.builder import E
  11. # this is needed for py2exe to include lxml completely
  12. from lxml import _elementpath as _dummy
  13. import sys
  14. try:
  15. from StringIO import StringIO
  16. except ImportError:
  17. from io import StringIO
  18. format.Document._xml_name = "odML"
  19. format.Section._xml_name = "section"
  20. format.Property._xml_name = "property"
  21. format.Value._xml_name = "value"
  22. format.Document._xml_attributes = {}
  23. format.Section._xml_attributes = {'name': None} # attribute 'name' maps to 'name', but writing it as a tag is preferred
  24. format.Property._xml_attributes = {}
  25. format.Value._xml_attributes = {}
  26. format.Value._xml_content = 'value'
  27. XML_VERSION = "1"
  28. class XMLWriter:
  29. """
  30. Creates XML nodes storing the information of an odML Document
  31. """
  32. header = """<?xml version="1.0" encoding="UTF-8"?>
  33. <?xml-stylesheet type="text/xsl" href="odmlTerms.xsl"?>
  34. <?xml-stylesheet type="text/xsl" href="odml.xsl"?>
  35. """
  36. def __init__(self, odml_document):
  37. self.doc = odml_document
  38. @staticmethod
  39. def save_element(e):
  40. """
  41. returns an xml node for the odML object e
  42. """
  43. fmt = e._format
  44. if hasattr(fmt, "_xml_content"):
  45. val = getattr(e, fmt.map(fmt._xml_content))
  46. if val is None:
  47. val = ''
  48. cur = E(fmt._name, val)
  49. else:
  50. cur = E(fmt._name)
  51. # generate attributes
  52. if isinstance(fmt, format.Document.__class__):
  53. cur.attrib['version'] = XML_VERSION
  54. for k, v in fmt._xml_attributes.items():
  55. if not v or not hasattr(e, fmt.map(v)):
  56. continue
  57. val = getattr(e, fmt.map(v))
  58. if val is None:
  59. continue # no need to save this
  60. if sys.version_info < (3, 0):
  61. cur.attrib[k] = unicode(val)
  62. else:
  63. cur.attrib[k] = str(val)
  64. # generate elements
  65. for k in fmt._args:
  66. if (k in fmt._xml_attributes and fmt._xml_attributes[k] is not None) or not hasattr(e, fmt.map(k)) \
  67. or (hasattr(fmt, "_xml_content") and fmt._xml_content == k):
  68. continue
  69. val = getattr(e, fmt.map(k))
  70. if val is None:
  71. continue
  72. if isinstance(val, list):
  73. for v in val:
  74. if v is None:
  75. continue
  76. ele = XMLWriter.save_element(v)
  77. cur.append(ele)
  78. else:
  79. if sys.version_info < (3,):
  80. ele = E(k, unicode(val))
  81. else:
  82. ele = E(k, str(val))
  83. # ele = E(k, unicode(val))
  84. cur.append(ele)
  85. return cur
  86. def __str__(self):
  87. return ET.tounicode(self.save_element(self.doc), pretty_print=True)
  88. def __unicode__(self):
  89. return ET.tounicode(self.save_element(self.doc), pretty_print=True)
  90. def write_file(self, filename):
  91. # calculate the data before opening the file in case we get any
  92. # exception
  93. if sys.version_info < (3, ):
  94. data = unicode(self).encode('utf-8')
  95. else:
  96. data = str(self)
  97. f = open(filename, "w")
  98. f.write(self.header)
  99. f.write(data)
  100. f.close()
  101. def load(filename):
  102. """
  103. shortcut function for XMLReader().fromFile(open(filename))
  104. """
  105. return XMLReader().fromFile(open(filename))
  106. class ParserException(Exception):
  107. pass
  108. class XMLReader(object):
  109. """
  110. A reader to parse xml-files or strings into odml data structures
  111. Usage:
  112. >>> doc = XMLReader().fromFile(open("file.odml"))
  113. """
  114. def __init__(self, ignore_errors=False, filename=None):
  115. self.parser = ET.XMLParser(remove_comments=True)
  116. self.tags = dict([(obj._xml_name, obj) for obj in format.__all__])
  117. self.ignore_errors = ignore_errors
  118. self.filename = filename
  119. def fromFile(self, xml_file):
  120. """
  121. parse the datastream from a file like object *xml_file*
  122. and return an odml data structure
  123. """
  124. try:
  125. root = ET.parse(xml_file, self.parser).getroot()
  126. except ET.XMLSyntaxError as e:
  127. raise ParserException(e.message)
  128. return self.parse_element(root)
  129. def fromString(self, string):
  130. try:
  131. root = ET.XML(string, self.parser)
  132. except ET.XMLSyntaxError as e:
  133. raise ParserException(e.message)
  134. return self.parse_element(root)
  135. def check_mandatory_arguments(self, data, ArgClass, tag_name, node):
  136. for k, v in ArgClass._args.items():
  137. if v != 0 and not ArgClass.map(k) in data:
  138. self.error("missing element <%s> within <%s> tag" % (k, tag_name) + repr(data), node)
  139. def is_valid_argument(self, tag_name, ArgClass, parent_node, child=None):
  140. if tag_name not in ArgClass._args:
  141. self.error("Invalid element <%s> inside <%s> tag" % (tag_name, parent_node.tag), parent_node if child is None else child)
  142. def error(self, msg, elem):
  143. if elem is not None:
  144. msg += " (line %d)" % elem.sourceline
  145. if self.ignore_errors:
  146. return self.warn(msg, elem)
  147. raise ParserException(msg)
  148. def warn(self, msg, elem):
  149. if elem is not None:
  150. msg = "warning[%s:%d:<%s>]: %s\n" % (self.filename, elem.sourceline, elem.tag, msg)
  151. else:
  152. msg = "warning: %s\n" % msg
  153. sys.stderr.write(msg)
  154. def parse_element(self, node):
  155. if node.tag not in self.tags:
  156. self.error("Invalid element <%s>" % node.tag, node)
  157. return None # won't be able to parse this one
  158. return getattr(self, "parse_" + node.tag)(node, self.tags[node.tag])
  159. def parse_tag(self, root, fmt, insert_children=True, create=None):
  160. """
  161. parse an odml node based on the format description *fmt*
  162. and a function *create* to instantiate a corresponding object
  163. """
  164. arguments = {}
  165. extra_args = {}
  166. children = []
  167. text = []
  168. if root.text:
  169. text.append(root.text.strip())
  170. for k, v in root.attrib.iteritems():
  171. k = k.lower()
  172. self.is_valid_argument(k, fmt, root)
  173. if k == 'version' and root.tag == 'odML':
  174. continue # special case for XML version
  175. if k not in fmt._xml_attributes:
  176. self.error("<%s %s=...>: is not a valid attribute for %s" % (root.tag, k, root.tag), root)
  177. else:
  178. k = fmt._xml_attributes[k] or k
  179. arguments[k] = v
  180. for node in root:
  181. node.tag = node.tag.lower()
  182. self.is_valid_argument(node.tag, fmt, root, node)
  183. if node.tag in fmt._args:
  184. if node.tag in self.tags and node.tag in fmt._map: # this is a heuristic, but works for now
  185. sub_obj = self.parse_element(node)
  186. if sub_obj is not None:
  187. extra_args[fmt.map(node.tag)] = sub_obj
  188. children.append(sub_obj)
  189. else:
  190. tag = fmt.map(node.tag)
  191. if tag in arguments:
  192. # TODO make this an error, however first figure out a way to let <odML version=><version/> pass
  193. self.warn("Element <%s> is given multiple times in <%s> tag" % (node.tag, root.tag), node)
  194. arguments[tag] = node.text.strip() if node.text else None
  195. else:
  196. self.error("Invalid element <%s> in odML document section <%s>" % (node.tag, root.tag), node)
  197. if node.tail:
  198. text.append(node.tail.strip())
  199. if create is None:
  200. obj = fmt.create()
  201. else:
  202. obj = create(args=arguments, text=''.join(text), children=children)
  203. if sys.version_info > (3,):
  204. self.check_mandatory_arguments(dict(list(arguments.items()) + list(extra_args.items())),
  205. fmt, root.tag, root)
  206. else:
  207. self.check_mandatory_arguments(dict(arguments.items() + extra_args.items()),
  208. fmt, root.tag, root)
  209. for k, v in arguments.items():
  210. if hasattr(obj, k):
  211. try:
  212. setattr(obj, k, v)
  213. except Exception as e:
  214. self.warn("cannot set '%s' property on <%s>: %s" % (k, root.tag, repr(e)), root)
  215. if not self.ignore_errors:
  216. raise
  217. if insert_children:
  218. for child in children:
  219. obj.append(child)
  220. return obj
  221. def parse_odML(self, root, fmt):
  222. doc = self.parse_tag(root, fmt)
  223. return doc
  224. def parse_section(self, root, fmt):
  225. name = root.get("name") # property name= overrides
  226. if name is None: # the element
  227. name_node = root.find("name")
  228. if name_node is not None:
  229. name = name_node.text
  230. root.remove(name_node)
  231. # delete the name_node so its value won't
  232. # be used to overwrite the already set name-attribute
  233. if name is None:
  234. self.error("Missing name element in <section>", root)
  235. return self.parse_tag(root, fmt, create=lambda **kargs: fmt.create(name))
  236. def parse_property(self, root, fmt):
  237. create = lambda children, args, **kargs: fmt.create(value=children, **args)
  238. return self.parse_tag(root, fmt, insert_children=False, create=create)
  239. def parse_value(self, root, fmt):
  240. create = lambda text, args, **kargs: fmt.create(text, **args)
  241. return self.parse_tag(root, fmt, create=create)
  242. if __name__ == '__main__':
  243. from optparse import OptionParser
  244. import odml.tools.dumper as dumper
  245. parser = OptionParser()
  246. (options, args) = parser.parse_args()
  247. if len(args) < 1:
  248. parser.print_help()
  249. else:
  250. dumper.dumpDoc(load(args[0]))