123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306 |
- #!/usr/bin/env python
- """
- The XML parsing module.
- Parses odML files. Can be invoked standalone:
- python -m odml.tools.xmlparser file.odml
- """
- #TODO make this module a parser class, allow arguments (e.g. skip_errors=1 to parse even broken documents)
- from odml import format
- from lxml import etree as ET
- from lxml.builder import E
- # this is needed for py2exe to include lxml completely
- from lxml import _elementpath as _dummy
- import sys
- try:
- from StringIO import StringIO
- except ImportError:
- from io import StringIO
# Attach the XML-specific description to each odML format object.
#   _xml_name        tag name of the element (XMLReader keys its tag
#                    lookup table on this value)
#   _xml_attributes  XML attribute name -> object attribute name; a value of
#                    None means the item is written as a child element
#   _xml_content     name of the argument stored as the element's text

# <odML>
format.Document._xml_name = "odML"
format.Document._xml_attributes = {}

# <section>
format.Section._xml_name = "section"
# attribute 'name' maps to 'name', but writing it as a tag is preferred
format.Section._xml_attributes = {'name': None}

# <property>
format.Property._xml_name = "property"
format.Property._xml_attributes = {}

# <value>
format.Value._xml_name = "value"
format.Value._xml_attributes = {}
format.Value._xml_content = 'value'

# value written to the 'version' attribute of the root element
XML_VERSION = "1"
class XMLWriter:
    """
    Creates XML nodes storing the information of an odML Document

    Usage: wrap a document and either convert the writer to a string or
    call write_file() to store the XML (preceded by *header*) on disk.
    """
    # Written verbatim in front of the serialised document by write_file().
    header = """<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="odmlTerms.xsl"?>
<?xml-stylesheet type="text/xsl" href="odml.xsl"?>
"""

    def __init__(self, odml_document):
        """
        remember *odml_document* as the object to be serialised
        """
        self.doc = odml_document

    @staticmethod
    def save_element(e):
        """
        returns an xml node for the odML object e

        The node layout is driven by the format description ``e._format``:
        entries of ``_xml_attributes`` with a non-None target become XML
        attributes, the argument named by ``_xml_content`` (if any) becomes
        the node text and every other entry of ``_args`` becomes a child
        element. List values are serialised recursively, one child per item.
        Attributes and items whose value is None are left out.
        """
        fmt = e._format
        # text type of the running interpreter; the conditional expression
        # never evaluates the name `unicode` on Python 3
        to_text = unicode if sys.version_info < (3, 0) else str
        has_content = hasattr(fmt, "_xml_content")

        if has_content:
            val = getattr(e, fmt.map(fmt._xml_content))
            if val is None:
                val = ''
            cur = E(fmt._name, val)
        else:
            cur = E(fmt._name)

        # generate attributes
        if isinstance(fmt, format.Document.__class__):
            cur.attrib['version'] = XML_VERSION

        for k, v in fmt._xml_attributes.items():
            if not v or not hasattr(e, fmt.map(v)):
                continue
            val = getattr(e, fmt.map(v))
            if val is None:
                continue  # no need to save this
            cur.attrib[k] = to_text(val)

        # generate elements
        for k in fmt._args:
            # skip what was already written as attribute or as node text,
            # and anything the object does not provide
            if (k in fmt._xml_attributes and fmt._xml_attributes[k] is not None) \
                    or not hasattr(e, fmt.map(k)) \
                    or (has_content and fmt._xml_content == k):
                continue

            val = getattr(e, fmt.map(k))
            if val is None:
                continue

            if isinstance(val, list):
                for v in val:
                    if v is None:
                        continue
                    cur.append(XMLWriter.save_element(v))
            else:
                cur.append(E(k, to_text(val)))

        return cur

    def __unicode__(self):
        """
        returns the pretty-printed XML of the document as unicode text
        """
        return ET.tounicode(self.save_element(self.doc), pretty_print=True)

    def __str__(self):
        """
        returns the pretty-printed XML of the document as a native string
        """
        text = self.__unicode__()
        if sys.version_info < (3, ):
            # A Python 2 __str__ has to return bytes; returning unicode makes
            # str(writer) fail on any non-ASCII character.
            return text.encode('utf-8')
        return text

    def write_file(self, filename):
        """
        writes the header followed by the XML of the document to *filename*

        The output is always UTF-8 encoded, as announced by the XML
        declaration in *header*.
        """
        # calculate the data before opening the file in case we get any
        # exception
        if sys.version_info < (3, ):
            text = unicode(self)
        else:
            text = str(self)
        data = (self.header + text).encode('utf-8')

        # Binary mode keeps the bytes on disk independent of the locale's
        # default encoding; the with-block closes the file even if the
        # write fails.
        with open(filename, "wb") as f:
            f.write(data)
def load(filename):
    """
    shortcut function to parse the odML file *filename*

    Equivalent to XMLReader(filename=filename).fromFile(f) with *f* being
    the opened file; the file is closed again once parsing has finished.
    Returns whatever XMLReader.fromFile returns and raises ParserException
    if the file is not well-formed XML.
    """
    # Binary mode leaves decoding to the XML parser, which honours the
    # encoding given in the XML declaration instead of the locale default.
    with open(filename, "rb") as xml_file:
        # passing the name lets parser warnings mention the file
        return XMLReader(filename=filename).fromFile(xml_file)
class ParserException(Exception):
    """
    Raised by XMLReader when the input is malformed XML or violates the
    odML structure and errors are not being ignored.
    """
class XMLReader(object):
    """
    A reader to parse xml-files or strings into odml data structures
    Usage:
        >>> doc = XMLReader().fromFile(open("file.odml"))
    """

    def __init__(self, ignore_errors=False, filename=None):
        """
        *ignore_errors*: report problems as warnings on stderr instead of
                         raising a ParserException
        *filename*:      only used to label warning messages
        """
        self.parser = ET.XMLParser(remove_comments=True)
        # tag name -> format description, for every known odML format
        self.tags = dict((obj._xml_name, obj) for obj in format.__all__)
        self.ignore_errors = ignore_errors
        self.filename = filename

    @staticmethod
    def _syntax_error_text(exc):
        """
        returns the human readable message carried by the exception *exc*
        """
        # `.message` only exists on Python 2 exceptions; without the
        # fallback a syntax error would surface as AttributeError on
        # Python 3 instead of as ParserException.
        return getattr(exc, "message", None) or str(exc)

    def fromFile(self, xml_file):
        """
        parse the datastream from a file like object *xml_file*
        and return an odml data structure

        raises ParserException if the data is not well-formed XML
        """
        try:
            root = ET.parse(xml_file, self.parser).getroot()
        except ET.XMLSyntaxError as e:
            raise ParserException(self._syntax_error_text(e))
        return self.parse_element(root)

    def fromString(self, string):
        """
        parse the xml document contained in *string*
        and return an odml data structure

        raises ParserException if the data is not well-formed XML
        """
        try:
            root = ET.XML(string, self.parser)
        except ET.XMLSyntaxError as e:
            raise ParserException(self._syntax_error_text(e))
        return self.parse_element(root)

    def check_mandatory_arguments(self, data, ArgClass, tag_name, node):
        """
        report an error for every entry of ArgClass._args that is flagged
        with a non-zero value but whose mapped name is missing from *data*
        """
        for k, v in ArgClass._args.items():
            if v != 0 and ArgClass.map(k) not in data:
                self.error("missing element <%s> within <%s> tag" % (k, tag_name) + repr(data), node)

    def is_valid_argument(self, tag_name, ArgClass, parent_node, child=None):
        """
        report an error if *tag_name* is not listed in ArgClass._args

        The error is attributed to *child* if given, else to *parent_node*.
        """
        if tag_name not in ArgClass._args:
            self.error("Invalid element <%s> inside <%s> tag" % (tag_name, parent_node.tag),
                       parent_node if child is None else child)

    def error(self, msg, elem):
        """
        raise a ParserException for *msg*, or only emit a warning
        if the reader was created with ignore_errors
        """
        if elem is not None:
            msg += " (line %d)" % elem.sourceline
        if self.ignore_errors:
            return self.warn(msg, elem)
        raise ParserException(msg)

    def warn(self, msg, elem):
        """
        write a warning for *msg* to stderr, prefixed with the location
        of *elem* when it is given
        """
        if elem is not None:
            msg = "warning[%s:%d:<%s>]: %s\n" % (self.filename, elem.sourceline, elem.tag, msg)
        else:
            msg = "warning: %s\n" % msg
        sys.stderr.write(msg)

    def parse_element(self, node):
        """
        dispatch *node* to the parse_<tag> method matching its tag name

        returns None if the tag is unknown and errors are ignored
        """
        if node.tag not in self.tags:
            self.error("Invalid element <%s>" % node.tag, node)
            return None  # won't be able to parse this one
        return getattr(self, "parse_" + node.tag)(node, self.tags[node.tag])

    def parse_tag(self, root, fmt, insert_children=True, create=None):
        """
        parse an odml node based on the format description *fmt*
        and a function *create* to instantiate a corresponding object

        *create* is called with the keyword arguments args, text and
        children; without it the object is built by fmt.create().
        If *insert_children* is set, every parsed sub-object is appended
        to the new object.
        """
        arguments = {}   # plain values from attributes and simple elements
        extra_args = {}  # parsed sub-objects; only used for the mandatory check
        children = []
        text = []

        if root.text:
            text.append(root.text.strip())

        for k, v in root.attrib.items():
            k = k.lower()
            self.is_valid_argument(k, fmt, root)
            if k == 'version' and root.tag == 'odML':
                continue  # special case for XML version
            if k not in fmt._xml_attributes:
                self.error("<%s %s=...>: is not a valid attribute for %s" % (root.tag, k, root.tag), root)
            else:
                k = fmt._xml_attributes[k] or k
            arguments[k] = v

        for node in root:
            node.tag = node.tag.lower()
            self.is_valid_argument(node.tag, fmt, root, node)
            if node.tag in fmt._args:
                if node.tag in self.tags and node.tag in fmt._map:  # this is a heuristic, but works for now
                    sub_obj = self.parse_element(node)
                    if sub_obj is not None:
                        extra_args[fmt.map(node.tag)] = sub_obj
                        children.append(sub_obj)
                else:
                    tag = fmt.map(node.tag)
                    if tag in arguments:
                        # TODO make this an error, however first figure out a way to let <odML version=><version/> pass
                        self.warn("Element <%s> is given multiple times in <%s> tag" % (node.tag, root.tag), node)
                    arguments[tag] = node.text.strip() if node.text else None
            else:
                self.error("Invalid element <%s> in odML document section <%s>" % (node.tag, root.tag), node)

            if node.tail:
                text.append(node.tail.strip())

        if create is None:
            obj = fmt.create()
        else:
            obj = create(args=arguments, text=''.join(text), children=children)

        # sub-objects count as given arguments as well
        provided = dict(arguments)
        provided.update(extra_args)
        self.check_mandatory_arguments(provided, fmt, root.tag, root)

        for k, v in arguments.items():
            if hasattr(obj, k):
                try:
                    setattr(obj, k, v)
                except Exception as e:
                    self.warn("cannot set '%s' property on <%s>: %s" % (k, root.tag, repr(e)), root)
                    if not self.ignore_errors:
                        raise

        if insert_children:
            for child in children:
                obj.append(child)

        return obj

    def parse_odML(self, root, fmt):
        """
        parse the document root element
        """
        doc = self.parse_tag(root, fmt)
        return doc

    def parse_section(self, root, fmt):
        """
        parse a <section> element; its name is taken from the name
        attribute or, if that is absent, from the <name> child element
        """
        name = root.get("name")  # property name= overrides
        if name is None:         # the element
            name_node = root.find("name")
            if name_node is not None:
                name = name_node.text
                root.remove(name_node)
                # delete the name_node so its value won't
                # be used to overwrite the already set name-attribute

        if name is None:
            self.error("Missing name element in <section>", root)

        def create(**kargs):
            return fmt.create(name)

        return self.parse_tag(root, fmt, create=create)

    def parse_property(self, root, fmt):
        """
        parse a <property> element; the parsed sub-objects are handed to
        the constructor as *value* rather than being appended afterwards
        """
        def create(children, args, **kargs):
            return fmt.create(value=children, **args)

        return self.parse_tag(root, fmt, insert_children=False, create=create)

    def parse_value(self, root, fmt):
        """
        parse a <value> element; the text of the node is passed to the
        constructor as first argument
        """
        def create(text, args, **kargs):
            return fmt.create(text, **args)

        return self.parse_tag(root, fmt, create=create)
if __name__ == '__main__':
    # Standalone use: dump the odML file given as first positional argument.
    from optparse import OptionParser
    import odml.tools.dumper as dumper

    cli = OptionParser()
    _options, positional = cli.parse_args()

    if positional:
        dumper.dumpDoc(load(positional[0]))
    else:
        # no file given: show the usage text
        cli.print_help()
|