xmltodict.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. #!/usr/bin/env python
  2. "Makes working with XML feel like you are working with JSON"
  3. from xml.parsers import expat
  4. from xml.sax.saxutils import XMLGenerator
  5. from xml.sax.xmlreader import AttributesImpl
  6. try: # pragma no cover
  7. from cStringIO import StringIO
  8. except ImportError: # pragma no cover
  9. try:
  10. from StringIO import StringIO
  11. except ImportError:
  12. from io import StringIO
  13. try: # pragma no cover
  14. from collections import OrderedDict
  15. except ImportError: # pragma no cover
  16. try:
  17. from ordereddict import OrderedDict
  18. except ImportError:
  19. OrderedDict = dict
  20. try: # pragma no cover
  21. _basestring = basestring
  22. except NameError: # pragma no cover
  23. _basestring = str
  24. try: # pragma no cover
  25. _unicode = unicode
  26. except NameError: # pragma no cover
  27. _unicode = str
  28. __author__ = 'Martin Blech'
  29. __version__ = '0.9.2'
  30. __license__ = 'MIT'
  31. class ParsingInterrupted(Exception):
  32. pass
  33. class _DictSAXHandler(object):
  34. def __init__(self,
  35. item_depth=0,
  36. item_callback=lambda *args: True,
  37. xml_attribs=True,
  38. attr_prefix='@',
  39. cdata_key='#text',
  40. force_cdata=False,
  41. cdata_separator='',
  42. postprocessor=None,
  43. dict_constructor=OrderedDict,
  44. strip_whitespace=True,
  45. namespace_separator=':',
  46. namespaces=None):
  47. self.path = []
  48. self.stack = []
  49. self.data = None
  50. self.item = None
  51. self.item_depth = item_depth
  52. self.xml_attribs = xml_attribs
  53. self.item_callback = item_callback
  54. self.attr_prefix = attr_prefix
  55. self.cdata_key = cdata_key
  56. self.force_cdata = force_cdata
  57. self.cdata_separator = cdata_separator
  58. self.postprocessor = postprocessor
  59. self.dict_constructor = dict_constructor
  60. self.strip_whitespace = strip_whitespace
  61. self.namespace_separator = namespace_separator
  62. self.namespaces = namespaces
  63. def _build_name(self, full_name):
  64. if not self.namespaces:
  65. return full_name
  66. i = full_name.rfind(self.namespace_separator)
  67. if i == -1:
  68. return full_name
  69. namespace, name = full_name[:i], full_name[i+1:]
  70. short_namespace = self.namespaces.get(namespace, namespace)
  71. if not short_namespace:
  72. return name
  73. else:
  74. return self.namespace_separator.join((short_namespace, name))
  75. def _attrs_to_dict(self, attrs):
  76. if isinstance(attrs, dict):
  77. return attrs
  78. return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))
  79. def startElement(self, full_name, attrs):
  80. name = self._build_name(full_name)
  81. attrs = self._attrs_to_dict(attrs)
  82. self.path.append((name, attrs or None))
  83. if len(self.path) > self.item_depth:
  84. self.stack.append((self.item, self.data))
  85. if self.xml_attribs:
  86. attrs = self.dict_constructor(
  87. (self.attr_prefix+key, value)
  88. for (key, value) in attrs.items())
  89. else:
  90. attrs = None
  91. self.item = attrs or None
  92. self.data = None
  93. def endElement(self, full_name):
  94. name = self._build_name(full_name)
  95. if len(self.path) == self.item_depth:
  96. item = self.item
  97. if item is None:
  98. item = self.data
  99. should_continue = self.item_callback(self.path, item)
  100. if not should_continue:
  101. raise ParsingInterrupted()
  102. if len(self.stack):
  103. item, data = self.item, self.data
  104. self.item, self.data = self.stack.pop()
  105. if self.strip_whitespace and data is not None:
  106. data = data.strip() or None
  107. if data and self.force_cdata and item is None:
  108. item = self.dict_constructor()
  109. if item is not None:
  110. if data:
  111. self.push_data(item, self.cdata_key, data)
  112. self.item = self.push_data(self.item, name, item)
  113. else:
  114. self.item = self.push_data(self.item, name, data)
  115. else:
  116. self.item = self.data = None
  117. self.path.pop()
  118. def characters(self, data):
  119. if not self.data:
  120. self.data = data
  121. else:
  122. self.data += self.cdata_separator + data
  123. def push_data(self, item, key, data):
  124. if self.postprocessor is not None:
  125. result = self.postprocessor(self.path, key, data)
  126. if result is None:
  127. return item
  128. key, data = result
  129. if item is None:
  130. item = self.dict_constructor()
  131. try:
  132. value = item[key]
  133. if isinstance(value, list):
  134. value.append(data)
  135. else:
  136. item[key] = [value, data]
  137. except KeyError:
  138. item[key] = data
  139. return item
  140. def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
  141. namespace_separator=':', **kwargs):
  142. """Parse the given XML input and convert it into a dictionary.
  143. `xml_input` can either be a `string` or a file-like object.
  144. If `xml_attribs` is `True`, element attributes are put in the dictionary
  145. among regular child elements, using `@` as a prefix to avoid collisions. If
  146. set to `False`, they are just ignored.
  147. Simple example::
  148. >>> import xmltodict
  149. >>> doc = xmltodict.parse(\"\"\"
  150. ... <a prop="x">
  151. ... <b>1</b>
  152. ... <b>2</b>
  153. ... </a>
  154. ... \"\"\")
  155. >>> doc['a']['@prop']
  156. u'x'
  157. >>> doc['a']['b']
  158. [u'1', u'2']
  159. If `item_depth` is `0`, the function returns a dictionary for the root
  160. element (default behavior). Otherwise, it calls `item_callback` every time
  161. an item at the specified depth is found and returns `None` in the end
  162. (streaming mode).
  163. The callback function receives two parameters: the `path` from the document
  164. root to the item (name-attribs pairs), and the `item` (dict). If the
  165. callback's return value is false-ish, parsing will be stopped with the
  166. :class:`ParsingInterrupted` exception.
  167. Streaming example::
  168. >>> def handle(path, item):
  169. ... print 'path:%s item:%s' % (path, item)
  170. ... return True
  171. ...
  172. >>> xmltodict.parse(\"\"\"
  173. ... <a prop="x">
  174. ... <b>1</b>
  175. ... <b>2</b>
  176. ... </a>\"\"\", item_depth=2, item_callback=handle)
  177. path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
  178. path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2
  179. The optional argument `postprocessor` is a function that takes `path`,
  180. `key` and `value` as positional arguments and returns a new `(key, value)`
  181. pair where both `key` and `value` may have changed. Usage example::
  182. >>> def postprocessor(path, key, value):
  183. ... try:
  184. ... return key + ':int', int(value)
  185. ... except (ValueError, TypeError):
  186. ... return key, value
  187. >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
  188. ... postprocessor=postprocessor)
  189. OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))])
  190. You can pass an alternate version of `expat` (such as `defusedexpat`) by
  191. using the `expat` parameter. E.g:
  192. >>> import defusedexpat
  193. >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
  194. OrderedDict([(u'a', u'hello')])
  195. """
  196. handler = _DictSAXHandler(namespace_separator=namespace_separator,
  197. **kwargs)
  198. if isinstance(xml_input, _unicode):
  199. if not encoding:
  200. encoding = 'utf-8'
  201. xml_input = xml_input.encode(encoding)
  202. if not process_namespaces:
  203. namespace_separator = None
  204. parser = expat.ParserCreate(
  205. encoding,
  206. namespace_separator
  207. )
  208. try:
  209. parser.ordered_attributes = True
  210. except AttributeError:
  211. # Jython's expat does not support ordered_attributes
  212. pass
  213. parser.StartElementHandler = handler.startElement
  214. parser.EndElementHandler = handler.endElement
  215. parser.CharacterDataHandler = handler.characters
  216. parser.buffer_text = True
  217. try:
  218. parser.ParseFile(xml_input)
  219. except (TypeError, AttributeError):
  220. parser.Parse(xml_input, True)
  221. return handler.item
  222. def _emit(key, value, content_handler,
  223. attr_prefix='@',
  224. cdata_key='#text',
  225. depth=0,
  226. preprocessor=None,
  227. pretty=False,
  228. newl='\n',
  229. indent='\t',
  230. full_document=True):
  231. if preprocessor is not None:
  232. result = preprocessor(key, value)
  233. if result is None:
  234. return
  235. key, value = result
  236. if not isinstance(value, (list, tuple)):
  237. value = [value]
  238. if full_document and depth == 0 and len(value) > 1:
  239. raise ValueError('document with multiple roots')
  240. for v in value:
  241. if v is None:
  242. v = OrderedDict()
  243. elif not isinstance(v, dict):
  244. v = _unicode(v)
  245. if isinstance(v, _basestring):
  246. v = OrderedDict(((cdata_key, v),))
  247. cdata = None
  248. attrs = OrderedDict()
  249. children = []
  250. for ik, iv in v.items():
  251. if ik == cdata_key:
  252. cdata = iv
  253. continue
  254. if ik.startswith(attr_prefix):
  255. attrs[ik[len(attr_prefix):]] = iv
  256. continue
  257. children.append((ik, iv))
  258. if pretty:
  259. content_handler.ignorableWhitespace(depth * indent)
  260. content_handler.startElement(key, AttributesImpl(attrs))
  261. if pretty and children:
  262. content_handler.ignorableWhitespace(newl)
  263. for child_key, child_value in children:
  264. _emit(child_key, child_value, content_handler,
  265. attr_prefix, cdata_key, depth+1, preprocessor,
  266. pretty, newl, indent)
  267. if cdata is not None:
  268. content_handler.characters(cdata)
  269. if pretty and children:
  270. content_handler.ignorableWhitespace(depth * indent)
  271. content_handler.endElement(key)
  272. if pretty and depth:
  273. content_handler.ignorableWhitespace(newl)
  274. def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
  275. **kwargs):
  276. """Emit an XML document for the given `input_dict` (reverse of `parse`).
  277. The resulting XML document is returned as a string, but if `output` (a
  278. file-like object) is specified, it is written there instead.
  279. Dictionary keys prefixed with `attr_prefix` (default=`'@'`) are interpreted
  280. as XML node attributes, whereas keys equal to `cdata_key`
  281. (default=`'#text'`) are treated as character data.
  282. The `pretty` parameter (default=`False`) enables pretty-printing. In this
  283. mode, lines are terminated with `'\n'` and indented with `'\t'`, but this
  284. can be customized with the `newl` and `indent` parameters.
  285. """
  286. if full_document and len(input_dict) != 1:
  287. raise ValueError('Document must have exactly one root.')
  288. must_return = False
  289. if output is None:
  290. output = StringIO()
  291. must_return = True
  292. content_handler = XMLGenerator(output, encoding)
  293. if full_document:
  294. content_handler.startDocument()
  295. for key, value in input_dict.items():
  296. _emit(key, value, content_handler, full_document=full_document,
  297. **kwargs)
  298. if full_document:
  299. content_handler.endDocument()
  300. if must_return:
  301. value = output.getvalue()
  302. try: # pragma no cover
  303. value = value.decode(encoding)
  304. except AttributeError: # pragma no cover
  305. pass
  306. return value
  307. if __name__ == '__main__': # pragma: no cover
  308. import sys
  309. import marshal
  310. (item_depth,) = sys.argv[1:]
  311. item_depth = int(item_depth)
  312. def handle_item(path, item):
  313. marshal.dump((path, item), sys.stdout)
  314. return True
  315. try:
  316. root = parse(sys.stdin,
  317. item_depth=item_depth,
  318. item_callback=handle_item,
  319. dict_constructor=dict)
  320. if item_depth == 0:
  321. handle_item([], root)
  322. except KeyboardInterrupt:
  323. pass