odmlparser.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. #!/usr/bin/env python
  2. """
  3. A generic odML parsing module. It parses odML files and documents.
  4. All supported formats can be found in parser_utils.SUPPORTED_PARSERS.
  5. """
  6. import datetime
  7. import json
  8. import sys
  9. import warnings
  10. from os.path import basename
  11. import yaml
  12. from . import xmlparser
  13. from .dict_parser import DictWriter, DictReader
  14. from ..info import FORMAT_VERSION
  15. from .parser_utils import ParserException
  16. from .parser_utils import SUPPORTED_PARSERS
  17. from .rdf_converter import RDFReader, RDFWriter
  18. from ..validation import Validation
# Python 2/3 compatibility shim: keep the builtin ``unicode`` where the name
# exists and fall back to ``str`` where looking it up raises NameError, so the
# rest of the module can call ``unicode(...)`` on either interpreter.
try:
    unicode = unicode
except NameError:
    unicode = str
  23. class ODMLWriter:
  24. """
  25. A generic odML document writer for JSON, XML, YAML and RDF.
  26. The output format is specified on init.
  27. Usage:
  28. xml_writer = ODMLWriter(parser='XML')
  29. xml_writer.write_file(odml_document, filepath)
  30. """
  31. def __init__(self, parser='XML'):
  32. self.parsed_doc = None # Python dictionary object equivalent
  33. parser = parser.upper()
  34. if parser not in SUPPORTED_PARSERS:
  35. raise NotImplementedError("'%s' odML parser does not exist!" % parser)
  36. self.parser = parser
  37. def write_file(self, odml_document, filename):
  38. """
  39. Writes an odml.Document to a file using the format
  40. defined in the ODMLWriter.parser property. Supported formats are
  41. JSON, XML, YAML and RDF.
  42. Will raise a ParserException if the odml.Document is not valid.
  43. :param odml_document: odml.Document.
  44. :param filename: path and filename of the output file.
  45. """
  46. # Write document only if it does not contain validation errors.
  47. validation = Validation(odml_document)
  48. msg = ""
  49. for err in validation.errors:
  50. if err.is_error:
  51. # msg += "\n\t- %s %s: %s" % (err.obj, err.rank, err.msg)
  52. msg += "\n- %s" % err
  53. if msg != "":
  54. msg = "Resolve document validation errors before saving %s" % msg
  55. raise ParserException(msg)
  56. report = validation.report()
  57. if report:
  58. msg += "The saved Document contains unresolved issues."
  59. msg += " Run the Documents 'validate' method to access them.\n%s" % report
  60. warnings.warn(msg)
  61. with open(filename, 'w') as file:
  62. # Add XML header to support odML stylesheets.
  63. if self.parser == 'XML':
  64. file.write(xmlparser.XMLWriter.header)
  65. file.write(self.to_string(odml_document))
  66. def to_string(self, odml_document):
  67. """
  68. Parses an odml.Document to a string in the file format
  69. defined in the ODMLWriter.parser property. Supported formats are
  70. JSON, XML, YAML and RDF.
  71. :param odml_document: odml.Document.
  72. :return: string containing the content of the odml.Document in the
  73. specified format.
  74. """
  75. string_doc = ''
  76. if self.parser == 'XML':
  77. string_doc = unicode(xmlparser.XMLWriter(odml_document))
  78. elif self.parser == "RDF":
  79. # Use XML as default output format for now.
  80. string_doc = RDFWriter(odml_document).get_rdf_str("xml")
  81. else:
  82. self.parsed_doc = DictWriter().to_dict(odml_document)
  83. odml_output = {'Document': self.parsed_doc,
  84. 'odml-version': FORMAT_VERSION}
  85. if self.parser == 'YAML':
  86. yaml.add_representer(datetime.time, yaml_time_serializer)
  87. string_doc = yaml.dump(odml_output, default_flow_style=False)
  88. elif self.parser == 'JSON':
  89. string_doc = json.dumps(odml_output, indent=4,
  90. cls=JSONDateTimeSerializer)
  91. if sys.version_info.major < 3:
  92. string_doc = string_doc.encode("utf-8")
  93. return string_doc
  94. def yaml_time_serializer(dumper, data):
  95. """
  96. This function is required to serialize datetime.time as string objects
  97. when working with YAML as output format.
  98. """
  99. return dumper.represent_scalar('tag:yaml.org,2002:str', str(data))
  100. class JSONDateTimeSerializer(json.JSONEncoder):
  101. """
  102. Required to serialize datetime objects as string objects when working with JSON
  103. as output format.
  104. """
  105. def default(self, o):
  106. if isinstance(o, (datetime.datetime, datetime.date, datetime.time)):
  107. return str(o)
  108. return json.JSONEncoder.default(self, o)
  109. class ODMLReader:
  110. """
  111. A reader to parse odML files or strings into odml documents,
  112. based on the given data exchange format, like XML, YAML, JSON or RDF.
  113. Usage:
  114. yaml_odml_doc = ODMLReader(parser='YAML').from_file("odml_doc.yaml")
  115. json_odml_doc = ODMLReader(parser='JSON').from_file("odml_doc.json")
  116. """
  117. def __init__(self, parser='XML', show_warnings=True):
  118. """
  119. :param parser: odml parser; supported are 'XML', 'JSON', 'YAML' and 'RDF'.
  120. :param show_warnings: Toggle whether to print warnings to the command line.
  121. """
  122. self.doc = None # odML document
  123. self.parsed_doc = None # Python dictionary object equivalent
  124. parser = parser.upper()
  125. if parser not in SUPPORTED_PARSERS:
  126. raise NotImplementedError("'%s' odML parser does not exist!" % parser)
  127. self.parser = parser
  128. self.show_warnings = show_warnings
  129. self.warnings = []
  130. def _validation_warning(self):
  131. report = Validation(self.doc).report()
  132. if report:
  133. msg = "The loaded Document contains unresolved issues."
  134. msg += " Run the Documents 'validate' method to access them.\n%s" % report
  135. warnings.warn(msg)
  136. def from_file(self, file, doc_format=None):
  137. """
  138. Loads an odML document from a file. The ODMLReader.parser specifies the
  139. input file format. If the input file is an RDF file, the specific RDF format
  140. has to be provided as well.
  141. Available RDF formats: 'xml', 'n3', 'turtle', 'nt', 'pretty-xml',
  142. 'trix', 'trig', 'nquads'.
  143. :param file: file path to load an odML document from.
  144. :param doc_format: Required for RDF files only and provides the specific format
  145. of an RDF file.
  146. :return: parsed odml.Document
  147. """
  148. if self.parser == 'XML':
  149. par = xmlparser.XMLReader(ignore_errors=True,
  150. show_warnings=self.show_warnings)
  151. self.warnings = par.warnings
  152. self.doc = par.from_file(file)
  153. # Print validation warnings after parsing
  154. if self.show_warnings:
  155. self._validation_warning()
  156. return self.doc
  157. if self.parser == 'YAML':
  158. with open(file) as yaml_data:
  159. try:
  160. yaml.SafeLoader.add_constructor("tag:yaml.org,2002:python/unicode",
  161. unicode_loader_constructor)
  162. self.parsed_doc = yaml.safe_load(yaml_data)
  163. except yaml.parser.ParserError as err:
  164. print(err)
  165. return None
  166. par = DictReader(ignore_errors=True,
  167. show_warnings=self.show_warnings)
  168. self.doc = par.to_odml(self.parsed_doc)
  169. # Provide original file name via the in memory document
  170. self.doc.origin_file_name = basename(file)
  171. # Print validation warnings after parsing
  172. if self.show_warnings:
  173. self._validation_warning()
  174. return self.doc
  175. if self.parser == 'JSON':
  176. with open(file) as json_data:
  177. try:
  178. self.parsed_doc = json.load(json_data)
  179. except ValueError as err: # Python 2 does not support JSONDecodeError
  180. print("JSON Decoder Error: %s" % err)
  181. return None
  182. par = DictReader(show_warnings=self.show_warnings)
  183. self.doc = par.to_odml(self.parsed_doc)
  184. # Provide original file name via the in memory document
  185. self.doc.origin_file_name = basename(file)
  186. # Print validation warnings after parsing
  187. if self.show_warnings:
  188. self._validation_warning()
  189. return self.doc
  190. if self.parser == 'RDF':
  191. if not doc_format:
  192. raise ValueError("Format of the rdf file was not specified")
  193. # Importing from an RDF graph can return multiple documents
  194. self.doc = RDFReader().from_file(file, doc_format)
  195. for doc in self.doc:
  196. report = Validation(doc).report()
  197. if report:
  198. msg = "The loaded Document contains unresolved issues."
  199. msg += " Run the Documents 'validate' method to access them.\n%s" % report
  200. warnings.warn(msg)
  201. return self.doc
  202. def from_string(self, string, doc_format=None):
  203. """
  204. Loads an odML document from a string object. The ODMLReader.parser specifies the
  205. input file format. If the input string contains an RDF format,
  206. the specific RDF format has to be provided as well.
  207. Available RDF formats: 'xml', 'n3', 'turtle', 'nt', 'pretty-xml',
  208. 'trix', 'trig', 'nquads'.
  209. :param string: file path to load an odML document from.
  210. :param doc_format: Required for RDF files only and provides the specific format
  211. of an RDF file.
  212. :return: parsed odml.Document
  213. """
  214. if self.parser == 'XML':
  215. self.doc = xmlparser.XMLReader().from_string(string)
  216. # Print validation warnings after parsing
  217. if self.show_warnings:
  218. self._validation_warning()
  219. return self.doc
  220. if self.parser == 'YAML':
  221. try:
  222. self.parsed_doc = yaml.safe_load(string)
  223. except yaml.parser.ParserError as err:
  224. print(err)
  225. return
  226. self.doc = DictReader().to_odml(self.parsed_doc)
  227. # Print validation warnings after parsing
  228. if self.show_warnings:
  229. self._validation_warning()
  230. return self.doc
  231. if self.parser == 'JSON':
  232. try:
  233. self.parsed_doc = json.loads(string)
  234. except ValueError as err: # Python 2 does not support JSONDecodeError
  235. print("JSON Decoder Error: %s" % err)
  236. return
  237. self.doc = DictReader().to_odml(self.parsed_doc)
  238. # Print validation warnings after parsing
  239. if self.show_warnings:
  240. self._validation_warning()
  241. return self.doc
  242. if self.parser == 'RDF':
  243. if not doc_format:
  244. raise ValueError("Format of the rdf file was not specified")
  245. # Importing from an RDF graph can return multiple documents
  246. self.doc = RDFReader().from_string(string, doc_format)
  247. for doc in self.doc:
  248. report = Validation(doc).report()
  249. if report:
  250. msg = "The loaded Document contains unresolved issues."
  251. msg += " Run the Documents 'validate' method to access them.\n%s" % report
  252. warnings.warn(msg)
  253. return self.doc
  254. # Needed only for < Python 3
  255. def unicode_loader_constructor(_, node):
  256. """
  257. Constructor for PyYAML to load unicode characters
  258. """
  259. return node.value