# -*- coding: utf-8 -*-
"""This module implements Datconv Reader which reads data from XML file."""
# Standard Python Libs
import xml.sax as sax
import logging
# Libs installed using pip
from lxml import etree # http://lxml.de/tutorial.html
# Datconv generic modules
from datconv.filters import WRITE, REPEAT, BREAK
####################################################################
# Module-level logger handle. It starts out as None and must be assigned from
# outside before a DCReader is constructed (DCReader.__init__ asserts that it
# is not None); ContentGenerator calls Log.info / Log.error / Log.isEnabledFor.
# NOTE(review): the text below says "pandoc script"; presumably the Datconv
# main script is meant - confirm before changing the wording.
Log = None
"""Log variable is automatically set by main pandoc script using logging.getLogger method.
Use it for logging messages in need.
"""
class FilterBreak(Exception):
    """Raised to abort parsing when a Filter asks the Reader to stop.

    ContentGenerator.endElement raises it when the filter result carries the
    BREAK flag; DCReader.Process catches it and returns normally.
    """
    pass
class ToLimitBreak(Exception):
    """Raised to abort parsing once the configured record limit is exceeded.

    ContentGenerator.startElement raises it when a record number greater than
    ``rto`` is reached; DCReader.Process catches it and returns normally.
    """
    pass
####################################################################
class ContentGenerator(sax.handler.ContentHandler):
    """Handle the XML events emitted by a parser created with xml.sax.make_parser().

    It implements most of the functionality of this XML Reader: brace/header
    tags are collected into the header list, footer tags into the footer list,
    and every record is rebuilt as an lxml element tree that is handed to the
    optional Filter and then to the Writer.
    See the documentation of the base class for the meaning of the overridden
    methods.
    """

    def __init__(self, bratags, headtags, rectags, foottags, wri, flt=None, lp_step=0, rfrom=1, rto=0):
        """See description of DCReader constructor and Process() method for meaning of most parameters.

        :param wri: Writer object; its writeHeader, writeRecord and writeFooter methods are called.
        :param flt: optional Filter object; filterRecord is called for every record,
            setHeader / setFooter are called only if the filter defines them.
        :param lp_step: progress is logged every ``lp_step`` records (0 disables it).
        :param rfrom: number of the first record that is processed (records are counted from 1).
        :param rto: number of the last record that is processed (0 means no limit).
        """
        sax.handler.ContentHandler.__init__(self)
        self._btags = bratags
        self._htags = headtags
        self._rtags = rectags
        self._ftags = foottags
        self._wri = wri
        self._flt = flt
        self._lp_step = lp_step
        self._rfrom = rfrom
        self._rto = rto
        self._bs = None          # root element of the record being built (None outside records)
        self._curtag = None      # element that currently receives children / text
        self._header = []
        self._footer = []
        self._header_read = False
        self._recno = 0
        self._lp_rec = 0
        self._skip_depth = 0     # >0 while inside a record skipped because of rfrom

    # Private helpers
    @staticmethod
    def _tag_info(name, attrs, brace=None):
        """Return a dict describing a brace/header/footer tag and its attributes."""
        info = {'_tag_': name}
        if brace is not None:
            info['_bra_'] = brace
        info.update(attrs.items())
        return info

    @staticmethod
    def _copy_attrs(element, attrs):
        """Copy all SAX attributes onto the given lxml element."""
        for (aname, avalue) in attrs.items():
            element.set(aname, avalue)

    def _flush_header(self):
        """Pass the collected header to Filter and Writer (done at most once)."""
        if self._header_read:
            return
        # OBLIGATORY
        if self._flt is not None and hasattr(self._flt, 'setHeader'):
            self._flt.setHeader(self._header)
        self._wri.writeHeader(self._header)
        self._header_read = True

    # ContentHandler methods
    def startDocument(self):
        """Reset the record counters and arm progress logging if it is enabled."""
        self._recno = 0
        self._skip_depth = 0
        self._lp_rec = 0
        if self._lp_step > 0 and Log.isEnabledFor(logging.INFO):
            self._lp_rec = self._lp_step

    def endDocument(self):
        """Write the header (if no record triggered it yet) and then the footer.

        Besides being called by the parser, this method is called explicitly
        before FilterBreak / ToLimitBreak is raised, so that the output is
        finished even when parsing is aborted.
        """
        self._flush_header()
        # OBLIGATORY
        if self._flt is not None and hasattr(self._flt, 'setFooter'):
            self._flt.setFooter(self._footer)
        self._wri.writeFooter(self._footer)

    def startElement(self, name, attrs):
        """Classify the opening tag as brace, header, footer, record or record content.

        :raises ToLimitBreak: when the record number exceeds ``rto``.
        """
        if self._skip_depth > 0:
            # Inside a record that lies before rfrom: ignore the whole subtree.
            # Without this guard the children of a skipped record were
            # dispatched again (counted as records when rectags is empty, or
            # collected as footer entries when named like a footer tag).
            self._skip_depth += 1
            return
        if not self._header_read and name in self._btags:
            self._header.append(self._tag_info(name, attrs, brace=True))
        elif not self._header_read and name in self._htags:
            self._header.append(self._tag_info(name, attrs, brace=False))
        elif self._bs is None and name in self._ftags:
            self._footer.append(self._tag_info(name, attrs))
        elif self._bs is None and (name in self._rtags or len(self._rtags) == 0):
            # Start of a record; the first record also closes the header.
            self._flush_header()
            self._recno = self._recno + 1
            if self._recno < self._rfrom:
                self._skip_depth = 1
                return
            if self._rto > 0 and self._recno > self._rto:
                self.endDocument()
                raise ToLimitBreak
            self._bs = etree.Element(name)
            self._curtag = self._bs
            self._copy_attrs(self._curtag, attrs)
        elif self._bs is not None:
            # Tag nested inside the record that is being built.
            if name in self._rtags:
                Log.error('Nested record tag: <%s> in %d record; file will not be interpretted correctly' % (name, self._recno))
            self._curtag = etree.SubElement(self._curtag, name)
            self._copy_attrs(self._curtag, attrs)

    def endElement(self, name):
        """Close the current tag; a finished record goes to Filter and Writer.

        :raises FilterBreak: when the filter result contains the BREAK flag.
        """
        if self._skip_depth > 0:
            self._skip_depth -= 1
            return
        if self._bs is None:
            return
        # NOTE(review): a descendant that has the same name as the record root
        # also matches this test and therefore closes the record early.
        if name != self._bs.tag:
            self._curtag = self._curtag.getparent()
            return
        if self._recno == self._lp_rec:
            Log.info('Processed %d records' % self._recno)
            self._lp_rec = self._lp_rec + self._lp_step
        if self._flt is not None:
            rec = self._bs
            while True:
                # OBLIGATORY
                res = self._flt.filterRecord(rec)
                if res & WRITE:
                    self._wri.writeRecord(rec)
                if res & REPEAT:
                    # The filter wants to see the same record once more.
                    continue
                if res & BREAK:
                    self.endDocument()
                    Log.info('Filter caused Process to stop on record %d' % self._recno)
                    raise FilterBreak
                break
        else:
            # OBLIGATORY
            self._wri.writeRecord(self._bs)
        self._bs = None
        self._curtag = None

    def characters(self, content):
        """Append stripped character data to the text of the current record tag.

        Text outside of records (brace, header and footer tags) is discarded.
        Every chunk delivered by the parser is stripped separately before it
        is appended.
        """
        if self._bs is None:
            return
        content = content.strip()
        if len(content) > 0:
            if self._curtag.text is None:
                self._curtag.text = content
            else:
                self._curtag.text = self._curtag.text + content
####################################################################
class DCReader:
    """This Datconv XML Reader class uses xml.sax parser to read and interpret XML file.

    The parser uses the ContentGenerator class from this module to handle XML events.
    See documentation of standard Python xml.sax library for more information how it works.

    This Reader assumes that the structure of the input XML file is following:

    * there is/are some (one or more) BRACE tag(s);
      entire document content is included in this/those brace tag(s);
      well-formed XML document should have at least one such tag;
    * then there is/are some optional HEAD tag(s);
      head tags begin and end completely before record tags begin;
    * then there are RECORD tags;
      everything that is inside record tags is treated as record data and is being passed to Filter and Writer;
      record tags can not be nested - every record tag must end before another record tag begins;
      there may be several kinds (names) of record tags - in such case we say that we have multiple record types.
      If list of record tags is empty then every tag which is one level under brace tag and which is not head nor foot tag is treated as record tag.
    * then there is/are some optional FOOTER tag(s);
      footer tags begin and end completely after record tags;

    Constructor parameters explicitly list which tags are of what kind.

    TODO: The text inside brace, header and footer tags is discarded (only attributes are passed to Writer).

    TODO: The header tags between record tags are discarded (only ones before first record tag are passed to Writer).

    TODO: This class does not support CDATA inside XML.
    """

    def __init__(self, bratags=None, headtags=None, rectags=None, foottags=None, log_prog_step=0):
        """Parameters are usually passed from YAML file as subkeys of ``Reader:CArg`` key.

        :param bratags: list of tag names that will be treated as brace tags (see above).
        :param headtags: list of tag names that will be treated as header tags (see above).
        :param rectags: list of tag names that will be treated as record tags (see above).
        :param foottags: list of tag names that will be treated as footer tags (see above).
        :param log_prog_step: log info message after this number of records or does not log progress messages if this key is 0 or logging level is set to value higher than INFO.

        An omitted tag list means an empty list; every instance gets its own
        list object, so instances never share (and cannot corrupt) a default.

        For more detailed descriptions see :ref:`readers_conf_template`.
        """
        assert Log is not None
        self._wri = self._flt = None
        self._btags = [] if bratags is None else bratags
        self._htags = [] if headtags is None else headtags
        self._rtags = [] if rectags is None else rectags
        self._ftags = [] if foottags is None else foottags
        self._lp_step = log_prog_step

    # OBLIGATORY
    def setWriter(self, writer):
        """Store the Writer object that Process() passes on to the XML event handler."""
        self._wri = writer

    # OBLIGATORY
    def setFilter(self, filter):
        """Store the Filter object (or None) that Process() passes on to the XML event handler."""
        self._flt = filter

    def Process(self, inpath, outpath=None, rfrom=1, rto=0):
        """Parameters are usually passed from YAML file as subkeys of ``Reader:PArg`` key.

        :param inpath: Path to input file.
        :param outpath: Path to output file passed to Writer (fall-back if output connector is not defined).
            NOTE(review): this argument is not referenced inside this method - confirm where it is consumed.
        :param rfrom-rto: specifies scope of records to be processed.

        For more detailed descriptions see :ref:`readers_conf_template`.
        """
        handler = ContentGenerator(
            bratags=self._btags,
            headtags=self._htags,
            rectags=self._rtags,
            foottags=self._ftags,
            wri=self._wri,
            flt=self._flt,
            lp_step=self._lp_step,
            rfrom=rfrom,
            rto=rto,
        )
        parser = sax.make_parser()
        parser.setContentHandler(handler)
        try:
            parser.parse(inpath)
        except (FilterBreak, ToLimitBreak):
            # Both exceptions are deliberate "stop early" signals; the handler
            # has already written the footer before raising them.
            pass