# Source code for datconv.writers.dcxpaths

# -*- coding: utf-8 -*-
"""This module implements Datconv Writer which generates list of fields (tags that have text) in scanned document.
This is a helper Writer: it generates a text file that can be used as a configuration file (list of columns) for the CSV Writer. It may also be helpful if you only want to extract (e.g. to compare) the structure of an XML file.

| Format of output file is following:
| **Field Name**, **Record Name**, **XPath**, **Default Value**, **Type**
| where:
| **Field Name** - the name of tag with text
| **Record Name** - the name of record (root tag) in which this field is contained
| **XPath** - path to tag inside XML structure starting from record root (but not containing record name) in the form of XPath expression
| **Default Value** - placeholder to place default value, this writer leaves it empty
| **Type** - type of data guessed from data (if add_type option is set)

Generated entries are unique and sorted by Record Name and XPath.
Supports connectors of type: STRING, LIST, ITERABLE.
"""

# In Python 2.7 only
from __future__ import print_function

# Standard Python Libs
import csv

# Libs installed using pip
from lxml import etree

#Datconv classes
from datconv.outconn import STRING, LIST, ITERABLE

Log = None
"""Log variable is automatically set by main pandoc script using logging.getLogger method.
Use it for logging messages in need.
"""

class DCWriter:
    """Datconv Writer that collects the unique fields (XPaths) found in scanned records.

    Records are inspected one by one in writeRecord; the collected list is
    written out, sorted by record type and XPath, in writeFooter.
    Please see constructor description for more details.
    """

    def __init__(self, simple_xpath = False, ignore_rectyp = False, ignore_xpath = False, ignore_attr = False, add_header = True, add_type = False, rectyp_separator = '_', colno = 0):
        """Constructor parameters are usually passed from YAML file as subkeys of Writer:CArg key.
        
        :param simple_xpath: if True, Writer generates xpaths relative to record tag, and will not generate separate fields for replicated data (repeated tags; arrays) and not generate fields for tag's attributes. The same setting must be applied in CSV Writer if it uses configuration file generated by this Writer.
        :param ignore_rectyp: if True, Writer join fields with the same name contained in different records. Generated Field Name does not contain record name prefix, and in place of record name '*' is placed.
        :param ignore_xpath: if True, Writer join fields with the same name contained in different paths of XML structure. Generated XPath is in form './/FieldName'.
        :param ignore_attr: if True, Writer will not generate fields for XML attributes. If simple_xpath is True, this option is automatically set to True.
        :param add_header: add header as first line of output.
        :param add_type: add data type information guessed from data.
        :param rectyp_separator: separator in generated column name between record type and column name (has effect if ignore_rectyp = false).
        :param colno: this parameter is for interface compatibility reason, it has no meaning in this class.
        
        For more detailed descriptions see :ref:`conf_template.yaml <writers_conf_template>` file in this module folder.
        """
        # Sanity check: the module-level Log must have been assigned before
        # a Writer instance is created.
        assert Log is not None

        self._out = None
        self._out_flags = 0
        self._writers = []
        self._xpaths = None
        self._simple_xpath = simple_xpath
        self._ign_rectyp = ignore_rectyp
        self._ign_xpath = ignore_xpath
        # Simple xpaths never describe attributes, so simple_xpath implies ignore_attr.
        self._ignore_attr = (simple_xpath or ignore_attr)
        self._add_header = add_header
        self._add_type = add_type
        # Integers outside [_min_int, _max_int] are reported as 'str' by _guessType.
        self._max_int = 2**64 - 1
        self._min_int = - 2**63
        self._sep = rectyp_separator

    def setOutput(self, out):
        """Attach output connector and forget all xpaths collected so far.

        :param out: output connector; for STRING connectors one csv writer is created per stream returned by getStreams(), for LIST/ITERABLE connectors rows are pushed as list objects.
        :raises Exception: if connector declares LIST or ITERABLE interface but does not accept list objects.
        """
        self._writers = []
        self._out = None
        self._out_flags = out.supportedInterfases()
        if self._out_flags & STRING:
            self._writers = [csv.writer(stream, lineterminator='\n') for stream in out.getStreams()]
        if self._out_flags & (LIST | ITERABLE):
            if not out.tryObject(list()):
                raise Exception('Incompatible OutConnector used, dcxpaths Writer requires that connector supports list objects')
            self._out = out
        self.resetXPaths()

    def writeHeader(self, header):
        """Write column names row if add_header option is set; *header* argument is not used."""
        if self._add_header:
            self._writeRow(self.getHeader())

    def writeFooter(self, footer):
        """Write all collected fields sorted by record type and XPath; *footer* argument is not used."""
        for rtype in sorted(self._xpaths):
            fields = self._xpaths[rtype]
            for xpath in sorted(fields):
                name, dtype = fields[xpath]
                self._writeRow([name, rtype, xpath, None, dtype])

    def getHeader(self):
        """Return list of output column names."""
        return ['#ColumnName', 'RecordType', 'XPath', 'Default', 'DataType']

    def getFooter(self):
        """This Writer has no footer object; always returns None."""
        return None

    def writeRecord(self, record):
        """Scan *record* for not yet known fields; nothing is written at this stage."""
        self.checkXPath(record)
        return None

    # Helper Functions
    def checkXPath(self, record, ret_new = False):
        """Helper function - it scans record and finds new (not already known) xpaths to add to output. 
        
        Depending on constructor simple_xpath parameter it calls either _checkXPathSimple or _checkXPath.

        :param record: record root element to scan.
        :param ret_new: if True, returns list of newly found fields (see _checkXPath); otherwise returns None.
        """
        if self._simple_xpath:
            return self._checkXPathSimple(record, ret_new)
        return self._checkXPath(record, ret_new)

    # Internal Functions
    def _addField(self, rectyp, xpath, name, dtype, new_col):
        """Internal function - remember field under *rectyp* unless its *xpath* is already known.

        Newly remembered field is also appended to *new_col* as
        [name, rectyp, xpath, None, dtype] (the same layout as output row).
        """
        known = self._xpaths.setdefault(rectyp, {})
        if xpath in known:
            return
        known[xpath] = [name, dtype]
        new_col.append([name, rectyp, xpath, None, dtype])

    def _checkXPath(self, record, ret_new):
        """Internal function - it scans record and finds new (not already known) xpaths to add to output.
        
        This variant generates XPaths with attributes (if not excluded by parameters) and replications (arrays).
        
        :param ret_new: if True, returns found new xpaths as list of 5 elements' lists (field name, record name, xpath, None, data type); if this parameter is False or no new xpaths are found in current record, function returns None.
        
        Function typically called in Writer.writeRecord.
        """
        new_col = []
        tree = etree.ElementTree(record)
        if self._ign_rectyp:
            rectyp, prefix = '*', ''
        else:
            rectyp, prefix = record.tag, record.tag + self._sep
        for tag in record.iter():
            # lxml also yields comments and processing instructions here; their
            # .tag is a factory function, not a name, so they can not be fields.
            if callable(tag.tag):
                continue
            attrs = [] if self._ignore_attr else list(tag.keys())
            if not tag.text and not attrs:
                continue
            xpath = tree.getpath(tag)
            if self._ign_xpath:
                # Keep only the last path step.
                xpath = '//' + xpath.split('/')[-1]
            elif self._ign_rectyp:
                # Path starts with '/', so index 1 is its first step.
                xplist = xpath.split('/')
                xplist[1] = '*'
                xpath = '/'.join(xplist)

            if tag.text:
                dtype = self._guessType(tag.text) if self._add_type else None
                self._addField(rectyp, xpath, prefix + tag.tag, dtype, new_col)
            for attr in attrs:
                dtype = self._guessType(tag.get(attr)) if self._add_type else None
                self._addField(rectyp, xpath + '/@' + attr, prefix + tag.tag + self._sep + attr, dtype, new_col)
        if ret_new and new_col:
            return new_col
        return None

    def _checkXPathSimple(self, record, ret_new):
        """Internal function - it scans record and finds new (not already known) xpaths to add to output.
        
        This variant generates simple XPaths (without attributes and replications (arrays)).

        :param ret_new: if True, returns found new xpaths as list of 5 elements' lists (field name, record name, xpath, None, data type); if this parameter is False or no new xpaths are found in current record, function returns None.
        
        Function typically called in Writer.writeRecord.
        """
        new_col = []
        if self._ign_rectyp:
            rectyp, prefix = '*', ''
        else:
            rectyp, prefix = record.tag, record.tag + self._sep
        for tag in record.iter():
            # Skip comments / processing instructions (their .tag is callable)
            # and tags without text.
            if callable(tag.tag) or not tag.text:
                continue
            if self._ign_xpath:
                xpath = './/' + tag.tag
            else:
                steps = [tag.tag]
                if tag is not record:
                    # Walk up to (but not including) the record element itself.
                    # Stopping by identity, not by tag name, keeps nested tags
                    # named like the record and ignores whatever is above it.
                    for parent in tag.iterancestors():
                        if parent is record:
                            break
                        steps.append(parent.tag)
                xpath = '/'.join(reversed(steps))
            dtype = self._guessType(tag.text) if self._add_type else None
            self._addField(rectyp, xpath, prefix + tag.tag, dtype, new_col)
        if ret_new and new_col:
            return new_col
        return None

    def resetXPaths(self):
        """Reset class internal structures (found xpaths' list).
        
        Typically called in Writer.setOutput when we are about to read new file.
        """
        self._xpaths = {}
        if self._ign_rectyp:
            self._xpaths['*'] = {}

    def _writeRow(self, line):
        """Internal function - send one output row (list) to every configured output."""
        if self._out_flags & STRING:
            for wri in self._writers:
                wri.writerow(line)
        if self._out_flags & (LIST | ITERABLE):
            self._out.pushObject(line)

    def _guessType(self, val):
        """Internal function - guess data type name of *val*.

        :return: None if val is None; 'int' if int(val) succeeds and result is in range [-2**63, 2**64 - 1]; 'float' if float(val) succeeds; otherwise 'str'.
        """
        if val is None:
            return None
        try:
            number = int(val) #Note: True=>1; False=>0
        except ValueError:
            pass # not an integer, may still be a float
        except TypeError:
            return 'str'
        else:
            if number > self._max_int or number < self._min_int:
                return 'str'
            return 'int'
        try:
            float(val)
        except ValueError:
            return 'str'
        return 'float'