Merge pull request #179 from OpenDataServices/151-xml-to-spreadsheet

edugomez · web-flow · commit aba7ef80b42e · 2017-10-18T14:09:07.000+01:00
Convert XML -> spreadsheet
diff --git a/examples/help/flatten/expected.txt b/examples/help/flatten/expected.txt
@@ -1,7 +1,7 @@
-usage: flatten-tool flatten [-h] [-s SCHEMA] [-f OUTPUT_FORMAT]
-                            [-m MAIN_SHEET_NAME] [-o OUTPUT_NAME]
-                            [--root-list-path ROOT_LIST_PATH] [--rollup]
-                            [-r ROOT_ID] [--use-titles]
+usage: flatten-tool flatten [-h] [-s SCHEMA] [-f OUTPUT_FORMAT] [--xml]
+                            [--id-name ID_NAME] [-m MAIN_SHEET_NAME]
+                            [-o OUTPUT_NAME] [--root-list-path ROOT_LIST_PATH]
+                            [--rollup] [-r ROOT_ID] [--use-titles]
                             input_name
 
 positional arguments:
@@ -14,6 +14,8 @@ optional arguments:
   -f OUTPUT_FORMAT, --output-format OUTPUT_FORMAT
                         Type of template you want to create. Defaults to all
                         available options
+  --xml                 Use XML as the input format
+  --id-name ID_NAME     String to use for the identifier key, defaults to 'id'
   -m MAIN_SHEET_NAME, --main-sheet-name MAIN_SHEET_NAME
                         The name of the main sheet, as seen in the first tab
                         of the spreadsheet for example. Defaults to main
diff --git a/flattentool/__init__.py b/flattentool/__init__.py
@@ -41,7 +41,8 @@ def spreadsheet_output(spreadsheet_output_class, name):
         raise Exception('The requested format is not available')
 
 
-def flatten(input_name, schema=None, output_name='flattened', output_format='all', main_sheet_name='main', root_list_path='main', rollup=False, root_id=None, use_titles=False, **_):
+def flatten(input_name, schema=None, output_name='flattened', output_format='all', main_sheet_name='main',
+            root_list_path='main', rollup=False, root_id=None, use_titles=False, xml=False, id_name='id',  **_):
     """
     Flatten a nested structure (JSON) to a flat structure (spreadsheet - csv or xlsx).
 
@@ -61,7 +62,9 @@ def flatten(input_name, schema=None, output_name='flattened', output_format='all
         root_list_path=root_list_path,
         schema_parser=schema_parser,
         root_id=root_id,
-        use_titles=use_titles)
+        use_titles=use_titles,
+        xml=xml,
+        id_name=id_name)
     parser.parse()
 
     def spreadsheet_output(spreadsheet_output_class, name):
diff --git a/flattentool/cli.py b/flattentool/cli.py
@@ -68,6 +68,13 @@ def create_parser():
     parser_flatten.add_argument(
         "-f", "--output-format",
         help="Type of template you want to create. Defaults to all available options")
+    parser_flatten.add_argument(
+        "--xml",
+        action='store_true',
+        help="Use XML as the input format")
+    parser_flatten.add_argument(
+        "--id-name",
+        help="String to use for the identifier key, defaults to 'id'")
     parser_flatten.add_argument(
         "-m", "--main-sheet-name",
         help="The name of the main sheet, as seen in the first tab of the spreadsheet for example. Defaults to main")
diff --git a/flattentool/json_input.py b/flattentool/json_input.py
@@ -15,6 +15,7 @@
 from flattentool.sheet import Sheet
 from warnings import warn
 import codecs
+import xmltodict
 
 BASIC_TYPES = [six.text_type, bool, int, Decimal, type(None)]
 
@@ -45,12 +46,15 @@ class JSONParser(object):
     # Named for consistency with schema.SchemaParser, but not sure it's the most appropriate name.
     # Similarily with methods like parse_json_dict
 
-    def __init__(self, json_filename=None, root_json_dict=None, schema_parser=None, root_list_path=None, root_id='ocid', use_titles=False):
+    def __init__(self, json_filename=None, root_json_dict=None, schema_parser=None, root_list_path=None,
+                 root_id='ocid', use_titles=False, xml=False, id_name='id'):
         self.sub_sheets = {}
         self.main_sheet = Sheet()
         self.root_list_path = root_list_path
         self.root_id = root_id
         self.use_titles = use_titles
+        self.id_name = id_name
+        self.xml = xml
         if schema_parser:
             self.main_sheet = schema_parser.main_sheet
             self.sub_sheets = schema_parser.sub_sheets
@@ -60,6 +64,18 @@ def __init__(self, json_filename=None, root_json_dict=None, schema_parser=None,
         else:
             self.rollup = False
 
+        if self.xml:
+            with codecs.open(json_filename, 'rb') as xml_file:
+                top_dict = xmltodict.parse(
+                    xml_file,
+                    force_list=(root_list_path,),
+                    force_cdata=True,
+                    )
+                # AFAICT, this should be true for *all* XML files
+                assert len(top_dict) == 1
+                root_json_dict = list(top_dict.values())[0]
+            json_filename = None
+
         if json_filename is None and root_json_dict is None:
             raise ValueError('Etiher json_filename or root_json_dict must be supplied')
 
@@ -81,6 +97,10 @@ def parse(self):
         else:
             root_json_list = path_search(self.root_json_dict, self.root_list_path.split('/'))
         for json_dict in root_json_list:
+            if json_dict is None:
+                # This is particularly useful for IATI XML, in order to not
+                # fall over on an empty activity, e.g. <iati-activity/>
+                continue
             self.parse_json_dict(json_dict, sheet=self.main_sheet)
     
     def parse_json_dict(self, json_dict, sheet, json_key=None, parent_name='', flattened_dict=None, parent_id_fields=None, top_level_of_sub_sheet=False):
@@ -109,17 +129,24 @@ def parse_json_dict(self, json_dict, sheet, json_key=None, parent_name='', flatt
         if top_level_of_sub_sheet:
             # Only add the IDs for the top level of object in an array
             for k, v in parent_id_fields.items():
-                flattened_dict[sheet_key(sheet, k)] = v
+                if self.xml:
+                    flattened_dict[sheet_key(sheet, k)] = v['#text']
+                else:
+                    flattened_dict[sheet_key(sheet, k)] = v
 
         if self.root_id and self.root_id in json_dict:
             parent_id_fields[sheet_key(sheet, self.root_id)] = json_dict[self.root_id]
 
-        if 'id' in json_dict:
-            parent_id_fields[sheet_key(sheet, parent_name+'id')] = json_dict['id']
+        if self.id_name in json_dict:
+            parent_id_fields[sheet_key(sheet, parent_name+self.id_name)] = json_dict[self.id_name]
 
 
         for key, value in json_dict.items():
             if type(value) in BASIC_TYPES:
+                if self.xml and key == '#text':
+                    # Handle the text output from xmltodict
+                    key = ''
+                    parent_name = parent_name.strip('/')
                 flattened_dict[sheet_key(sheet, parent_name+key)] = value
             elif hasattr(value, 'items'):
                 self.parse_json_dict(
diff --git a/flattentool/tests/fixtures/empty.xml b/flattentool/tests/fixtures/empty.xml
@@ -0,0 +1 @@
+<iati-activities><iati-activity/></iati-activities>
diff --git a/flattentool/tests/test_roundtrip.py b/flattentool/tests/test_roundtrip.py
@@ -1,8 +1,9 @@
 from flattentool import unflatten, flatten
 import json
-import pytest
 import sys
 import os
+import xmltodict
+import pytest
 
 
 @pytest.mark.parametrize('output_format', ['xlsx', 'csv'])
@@ -93,3 +94,30 @@ def test_roundtrip_360_rollup(tmpdir, use_titles):
     original_json = json.load(open(input_name))
     roundtripped_json = json.load(tmpdir.join('roundtrip.json'))
     assert original_json == roundtripped_json
+
+
+@pytest.mark.parametrize('output_format', ['xlsx', 'csv'])
+def test_roundtrip_xml(tmpdir, output_format):
+    input_name = 'examples/iati/expected.xml'
+    flatten(
+        input_name=input_name,
+        output_name=tmpdir.join('flattened').strpath+'.'+output_format,
+        output_format=output_format,
+        root_list_path='iati-activity',
+        id_name='iati-identifier',
+        xml=True)
+    unflatten(
+        input_name=tmpdir.join('flattened').strpath+'.'+output_format,
+        output_name=tmpdir.join('roundtrip.xml').strpath,
+        input_format=output_format,
+        root_list_path='iati-activity',
+        id_name='iati-identifier',
+        xml=True)
+    original_xml = open(input_name, 'rb')
+    roundtripped_xml = tmpdir.join('roundtrip.xml').open('rb')
+
+    # Compare without ordering, by using dict_constructor=dict instead of
+    # OrderedDict
+    original = xmltodict.parse(original_xml, dict_constructor=dict)
+    roundtripped = xmltodict.parse(roundtripped_xml, dict_constructor=dict)
+    assert original == roundtripped
diff --git a/flattentool/tests/test_xml_input.py b/flattentool/tests/test_xml_input.py
@@ -0,0 +1,45 @@
+from flattentool.json_input import JSONParser
+
+def test_xml_empty():
+    parser = JSONParser(
+        json_filename='flattentool/tests/fixtures/empty.xml',
+        root_list_path='iati-activity',
+        schema_parser=None,
+        root_id='',
+        xml=True,
+        id_name='iati-identifier')
+    parser.parse()
+    assert list(parser.main_sheet) == []
+    assert parser.main_sheet.lines == []
+    assert parser.sub_sheets == {}
+
+
+def test_xml_basic_example():
+    parser = JSONParser(
+        json_filename='examples/iati/expected.xml',
+        root_list_path='iati-activity',
+        schema_parser=None,
+        root_id='',
+        xml=True,
+        id_name='iati-identifier')
+    parser.parse()
+    assert list(parser.main_sheet) == ['iati-identifier', 'reporting-org/@ref', 'reporting-org/@type', 'reporting-org/narrative', 'participating-org/@ref', 'participating-org/@role', 'activity-status/@code', 'activity-date/@iso-date', 'activity-date/@type', 'title/narrative', 'description/narrative']
+    assert parser.main_sheet.lines == [
+        {'activity-date/@type': '1', 'reporting-org/narrative': 'Organisation name', 'participating-org/@ref': 'AA-AAA-123456789', 'title/narrative': 'A title', 'participating-org/@role': '1', 'reporting-org/@ref': 'AA-AAA-123456789', 'iati-identifier': 'AA-AAA-123456789-ABC123', 'reporting-org/@type': '40', 'description/narrative': 'A description', 'activity-date/@iso-date': '2011-10-01', 'activity-status/@code': '2'},
+        {'activity-date/@type': '2', 'reporting-org/narrative': 'Organisation name', 'participating-org/@ref': 'AA-AAA-123456789', 'title/narrative': 'Another title', 'participating-org/@role': '1', 'reporting-org/@ref': 'AA-AAA-123456789', 'iati-identifier': 'AA-AAA-123456789-ABC124', 'reporting-org/@type': '40', 'description/narrative': 'Another description', 'activity-date/@iso-date': '2016-01-01', 'activity-status/@code': '3'}
+    ]
+    assert set(parser.sub_sheets.keys()) == set(['transaction', 'recipient-country'])
+    assert list(parser.sub_sheets['transaction']) == ['iati-identifier', 'transaction/0/transaction-type/@code', 'transaction/0/transaction-date/@iso-date', 'transaction/0/value/@value-date', 'transaction/0/value']
+    assert parser.sub_sheets['transaction'].lines == [
+       {'transaction/0/value/@value-date': '2012-01-01', 'iati-identifier': 'AA-AAA-123456789-ABC123', 'transaction/0/transaction-date/@iso-date': '2012-01-01', 'transaction/0/value': '10', 'transaction/0/transaction-type/@code': '2'},
+       {'transaction/0/value/@value-date': '2012-03-03', 'iati-identifier': 'AA-AAA-123456789-ABC123', 'transaction/0/transaction-date/@iso-date': '2012-03-03', 'transaction/0/value': '20', 'transaction/0/transaction-type/@code': '3'},
+       {'transaction/0/value/@value-date': '2013-04-04', 'iati-identifier': 'AA-AAA-123456789-ABC124', 'transaction/0/transaction-date/@iso-date': '2013-04-04', 'transaction/0/value': '30', 'transaction/0/transaction-type/@code': '2'},
+       {'transaction/0/value/@value-date': '2013-05-05', 'iati-identifier': 'AA-AAA-123456789-ABC124', 'transaction/0/transaction-date/@iso-date': '2013-05-05', 'transaction/0/value': '40', 'transaction/0/transaction-type/@code': '3'}
+    ]
+    assert list(parser.sub_sheets['recipient-country']) == ['iati-identifier', 'recipient-country/0/@code', 'recipient-country/0/@percentage']
+    assert parser.sub_sheets['recipient-country'].lines == [
+        {'iati-identifier': 'AA-AAA-123456789-ABC123', 'recipient-country/0/@code': 'AF', 'recipient-country/0/@percentage': '40'},
+        {'iati-identifier': 'AA-AAA-123456789-ABC123', 'recipient-country/0/@code': 'XK', 'recipient-country/0/@percentage': '60'},
+        {'iati-identifier': 'AA-AAA-123456789-ABC124', 'recipient-country/0/@code': 'AG', 'recipient-country/0/@percentage': '30'},
+        {'iati-identifier': 'AA-AAA-123456789-ABC124', 'recipient-country/0/@code': 'XK', 'recipient-country/0/@percentage': '70'}
+    ]
diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 import sys
 
-install_requires = ['jsonref', 'schema', 'openpyxl>=2,<2.4', 'six', 'pytz']
+install_requires = ['jsonref', 'schema', 'openpyxl>=2,<2.4', 'six', 'pytz', 'xmltodict']
 
 if sys.version < '3':
     install_requires.append('unicodecsv')

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+<iati-activities><iati-activity/></iati-activities>`