Skip to content

Commit aba7ef8

Browse files
authored
Merge pull request #179 from OpenDataServices/151-xml-to-spreadsheet
Convert XML -> spreadsheet
2 parents 948f6f0 + c2d1b43 commit aba7ef8

File tree

8 files changed

+125
-12
lines changed

8 files changed

+125
-12
lines changed

examples/help/flatten/expected.txt

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
usage: flatten-tool flatten [-h] [-s SCHEMA] [-f OUTPUT_FORMAT]
2-
[-m MAIN_SHEET_NAME] [-o OUTPUT_NAME]
3-
[--root-list-path ROOT_LIST_PATH] [--rollup]
4-
[-r ROOT_ID] [--use-titles]
1+
usage: flatten-tool flatten [-h] [-s SCHEMA] [-f OUTPUT_FORMAT] [--xml]
2+
[--id-name ID_NAME] [-m MAIN_SHEET_NAME]
3+
[-o OUTPUT_NAME] [--root-list-path ROOT_LIST_PATH]
4+
[--rollup] [-r ROOT_ID] [--use-titles]
55
input_name
66

77
positional arguments:
@@ -14,6 +14,8 @@ optional arguments:
1414
-f OUTPUT_FORMAT, --output-format OUTPUT_FORMAT
1515
Type of template you want to create. Defaults to all
1616
available options
17+
--xml Use XML as the input format
18+
--id-name ID_NAME String to use for the identifier key, defaults to 'id'
1719
-m MAIN_SHEET_NAME, --main-sheet-name MAIN_SHEET_NAME
1820
The name of the main sheet, as seen in the first tab
1921
of the spreadsheet for example. Defaults to main

flattentool/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ def spreadsheet_output(spreadsheet_output_class, name):
4141
raise Exception('The requested format is not available')
4242

4343

44-
def flatten(input_name, schema=None, output_name='flattened', output_format='all', main_sheet_name='main', root_list_path='main', rollup=False, root_id=None, use_titles=False, **_):
44+
def flatten(input_name, schema=None, output_name='flattened', output_format='all', main_sheet_name='main',
45+
root_list_path='main', rollup=False, root_id=None, use_titles=False, xml=False, id_name='id', **_):
4546
"""
4647
Flatten a nested structure (JSON) to a flat structure (spreadsheet - csv or xlsx).
4748
@@ -61,7 +62,9 @@ def flatten(input_name, schema=None, output_name='flattened', output_format='all
6162
root_list_path=root_list_path,
6263
schema_parser=schema_parser,
6364
root_id=root_id,
64-
use_titles=use_titles)
65+
use_titles=use_titles,
66+
xml=xml,
67+
id_name=id_name)
6568
parser.parse()
6669

6770
def spreadsheet_output(spreadsheet_output_class, name):

flattentool/cli.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,13 @@ def create_parser():
6868
parser_flatten.add_argument(
6969
"-f", "--output-format",
7070
help="Type of template you want to create. Defaults to all available options")
71+
parser_flatten.add_argument(
72+
"--xml",
73+
action='store_true',
74+
help="Use XML as the input format")
75+
parser_flatten.add_argument(
76+
"--id-name",
77+
help="String to use for the identifier key, defaults to 'id'")
7178
parser_flatten.add_argument(
7279
"-m", "--main-sheet-name",
7380
help="The name of the main sheet, as seen in the first tab of the spreadsheet for example. Defaults to main")

flattentool/json_input.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from flattentool.sheet import Sheet
1616
from warnings import warn
1717
import codecs
18+
import xmltodict
1819

1920
BASIC_TYPES = [six.text_type, bool, int, Decimal, type(None)]
2021

@@ -45,12 +46,15 @@ class JSONParser(object):
4546
# Named for consistency with schema.SchemaParser, but not sure it's the most appropriate name.
4647
# Similarly with methods like parse_json_dict
4748

48-
def __init__(self, json_filename=None, root_json_dict=None, schema_parser=None, root_list_path=None, root_id='ocid', use_titles=False):
49+
def __init__(self, json_filename=None, root_json_dict=None, schema_parser=None, root_list_path=None,
50+
root_id='ocid', use_titles=False, xml=False, id_name='id'):
4951
self.sub_sheets = {}
5052
self.main_sheet = Sheet()
5153
self.root_list_path = root_list_path
5254
self.root_id = root_id
5355
self.use_titles = use_titles
56+
self.id_name = id_name
57+
self.xml = xml
5458
if schema_parser:
5559
self.main_sheet = schema_parser.main_sheet
5660
self.sub_sheets = schema_parser.sub_sheets
@@ -60,6 +64,18 @@ def __init__(self, json_filename=None, root_json_dict=None, schema_parser=None,
6064
else:
6165
self.rollup = False
6266

67+
if self.xml:
68+
with codecs.open(json_filename, 'rb') as xml_file:
69+
top_dict = xmltodict.parse(
70+
xml_file,
71+
force_list=(root_list_path,),
72+
force_cdata=True,
73+
)
74+
# AFAICT, this should be true for *all* XML files
75+
assert len(top_dict) == 1
76+
root_json_dict = list(top_dict.values())[0]
77+
json_filename = None
78+
6379
if json_filename is None and root_json_dict is None:
6480
raise ValueError('Etiher json_filename or root_json_dict must be supplied')
6581

@@ -81,6 +97,10 @@ def parse(self):
8197
else:
8298
root_json_list = path_search(self.root_json_dict, self.root_list_path.split('/'))
8399
for json_dict in root_json_list:
100+
if json_dict is None:
101+
# This is particularly useful for IATI XML, in order to not
102+
# fall over on an empty activity, e.g. <iati-activity/>
103+
continue
84104
self.parse_json_dict(json_dict, sheet=self.main_sheet)
85105

86106
def parse_json_dict(self, json_dict, sheet, json_key=None, parent_name='', flattened_dict=None, parent_id_fields=None, top_level_of_sub_sheet=False):
@@ -109,17 +129,24 @@ def parse_json_dict(self, json_dict, sheet, json_key=None, parent_name='', flatt
109129
if top_level_of_sub_sheet:
110130
# Only add the IDs for the top level of object in an array
111131
for k, v in parent_id_fields.items():
112-
flattened_dict[sheet_key(sheet, k)] = v
132+
if self.xml:
133+
flattened_dict[sheet_key(sheet, k)] = v['#text']
134+
else:
135+
flattened_dict[sheet_key(sheet, k)] = v
113136

114137
if self.root_id and self.root_id in json_dict:
115138
parent_id_fields[sheet_key(sheet, self.root_id)] = json_dict[self.root_id]
116139

117-
if 'id' in json_dict:
118-
parent_id_fields[sheet_key(sheet, parent_name+'id')] = json_dict['id']
140+
if self.id_name in json_dict:
141+
parent_id_fields[sheet_key(sheet, parent_name+self.id_name)] = json_dict[self.id_name]
119142

120143

121144
for key, value in json_dict.items():
122145
if type(value) in BASIC_TYPES:
146+
if self.xml and key == '#text':
147+
# Handle the text output from xmltodict
148+
key = ''
149+
parent_name = parent_name.strip('/')
123150
flattened_dict[sheet_key(sheet, parent_name+key)] = value
124151
elif hasattr(value, 'items'):
125152
self.parse_json_dict(

flattentool/tests/fixtures/empty.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<iati-activities><iati-activity/></iati-activities>

flattentool/tests/test_roundtrip.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from flattentool import unflatten, flatten
22
import json
3-
import pytest
43
import sys
54
import os
5+
import xmltodict
6+
import pytest
67

78

89
@pytest.mark.parametrize('output_format', ['xlsx', 'csv'])
@@ -93,3 +94,30 @@ def test_roundtrip_360_rollup(tmpdir, use_titles):
9394
original_json = json.load(open(input_name))
9495
roundtripped_json = json.load(tmpdir.join('roundtrip.json'))
9596
assert original_json == roundtripped_json
97+
98+
99+
@pytest.mark.parametrize('output_format', ['xlsx', 'csv'])
def test_roundtrip_xml(tmpdir, output_format):
    """Round-trip the IATI example XML through a spreadsheet and back.

    The example file is flattened to ``output_format``, unflattened to XML
    again, and the two XML documents are compared after parsing.
    """
    input_name = 'examples/iati/expected.xml'
    # Both calls below must agree on these paths, so build them once.
    flattened_name = tmpdir.join('flattened').strpath + '.' + output_format
    roundtrip_name = tmpdir.join('roundtrip.xml').strpath

    flatten(
        input_name=input_name,
        output_name=flattened_name,
        output_format=output_format,
        root_list_path='iati-activity',
        id_name='iati-identifier',
        xml=True)
    unflatten(
        input_name=flattened_name,
        output_name=roundtrip_name,
        input_format=output_format,
        root_list_path='iati-activity',
        id_name='iati-identifier',
        xml=True)

    # Compare without ordering, by using dict_constructor=dict instead of
    # OrderedDict. Each file is read inside a context manager so the handle
    # is closed even if parsing raises.
    with open(input_name, 'rb') as original_xml:
        original = xmltodict.parse(original_xml, dict_constructor=dict)
    with open(roundtrip_name, 'rb') as roundtripped_xml:
        roundtripped = xmltodict.parse(roundtripped_xml, dict_constructor=dict)
    assert original == roundtripped

[new test module — file name missing from this capture]

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from flattentool.json_input import JSONParser
2+
3+
def test_xml_empty():
    """Parsing the empty.xml fixture yields an empty main sheet and no sub-sheets."""
    options = dict(
        json_filename='flattentool/tests/fixtures/empty.xml',
        root_list_path='iati-activity',
        schema_parser=None,
        root_id='',
        xml=True,
        id_name='iati-identifier',
    )
    xml_parser = JSONParser(**options)
    xml_parser.parse()

    main_sheet = xml_parser.main_sheet
    assert list(main_sheet) == []
    assert main_sheet.lines == []
    assert xml_parser.sub_sheets == {}
15+
16+
17+
def test_xml_basic_example():
    """Parse the IATI example XML and check the main sheet and both sub-sheets.

    Covers the columns and rows of the main sheet, and of the
    'transaction' and 'recipient-country' sub-sheets.
    """
    xml_parser = JSONParser(
        json_filename='examples/iati/expected.xml',
        root_list_path='iati-activity',
        schema_parser=None,
        root_id='',
        xml=True,
        id_name='iati-identifier',
    )
    xml_parser.parse()

    # --- main sheet -------------------------------------------------------
    expected_main_columns = [
        'iati-identifier',
        'reporting-org/@ref',
        'reporting-org/@type',
        'reporting-org/narrative',
        'participating-org/@ref',
        'participating-org/@role',
        'activity-status/@code',
        'activity-date/@iso-date',
        'activity-date/@type',
        'title/narrative',
        'description/narrative',
    ]
    expected_main_lines = [
        {
            'activity-date/@type': '1',
            'reporting-org/narrative': 'Organisation name',
            'participating-org/@ref': 'AA-AAA-123456789',
            'title/narrative': 'A title',
            'participating-org/@role': '1',
            'reporting-org/@ref': 'AA-AAA-123456789',
            'iati-identifier': 'AA-AAA-123456789-ABC123',
            'reporting-org/@type': '40',
            'description/narrative': 'A description',
            'activity-date/@iso-date': '2011-10-01',
            'activity-status/@code': '2',
        },
        {
            'activity-date/@type': '2',
            'reporting-org/narrative': 'Organisation name',
            'participating-org/@ref': 'AA-AAA-123456789',
            'title/narrative': 'Another title',
            'participating-org/@role': '1',
            'reporting-org/@ref': 'AA-AAA-123456789',
            'iati-identifier': 'AA-AAA-123456789-ABC124',
            'reporting-org/@type': '40',
            'description/narrative': 'Another description',
            'activity-date/@iso-date': '2016-01-01',
            'activity-status/@code': '3',
        },
    ]
    assert list(xml_parser.main_sheet) == expected_main_columns
    assert xml_parser.main_sheet.lines == expected_main_lines

    # --- sub-sheets -------------------------------------------------------
    sub_sheets = xml_parser.sub_sheets
    assert set(sub_sheets) == {'transaction', 'recipient-country'}

    transaction_sheet = sub_sheets['transaction']
    expected_transaction_columns = [
        'iati-identifier',
        'transaction/0/transaction-type/@code',
        'transaction/0/transaction-date/@iso-date',
        'transaction/0/value/@value-date',
        'transaction/0/value',
    ]
    expected_transaction_lines = [
        {
            'transaction/0/value/@value-date': '2012-01-01',
            'iati-identifier': 'AA-AAA-123456789-ABC123',
            'transaction/0/transaction-date/@iso-date': '2012-01-01',
            'transaction/0/value': '10',
            'transaction/0/transaction-type/@code': '2',
        },
        {
            'transaction/0/value/@value-date': '2012-03-03',
            'iati-identifier': 'AA-AAA-123456789-ABC123',
            'transaction/0/transaction-date/@iso-date': '2012-03-03',
            'transaction/0/value': '20',
            'transaction/0/transaction-type/@code': '3',
        },
        {
            'transaction/0/value/@value-date': '2013-04-04',
            'iati-identifier': 'AA-AAA-123456789-ABC124',
            'transaction/0/transaction-date/@iso-date': '2013-04-04',
            'transaction/0/value': '30',
            'transaction/0/transaction-type/@code': '2',
        },
        {
            'transaction/0/value/@value-date': '2013-05-05',
            'iati-identifier': 'AA-AAA-123456789-ABC124',
            'transaction/0/transaction-date/@iso-date': '2013-05-05',
            'transaction/0/value': '40',
            'transaction/0/transaction-type/@code': '3',
        },
    ]
    assert list(transaction_sheet) == expected_transaction_columns
    assert transaction_sheet.lines == expected_transaction_lines

    country_sheet = sub_sheets['recipient-country']
    expected_country_columns = [
        'iati-identifier',
        'recipient-country/0/@code',
        'recipient-country/0/@percentage',
    ]
    expected_country_lines = [
        {
            'iati-identifier': 'AA-AAA-123456789-ABC123',
            'recipient-country/0/@code': 'AF',
            'recipient-country/0/@percentage': '40',
        },
        {
            'iati-identifier': 'AA-AAA-123456789-ABC123',
            'recipient-country/0/@code': 'XK',
            'recipient-country/0/@percentage': '60',
        },
        {
            'iati-identifier': 'AA-AAA-123456789-ABC124',
            'recipient-country/0/@code': 'AG',
            'recipient-country/0/@percentage': '30',
        },
        {
            'iati-identifier': 'AA-AAA-123456789-ABC124',
            'recipient-country/0/@code': 'XK',
            'recipient-country/0/@percentage': '70',
        },
    ]
    assert list(country_sheet) == expected_country_columns
    assert country_sheet.lines == expected_country_lines

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from setuptools import setup
22
import sys
33

4-
install_requires = ['jsonref', 'schema', 'openpyxl>=2,<2.4', 'six', 'pytz']
4+
install_requires = ['jsonref', 'schema', 'openpyxl>=2,<2.4', 'six', 'pytz', 'xmltodict']
55

66
if sys.version < '3':
77
install_requires.append('unicodecsv')

0 commit comments

Comments
 (0)