
Commit 57af89b

Authored by kindly
Merge pull request #376 from OpenDataServices/315-lower-memory-usage
Flattening: Reduce Memory Footprint.
2 parents 843dd99 + 31b9399 commit 57af89b

File tree: 10 files changed, +246 -140 lines changed


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### Fixed
 
+- flattening: Uses much less memory by storing data in an embedded ZODB database, using ijson and using write-only mode in openpyxl.
 - use-titles: Use $ref'erring title if available https://github.com/OpenDataServices/flatten-tool/pull/368
 - create-template --no-deprecated-fields: Did not work if deprecated element at same level as a $ref https://github.com/OpenDataServices/flatten-tool/issues/185#issuecomment-719587348

flattentool/__init__.py

Lines changed: 26 additions & 25 deletions
@@ -112,7 +112,8 @@ def flatten(
     else:
         schema_parser = None
 
-    parser = JSONParser(
+    # context manager to clean up ZODB database when it exits
+    with JSONParser(
         json_filename=input_name,
         root_list_path=None if root_is_list else root_list_path,
         schema_parser=schema_parser,
@@ -126,33 +127,33 @@
         preserve_fields=preserve_fields,
         remove_empty_schema_columns=remove_empty_schema_columns,
         truncation_length=truncation_length,
-    )
-    parser.parse()
-
-    def spreadsheet_output(spreadsheet_output_class, name):
-        spreadsheet_output = spreadsheet_output_class(
-            parser=parser,
-            main_sheet_name=main_sheet_name,
-            output_name=name,
-            sheet_prefix=sheet_prefix,
-        )
-        spreadsheet_output.write_sheets()
-
-    if output_format == "all":
-        if not output_name:
-            output_name = "flattened"
-        for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items():
-            spreadsheet_output(
-                spreadsheet_output_class, output_name + FORMATS_SUFFIX[format_name]
-            )
+        persist=True,
+    ) as parser:
+
+        def spreadsheet_output(spreadsheet_output_class, name):
+            spreadsheet_output = spreadsheet_output_class(
+                parser=parser,
+                main_sheet_name=main_sheet_name,
+                output_name=name,
+                sheet_prefix=sheet_prefix,
+            )
+            spreadsheet_output.write_sheets()
+
+        if output_format == "all":
+            if not output_name:
+                output_name = "flattened"
+            for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items():
+                spreadsheet_output(
+                    spreadsheet_output_class, output_name + FORMATS_SUFFIX[format_name]
+                )
 
-    elif output_format in OUTPUT_FORMATS.keys():  # in dictionary of allowed formats
-        if not output_name:
-            output_name = "flattened" + FORMATS_SUFFIX[output_format]
-        spreadsheet_output(OUTPUT_FORMATS[output_format], output_name)
+        elif output_format in OUTPUT_FORMATS.keys():  # in dictionary of allowed formats
+            if not output_name:
+                output_name = "flattened" + FORMATS_SUFFIX[output_format]
+            spreadsheet_output(OUTPUT_FORMATS[output_format], output_name)
 
-    else:
-        raise Exception("The requested format is not available")
+        else:
+            raise Exception("The requested format is not available")
 
 
 # From http://bugs.python.org/issue16535
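To illustrate the new call shape, here is a minimal usage sketch (not part of the diff; the file name and root list path are hypothetical). Because JSONParser is now a context manager, the temporary ZODB database created by persist=True is removed in __exit__ even if writing a sheet fails part-way through:

# Hypothetical sketch: parse a JSON file with on-disk storage, then stream rows back out.
from flattentool.json_input import JSONParser

with JSONParser(
    json_filename="releases.json",  # hypothetical input file
    root_list_path="main",          # hypothetical root list path
    schema_parser=None,
    persist=True,                   # parsed rows go to a temporary ZODB file, not RAM
) as parser:
    for line in parser.main_sheet.lines:
        print(line)
# __exit__ has now closed the connection and deleted the temporary database files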

flattentool/json_input.py

Lines changed: 94 additions & 25 deletions
@@ -7,18 +7,24 @@
 
 import codecs
 import copy
-import json
 import os
+import tempfile
+import uuid
 from collections import OrderedDict
 from decimal import Decimal
 from warnings import warn
 
+import BTrees.OOBTree
+import ijson
+import transaction
 import xmltodict
+import zc.zlibstorage
+import ZODB.FileStorage
 
 from flattentool.i18n import _
 from flattentool.input import path_search
 from flattentool.schema import make_sub_sheet_name
-from flattentool.sheet import Sheet
+from flattentool.sheet import PersistentSheet
 
 BASIC_TYPES = [str, bool, int, Decimal, type(None)]
@@ -112,9 +118,31 @@ def __init__(
         remove_empty_schema_columns=False,
         rollup=False,
         truncation_length=3,
+        persist=False,
     ):
+        if persist:
+            # Use temp directories in OS agnostic way
+            self.zodb_db_location = (
+                tempfile.gettempdir() + "/flattentool-" + str(uuid.uuid4())
+            )
+            # zlibstorage lowers disk usage by a lot at very small performance cost
+            zodb_storage = zc.zlibstorage.ZlibStorage(
+                ZODB.FileStorage.FileStorage(self.zodb_db_location)
+            )
+            self.db = ZODB.DB(zodb_storage)
+        else:
+            # If None, in memory storage is used.
+            self.db = ZODB.DB(None)
+
+        self.connection = self.db.open()
+
+        # ZODB root, only objects attached here will be persisted
+        root = self.connection.root
+        # OOBTree means a btree with keys and values are objects (including strings)
+        root.sheet_store = BTrees.OOBTree.BTree()
+
         self.sub_sheets = {}
-        self.main_sheet = Sheet()
+        self.main_sheet = PersistentSheet(connection=self.connection, name="")
         self.root_list_path = root_list_path
         self.root_id = root_id
         self.use_titles = use_titles
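For reference, a stand-alone sketch of the storage setup above (the path prefix and the stored row are illustrative): a compressed FileStorage in a temporary location when persisting, plus a BTree hung off the connection root so that it is the only thing persisted.

# Illustrative sketch, assuming the ZODB, zc.zlibstorage and BTrees packages are installed.
import tempfile
import uuid

import BTrees.OOBTree
import transaction
import zc.zlibstorage
import ZODB
import ZODB.FileStorage

db_path = tempfile.gettempdir() + "/flattentool-demo-" + str(uuid.uuid4())

# ZlibStorage wraps FileStorage so records are compressed before hitting disk.
storage = zc.zlibstorage.ZlibStorage(ZODB.FileStorage.FileStorage(db_path))
db = ZODB.DB(storage)  # ZODB.DB(None) would use in-memory storage instead
connection = db.open()

# Only objects reachable from the root are persisted.
connection.root.sheet_store = BTrees.OOBTree.BTree()
connection.root.sheet_store["example"] = {"id": 1, "title": "example row"}
transaction.commit()

connection.close()
db.close()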
@@ -125,9 +153,19 @@
         self.filter_value = filter_value
         self.remove_empty_schema_columns = remove_empty_schema_columns
         self.seen_paths = set()
+        self.persist = persist
 
         if schema_parser:
-            self.main_sheet = copy.deepcopy(schema_parser.main_sheet)
+            # The schema parser does not make sheets that are persistent,
+            # so use from_sheet, which deep copies everything in them.
+            self.main_sheet = PersistentSheet.from_sheet(
+                schema_parser.main_sheet, self.connection
+            )
+            for sheet_name, sheet in list(self.sub_sheets.items()):
+                self.sub_sheets[sheet_name] = PersistentSheet.from_sheet(
+                    sheet, self.connection
+                )
+
             self.sub_sheets = copy.deepcopy(schema_parser.sub_sheets)
             if remove_empty_schema_columns:
                 # Don't use columns from the schema parser
@@ -194,18 +232,13 @@ def __init__(
                 _("Only one of json_file or root_json_dict should be supplied")
             )
 
-        if json_filename:
-            with codecs.open(json_filename, encoding="utf-8") as json_file:
-                try:
-                    self.root_json_dict = json.load(
-                        json_file, object_pairs_hook=OrderedDict, parse_float=Decimal
-                    )
-                except UnicodeError as err:
-                    raise BadlyFormedJSONErrorUTF8(*err.args)
-                except ValueError as err:
-                    raise BadlyFormedJSONError(*err.args)
-        else:
-            self.root_json_dict = root_json_dict
+        if not json_filename:
+            if self.root_list_path is None:
+                self.root_json_list = root_json_dict
+            else:
+                self.root_json_list = path_search(
+                    root_json_dict, self.root_list_path.split("/")
+                )
 
         if preserve_fields:
             # Extract fields to be preserved from input file (one path per line)
@@ -240,19 +273,41 @@ def __init__(
             self.preserve_fields = None
             self.preserve_fields_input = None
 
+        if json_filename:
+            if self.root_list_path is None:
+                path = "item"
+            else:
+                path = root_list_path.replace("/", ".") + ".item"
+
+            json_file = codecs.open(json_filename, encoding="utf-8")
+
+            self.root_json_list = ijson.items(json_file, path, map_type=OrderedDict)
+
+        try:
+            self.parse()
+        except ijson.common.IncompleteJSONError as err:
+            raise BadlyFormedJSONError(*err.args)
+        except UnicodeDecodeError as err:
+            raise BadlyFormedJSONErrorUTF8(*err.args)
+        finally:
+            if json_filename:
+                json_file.close()
+
     def parse(self):
-        if self.root_list_path is None:
-            root_json_list = self.root_json_dict
-        else:
-            root_json_list = path_search(
-                self.root_json_dict, self.root_list_path.split("/")
-            )
-        for json_dict in root_json_list:
+        for num, json_dict in enumerate(self.root_json_list):
             if json_dict is None:
                 # This is particularly useful for IATI XML, in order to not
                 # fall over on empty activity, e.g. <iati-activity/>
                 continue
             self.parse_json_dict(json_dict, sheet=self.main_sheet)
+            # Only persist every 2000 objects; persisting more often slows down storing.
+            # 2000 top level objects is normally not too much to store in memory.
+            if num % 2000 == 0 and num != 0:
+                transaction.commit()
+
+        # This commit could be removed, which would mean that up to 2000 objects
+        # could be stored in memory without anything being persisted.
+        transaction.commit()
 
 
         if self.remove_empty_schema_columns:
             # Remove sheets with no lines of data
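A compact sketch of the streaming pattern used above (the file name, prefix and handler are hypothetical): ijson yields one top-level item at a time instead of json.load()-ing the whole document, and committing every 2000 items bounds how many parsed objects sit in memory before being flushed to the ZODB.

# Hypothetical sketch; assumes a ZODB connection is already open, as in __init__ above.
import codecs
from collections import OrderedDict

import ijson
import transaction


def process(item):
    # hypothetical per-row handler, standing in for parse_json_dict()
    print(item)


with codecs.open("releases.json", encoding="utf-8") as json_file:
    # "main.item" matches each element of the top-level "main" array.
    items = ijson.items(json_file, "main.item", map_type=OrderedDict)
    for num, item in enumerate(items):
        process(item)
        if num % 2000 == 0 and num != 0:
            transaction.commit()  # flush the batch parsed so far
    transaction.commit()  # persist whatever is left in the final partial batch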
@@ -501,7 +556,9 @@ def parse_json_dict(
                     parent_name, key, truncation_length=self.truncation_length
                 )
                 if sub_sheet_name not in self.sub_sheets:
-                    self.sub_sheets[sub_sheet_name] = Sheet(name=sub_sheet_name)
+                    self.sub_sheets[sub_sheet_name] = PersistentSheet(
+                        name=sub_sheet_name, connection=self.connection
+                    )
 
                 for json_dict in value:
                     if json_dict is None:
@@ -518,4 +575,16 @@
             raise ValueError(_("Unsupported type {}").format(type(value)))
 
         if top:
-            sheet.lines.append(flattened_dict)
+            sheet.append_line(flattened_dict)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        if self.persist:
+            self.connection.close()
+            self.db.close()
+            os.remove(self.zodb_db_location)
+            os.remove(self.zodb_db_location + ".lock")
+            os.remove(self.zodb_db_location + ".index")
+            os.remove(self.zodb_db_location + ".tmp")
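As a side note, FileStorage keeps several files alongside the main database file, which is why __exit__ removes all four. A small illustrative sketch (paths hypothetical):

# Illustrative sketch of the files FileStorage leaves behind.
import os
import tempfile
import uuid

import ZODB
import ZODB.FileStorage

db_path = tempfile.gettempdir() + "/flattentool-demo-" + str(uuid.uuid4())
db = ZODB.DB(ZODB.FileStorage.FileStorage(db_path))
db.close()

for suffix in ("", ".lock", ".index", ".tmp"):
    side_file = db_path + suffix
    if os.path.exists(side_file):  # some side files only appear once the storage has been used
        os.remove(side_file)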

flattentool/output.py

Lines changed: 2 additions & 2 deletions
@@ -50,7 +50,8 @@ def close(self):
 
 class XLSXOutput(SpreadsheetOutput):
     def open(self):
-        self.workbook = openpyxl.Workbook()
+        # write only means that the output will be streamed
+        self.workbook = openpyxl.Workbook(write_only=True)
 
     def write_sheet(self, sheet_name, sheet):
         sheet_header = list(sheet)
@@ -75,7 +76,6 @@ def write_sheet(self, sheet_name, sheet):
             worksheet.append(line)
 
     def close(self):
-        self.workbook.remove(self.workbook.active)
         self.workbook.save(self.output_name)
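To show what write-only mode buys (the sheet name and rows are illustrative): rows are streamed straight to the output file via append(), and no default active sheet is created, which is why the workbook.remove(workbook.active) call in close() is dropped.

# Illustrative sketch of streaming rows with openpyxl's write-only mode.
import openpyxl

workbook = openpyxl.Workbook(write_only=True)  # no default sheet is created
worksheet = workbook.create_sheet(title="main")

worksheet.append(["id", "title"])  # header row
for num in range(10000):
    worksheet.append([num, "row %d" % num])  # rows are serialised as they are appended

workbook.save("flattened.xlsx")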

flattentool/sheet.py

Lines changed: 51 additions & 1 deletion
@@ -1,3 +1,8 @@
+import copy
+
+import BTrees.IOBTree
+
+
 class Sheet(object):
     """
     An abstract representation of a single sheet of a spreadsheet.
@@ -8,10 +13,14 @@ def __init__(self, columns=None, root_id="", name=None):
         self.id_columns = []
         self.columns = columns if columns else []
         self.titles = {}
-        self.lines = []
+        self._lines = []
         self.root_id = root_id
         self.name = name
 
+    @property
+    def lines(self):
+        return self._lines
+
     def add_field(self, field, id_field=False):
         columns = self.id_columns if id_field else self.columns
         if field not in columns:
@@ -27,3 +36,44 @@ def __iter__(self):
             yield column
         for column in self.columns:
             yield column
+
+    def append_line(self, flattened_dict):
+        self._lines.append(flattened_dict)
+
+
+class PersistentSheet(Sheet):
+    """
+    A sheet that is persisted in a ZODB database.
+
+    """
+
+    def __init__(self, columns=None, root_id="", name=None, connection=None):
+        super().__init__(columns=columns, root_id=root_id, name=name)
+        self.connection = connection
+        self.index = 0
+        # Integer key and object value btree. Store a sequential index in order to preserve input order.
+        connection.root.sheet_store[self.name] = BTrees.IOBTree.BTree()
+
+    @property
+    def lines(self):
+        # btrees iterate in key order.
+        for key, value in self.connection.root.sheet_store[self.name].items():
+            # 5000 chosen by trial and error. The written row
+            # data is removed from memory as it is no longer needed.
+            # All new sheets clear out previous sheets' data from memory.
+            if key % 5000 == 0:
+                self.connection.cacheMinimize()
+            yield value
+
+    def append_line(self, flattened_dict):
+        self.connection.root.sheet_store[self.name][self.index] = flattened_dict
+        self.index += 1
+
+    @classmethod
+    def from_sheet(cls, sheet, connection):
+        instance = cls(name=sheet.name, connection=connection)
+        instance.id_columns = copy.deepcopy(sheet.id_columns)
+        instance.columns = copy.deepcopy(sheet.columns)
+        instance.titles = copy.deepcopy(sheet.titles)
+        instance.root_id = sheet.root_id
+        return instance
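Finally, a small usage sketch of PersistentSheet backed by an in-memory ZODB (the sheet name and rows are illustrative): lines go into an IOBTree keyed by an incrementing index, so iterating .lines replays them in insertion order, while cacheMinimize() periodically evicts rows that have already been yielded.

# Hypothetical usage sketch of PersistentSheet with in-memory storage (persist=False behaviour).
import BTrees.OOBTree
import ZODB

from flattentool.sheet import PersistentSheet

db = ZODB.DB(None)  # in-memory storage
connection = db.open()
connection.root.sheet_store = BTrees.OOBTree.BTree()  # PersistentSheet expects this to exist

sheet = PersistentSheet(name="example", connection=connection)
sheet.append_line({"id": 1, "title": "first row"})
sheet.append_line({"id": 2, "title": "second row"})

for line in sheet.lines:  # rows come back in the order they were appended
    print(line)

connection.close()
db.close()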
