Skip to content

Commit 82b57a1

Browse files
committed
[#418] reduce memory footprint
Move some things around to avoid copying data. Reduce memory use when no source maps are created.
1 parent 03c36c5 commit 82b57a1

File tree

3 files changed

+80
-75
lines changed

3 files changed

+80
-75
lines changed

flattentool/__init__.py

Lines changed: 3 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,11 @@
33
from flattentool.output import FORMATS as OUTPUT_FORMATS
44
from flattentool.output import FORMATS_SUFFIX
55
from flattentool.input import FORMATS as INPUT_FORMATS, WITH_CELLS
6+
from flattentool.lib import decimal_default
67
import json
78
import codecs
8-
from decimal import Decimal
99
from collections import OrderedDict
1010

11-
1211
def create_template(schema, output_name='releases', output_format='all', main_sheet_name='main', flatten=False, rollup=False, root_id='ocid', use_titles=False, **_):
1312
"""
1413
Creates template file(s) from given inputs
@@ -81,27 +80,6 @@ def spreadsheet_output(spreadsheet_output_class, name):
8180
raise Exception('The requested format is not available')
8281

8382

84-
# From http://bugs.python.org/issue16535
85-
class NumberStr(float):
86-
def __init__(self, o):
87-
# We don't call the parent here, since we're deliberately altering its functionality
88-
# pylint: disable=W0231
89-
self.o = o
90-
91-
def __repr__(self):
92-
return str(self.o)
93-
94-
# This is needed for this trick to work in python 3.4
95-
def __float__(self):
96-
return self
97-
98-
99-
def decimal_default(o):
100-
if isinstance(o, Decimal):
101-
return NumberStr(o)
102-
raise TypeError(repr(o) + " is not JSON serializable")
103-
104-
10583
def unflatten(input_name, base_json=None, input_format=None, output_name='releases.json',
10684
main_sheet_name='releases', encoding='utf8', timezone_name='UTC',
10785
root_id='ocid', schema='', convert_titles=False, cell_source_map=None,
@@ -134,16 +112,8 @@ def unflatten(input_name, base_json=None, input_format=None, output_name='releas
134112
else:
135113
base = OrderedDict()
136114
if WITH_CELLS:
137-
result, cell_source_map_data, heading_source_map_data = spreadsheet_input.fancy_unflatten()
138-
base[main_sheet_name] = list(result)
139-
with codecs.open(output_name, 'w', encoding='utf-8') as fp:
140-
json.dump(base, fp, indent=4, default=decimal_default, ensure_ascii=False)
141-
if cell_source_map:
142-
with codecs.open(cell_source_map, 'w', encoding='utf-8') as fp:
143-
json.dump(cell_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
144-
if heading_source_map:
145-
with codecs.open(heading_source_map, 'w', encoding='utf-8') as fp:
146-
json.dump(heading_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
115+
spreadsheet_input.fancy_unflatten(base, main_sheet_name, output_name, cell_source_map, heading_source_map)
116+
147117
else:
148118
result = spreadsheet_input.unflatten()
149119
base[main_sheet_name] = list(result)

flattentool/input.py

Lines changed: 49 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,22 @@
88
import sys
99
from decimal import Decimal, InvalidOperation
1010
import os
11+
import codecs
1112
from collections import OrderedDict
13+
1214
import openpyxl
1315
from six import text_type
1416
from warnings import warn
1517
import traceback
1618
import datetime
19+
import json
1720
import pytz
1821
from openpyxl.utils import _get_column_letter, column_index_from_string
22+
from flattentool.lib import decimal_default, Cell
23+
import tempfile
1924

2025
WITH_CELLS = True
2126

22-
class Cell:
23-
def __init__(self, cell_value, cell_location):
24-
self.cell_value = cell_value
25-
self.cell_location = cell_location
26-
self.sub_cells = []
2727

2828
# The "pylint: disable" lines exist to ignore warnings about the imports we expect not to work not working
2929

@@ -238,26 +238,41 @@ def inthere(unflattened, id_name):
238238
else:
239239
main_sheet_by_ocid[root_id_or_none].append(unflattened)
240240
temporarydicts_to_lists(main_sheet_by_ocid)
241+
241242
return sum(main_sheet_by_ocid.values(), [])
242243

244+
243245
def unflatten(self):
244-
result = self.do_unflatten()
245246
if WITH_CELLS:
246-
result = extract_list_to_value(result)
247-
return result
247+
tmp_directory = tempfile.mkdtemp()
248+
file_name = os.path.join(tmp_directory, 'unflattened.json')
249+
self.results_from_cell_tree({}, 'main', file_name)
250+
with open(file_name) as unflattened:
251+
return json.load(unflattened, object_pairs_hook=OrderedDict)['main']
252+
return self.do_unflatten()
253+
254+
255+
def extract_error_path(self, cell_tree):
256+
return sorted(extract_list_to_error_path([self.main_sheet_name.lower()], cell_tree).items())
257+
248258

249-
def fancy_unflatten(self):
259+
def results_from_cell_tree(self, base, main_sheet_name, output_name):
260+
cell_tree = self.do_unflatten()
261+
base[main_sheet_name] = cell_tree
262+
with codecs.open(output_name, 'w', encoding='utf-8') as fp:
263+
json.dump(base, fp, indent=4, default=decimal_default, ensure_ascii=False)
264+
return self.extract_error_path(cell_tree)
265+
266+
267+
def fancy_unflatten(self, base, main_sheet_name, output_name, cell_source_map, heading_source_map):
250268
if not WITH_CELLS:
251269
raise Exception('Can only do a fancy_unflatten() if WITH_CELLS=True')
252-
cell_tree = self.do_unflatten()
253-
result = extract_list_to_value(cell_tree)
254-
cell_source_map = extract_list_to_error_path([self.main_sheet_name.lower()], cell_tree)
255-
ordered_items = sorted(cell_source_map.items())
256-
ordered_cell_source_map = OrderedDict(( '/'.join(str(x) for x in path), location) for path, location in ordered_items)
270+
ordered_items = self.results_from_cell_tree(base, main_sheet_name, output_name)
271+
if not cell_source_map and not heading_source_map:
272+
return
257273
row_source_map = OrderedDict()
258-
heading_source_map = OrderedDict()
259-
for path, _ in ordered_items:
260-
cells = cell_source_map[path]
274+
heading_source_map_data = OrderedDict()
275+
for path, cells in ordered_items:
261276
# Prepare row_source_map key
262277
key = '/'.join(str(x) for x in path[:-1])
263278
if not key in row_source_map:
@@ -270,19 +285,28 @@ def fancy_unflatten(self):
270285
except:
271286
header_path_parts.append(x)
272287
header_path = '/'.join(header_path_parts)
273-
if header_path not in heading_source_map:
274-
heading_source_map[header_path] = []
288+
if header_path not in heading_source_map_data:
289+
heading_source_map_data[header_path] = []
275290
# Populate the row and header source maps
276291
for cell in cells:
277292
sheet, col, row, header = cell
278293
if (sheet, row) not in row_source_map[key]:
279294
row_source_map[key].append((sheet, row))
280-
if (sheet, header) not in heading_source_map[header_path]:
281-
heading_source_map[header_path].append((sheet, header))
295+
if (sheet, header) not in heading_source_map_data[header_path]:
296+
heading_source_map_data[header_path].append((sheet, header))
282297
for key in row_source_map:
283-
assert key not in ordered_cell_source_map, 'Row/cell collision: {}'.format(key)
284-
ordered_cell_source_map[key] = row_source_map[key]
285-
return result, ordered_cell_source_map, heading_source_map
298+
ordered_items.append((key.split('/'), row_source_map[key]))
299+
300+
if cell_source_map:
301+
with codecs.open(cell_source_map, 'w', encoding='utf-8') as fp:
302+
json.dump(
303+
OrderedDict(( '/'.join(str(x) for x in path), location) for path, location in ordered_items),
304+
fp, default=decimal_default, ensure_ascii=False, indent=4
305+
)
306+
if heading_source_map:
307+
with codecs.open(heading_source_map, 'w', encoding='utf-8') as fp:
308+
json.dump(heading_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
309+
286310

287311
def extract_list_to_error_path(path, input):
288312
output = {}
@@ -317,24 +341,6 @@ def extract_dict_to_error_path(path, input):
317341
raise Exception('Unexpected result type in the JSON cell tree: {}'.format(input[k]))
318342
return output
319343

320-
def extract_list_to_value(input):
321-
output = []
322-
for item in input:
323-
output.append(extract_dict_to_value(item))
324-
return output
325-
326-
def extract_dict_to_value(input):
327-
output = OrderedDict()
328-
for k in input:
329-
if isinstance(input[k], list):
330-
output[k] = extract_list_to_value(input[k])
331-
elif isinstance(input[k], dict):
332-
output[k] = extract_dict_to_value(input[k])
333-
elif isinstance(input[k], Cell):
334-
output[k] = input[k].cell_value
335-
else:
336-
raise Exception('Unexpected result type in the JSON cell tree: {}'.format(input[k]))
337-
return output
338344

339345
class CSVInput(SpreadsheetInput):
340346
encoding = 'utf-8'
@@ -557,6 +563,7 @@ def path_search(nested_dict, path_list, id_fields=None, path=None, top=False, to
557563

558564

559565
class TemporaryDict(UserDict):
566+
__slots__ = ['keyfield', 'items_no_keyfield', 'data', 'top_sheet']
560567
def __init__(self, keyfield, top_sheet=False):
561568
self.keyfield = keyfield
562569
self.items_no_keyfield = []

flattentool/lib.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
from decimal import Decimal
2+
# From http://bugs.python.org/issue16535
3+
class NumberStr(float):
4+
def __init__(self, o):
5+
# We don't call the parent here, since we're deliberately altering its functionality
6+
# pylint: disable=W0231
7+
self.o = o
8+
9+
def __repr__(self):
10+
return str(self.o)
11+
12+
# This is needed for this trick to work in python 3.4
13+
def __float__(self):
14+
return self
15+
16+
class Cell:
17+
__slots__ = ['cell_value', 'cell_location', 'sub_cells']
18+
def __init__(self, cell_value, cell_location):
19+
self.cell_value = cell_value
20+
self.cell_location = cell_location
21+
self.sub_cells = []
22+
23+
def decimal_default(o):
24+
if isinstance(o, Decimal):
25+
return NumberStr(o)
26+
if isinstance(o, Cell):
27+
return o.cell_value
28+
raise TypeError(repr(o) + " is not JSON serializable")

0 commit comments

Comments
 (0)