Skip to content

Commit ac13aca

Browse files
rollakariv
authored and committed
Fix sort to work correctly for numbers (#111)
* Added a test showing the problem * Simplified _sorter for debugging * Improved sorting of numbers * Improved sorting date/time * Fixed linting * Drop datetime stringification because it's OK by default * Fixed implementation for numbers * Added test breaking current approach * Fixed implementation * Fixed linting * Improved readability * Improved readability * Removed debug print * Added more tests * Minor optimization * Added edge case to tests * Rebase number sort on bitstring * Added support for negative numbers * Added a sort decimals test
1 parent ed087d2 commit ac13aca

File tree

4 files changed

+128
-5
lines changed

4 files changed

+128
-5
lines changed

data/numbers.csv

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
a
2+
10
3+
1.1
4+
-0.5
5+
-1000
6+
2
7+
-0.4
8+
1000
9+
0

dataflows/processors/sort_rows.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,42 @@
1+
import re
2+
import decimal
13
from kvfile import KVFile
2-
4+
from bitstring import BitArray
35
from ..helpers.resource_matcher import ResourceMatcher
46

57

68
class KeyCalc(object):
    """Compute a string sort key for a row from a ``str.format`` template.

    ``key_spec`` is a format string such as ``'{a}'`` or ``'{a}-{b}'``.
    Calling the instance with a row (a dict-like mapping) returns the
    formatted key.  Numeric values (``int``, ``float``, ``decimal.Decimal``)
    referenced by a plain ``{name}`` placeholder are first replaced by a
    fixed-width hex string whose lexicographic order matches numeric order,
    so that e.g. ``2`` sorts before ``10`` and ``-3`` before ``-1``.

    Note: numbers are encoded through a 64-bit float, so integers that
    cannot be represented exactly as a double may share the same key.
    """

    def __init__(self, key_spec):
        self.key_spec = key_spec
        # Text found between each ``{`` and ``}`` in the template.
        self.key_list = re.findall(r'\{(.*?)\}', key_spec)

    @staticmethod
    def _sortable_hex(value):
        """Return a 16-digit hex string that sorts like ``float(value)``."""
        # Local import keeps this block self-contained (stdlib only).
        import struct

        # Big-endian IEEE 754 double reinterpreted as an unsigned integer.
        # https://www.h-schmidt.net/FloatConverter/IEEE754.html
        (raw,) = struct.unpack('>Q', struct.pack('>d', float(value)))
        if raw >> 63:
            # Sign bit set (negative numbers and -0.0): invert every bit so
            # that larger magnitudes sort first and the sign bit becomes 0.
            # Deciding on the sign bit rather than ``value < 0`` keeps -0.0
            # just below +0.0 instead of below every negative number.
            raw ^= 0xFFFFFFFFFFFFFFFF
        else:
            # Non-negative: set the sign bit so these sort after negatives.
            raw |= 0x8000000000000000
        return '{:016x}'.format(raw)

    def __call__(self, row):
        context = row.copy()
        # Only fields referenced by the template need to be made comparable.
        for key in self.key_list:
            if key not in row:
                continue
            value = row[key]
            if isinstance(value, (int, float, decimal.Decimal)):
                context[key] = self._sortable_hex(value)
        return self.key_spec.format(**context)
1229

1330

1431
def _sorter(rows, key_calc, reverse, batch_size):
    """Yield ``rows`` ordered by the string key that ``key_calc`` produces.

    Every row is stored in a ``KVFile`` under ``key_calc(row)`` followed by
    the row's position formatted as 8 hex digits, then the stored values are
    yielded back in the store's key order (reversed when ``reverse`` is set).
    """
    store = KVFile()

    # Lazily pair each row with its key so the input is streamed into the
    # store rather than materialised first.
    keyed_rows = (
        (key_calc(row) + "{:08x}".format(position), row)
        for position, row in enumerate(rows)
    )
    store.insert(keyed_rows, batch_size=batch_size)

    for _key, stored_row in store.items(reverse=reverse):
        yield stored_row
2142

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def read(*paths):
3232
'tabulate',
3333
'tableschema-sql',
3434
'xmljson',
35+
'bitstring>=3',
3536
]
3637
SPEEDUP_REQUIRES = [
3738
'plyvel',

tests/test_lib.py

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,99 @@ def test_sort_reverse_many_rows():
376376
results, _, _ = f.results()
377377
results = results[0]
378378
assert results[0:2] == [{'a': 999, 'b': 4}, {'a': 994, 'b': 4}]
379-
assert results[998:1000] == [{'a': 100, 'b': 0}, {'a': 0, 'b': 0}]
379+
assert results[998:1000] == [{'a': 5, 'b': 0}, {'a': 0, 'b': 0}]
380+
381+
382+
def test_sort_rows_number():
    """sort_rows orders ints and floats numerically, including negatives."""
    from dataflows import sort_rows

    unsorted_values = [
        0.1, -3, -4, 10, 8, 0, -1000000, 1000000, -0.1, -0.2, 0.2,
        -1000001, 1000001, 6, -10, -0.001, 0.001, 1, -1,
    ]
    flow = Flow(
        [{'a': value} for value in unsorted_values],
        sort_rows(key='{a}'),
    )
    resources, _, _ = flow.results()

    # The expected order is plain ascending numeric order of the inputs.
    expected = [{'a': value} for value in sorted(unsorted_values)]
    assert list(resources[0]) == expected
431+
432+
433+
def test_sort_rows_decimal():
    """sort_rows orders Decimal values loaded from data/numbers.csv."""
    from decimal import Decimal
    from dataflows import sort_rows, load

    source = load('data/numbers.csv', cast_strategy=load.CAST_WITH_SCHEMA)
    flow = Flow(source, sort_rows(key='{a}'))
    resources, _, _ = flow.results()

    # Expected ascending order, spelled as the literals used to build them.
    ascending = ('-1000', '-0.5', '-0.4', '0', '1.1', '2', '10', '1000')
    expected = [{'a': Decimal(text)} for text in ascending]
    assert list(resources[0]) == expected
452+
453+
454+
def test_sort_rows_datetime():
    """sort_rows keeps already-ascending dates in chronological order."""
    import datetime
    from dataflows import sort_rows

    dates = [
        datetime.date(2000, 1, 3),
        datetime.date(2010, 1, 2),
        datetime.date(2020, 1, 1),
    ]
    flow = Flow(
        [{'a': day} for day in dates],
        sort_rows(key='{a}'),
    )
    resources, _, _ = flow.results()

    # Built separately from the input rows so the comparison does not share
    # dict objects with the flow's source.
    expected = [{'a': day} for day in dates]
    assert list(resources[0]) == expected
380472

381473

382474
def test_duplicate():

0 commit comments

Comments
 (0)