Skip to content

Commit cc7bda3

Browse files
kxepal authored and stefankoegl committed
Optimize jsonpatch.make_patch for lists. Fixes #24
This optimization compares only the list subsequences that actually differ and preserves as much of the common subsequences as possible. This reduces the overall number of generated operations, since simple insertions are now possible. It also converts pairs of remove-add operations into a single replace (if the target path is the same) or a single move (if the target value is the same).
1 parent d65bd3e commit cc7bda3

File tree

2 files changed

+271
-22
lines changed

2 files changed

+271
-22
lines changed

jsonpatch.py

Lines changed: 234 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import copy
3939
import functools
4040
import inspect
41+
import itertools
4142
import json
4243
import sys
4344

@@ -53,9 +54,6 @@
5354
# pylint: disable=E0611,W0404
5455
if sys.version_info >= (3, 0):
5556
basestring = (bytes, str) # pylint: disable=C0103,W0622
56-
from itertools import zip_longest
57-
else:
58-
from itertools import izip_longest as zip_longest
5957

6058

6159
class JsonPatchException(Exception):
@@ -282,15 +280,15 @@ def compare_values(path, value, other):
282280
if value == other:
283281
return
284282
if isinstance(value, dict) and isinstance(other, dict):
285-
for operation in compare_dict(path, value, other):
283+
for operation in compare_dicts(path, value, other):
286284
yield operation
287285
elif isinstance(value, list) and isinstance(other, list):
288-
for operation in compare_list(path, value, other):
286+
for operation in compare_lists(path, value, other):
289287
yield operation
290288
else:
291289
yield {'op': 'replace', 'path': '/'.join(path), 'value': other}
292290

293-
def compare_dict(path, src, dst):
291+
def compare_dicts(path, src, dst):
294292
for key in src:
295293
if key not in dst:
296294
yield {'op': 'remove', 'path': '/'.join(path + [key])}
@@ -304,23 +302,10 @@ def compare_dict(path, src, dst):
304302
'path': '/'.join(path + [key]),
305303
'value': dst[key]}
306304

307-
def compare_list(path, src, dst):
308-
lsrc, ldst = len(src), len(dst)
309-
for idx in range(min(lsrc, ldst)):
310-
current = path + [str(idx)]
311-
for operation in compare_values(current, src[idx], dst[idx]):
312-
yield operation
313-
if lsrc < ldst:
314-
for idx in range(lsrc, ldst):
315-
current = path + [str(idx)]
316-
yield {'op': 'add',
317-
'path': '/'.join(current),
318-
'value': dst[idx]}
319-
elif lsrc > ldst:
320-
for idx in reversed(range(ldst, lsrc)):
321-
yield {'op': 'remove', 'path': '/'.join(path + [str(idx)])}
305+
def compare_lists(path, src, dst):
    """Return the patch operations that turn list `src` into list `dst`.

    Public entry point; all of the work is delegated to
    :func:`_compare_lists`.
    """
    operations = _compare_lists(path, src, dst)
    return operations
322307

323-
return cls(list(compare_dict([''], src, dst)))
308+
return cls(list(compare_dicts([''], src, dst)))
324309

325310
def to_string(self):
326311
"""Returns patch set as JSON string."""
@@ -527,3 +512,230 @@ def apply(self, obj):
527512
}).apply(obj)
528513

529514
return obj
515+
516+
517+
def _compare_lists(path, src, dst):
    """Build the (optimized) JSON patch operations between two lists.

    The lists are first split around their common runs, the remaining
    differences are turned into ``remove``/``add`` operations, and the
    result is passed through :func:`_optimize`.
    """
    left, right = _split_by_common_seq(src, dst)
    raw_operations = _compare(path, src, dst, left, right)
    return _optimize(raw_operations)
520+
521+
522+
def _longest_common_subseq(src, dst):
523+
"""Returns pair of ranges of longest common subsequence for the `src`
524+
and `dst` lists.
525+
526+
>>> src = [1, 2, 3, 4]
527+
>>> dst = [0, 1, 2, 3, 5]
528+
>>> # The longest common subsequence for these lists is [1, 2, 3]
529+
... # which is located at (0, 3) index range for src list and (1, 4) for
530+
... # dst one. Tuple of these ranges we should get back.
531+
... assert ((0, 3), (1, 4)) == _longest_common_subseq(src, dst)
532+
"""
533+
lsrc, ldst = len(src), len(dst)
534+
drange = list(range(ldst))
535+
matrix = [[0] * ldst for _ in range(lsrc)]
536+
z = 0 # length of the longest subsequence
537+
range_src, range_dst = None, None
538+
for i, j in itertools.product(range(lsrc), drange):
539+
if src[i] == dst[j]:
540+
if i == 0 or j == 0:
541+
matrix[i][j] = 1
542+
else:
543+
matrix[i][j] = matrix[i-1][j-1] + 1
544+
if matrix[i][j] > z:
545+
z = matrix[i][j]
546+
if matrix[i][j] == z:
547+
range_src = (i-z+1, i+1)
548+
range_dst = (j-z+1, j+1)
549+
else:
550+
matrix[i][j] = 0
551+
return range_src, range_dst
552+
553+
554+
def _split_by_common_seq(src, dst, bx=(0, -1), by=(0, -1)):
555+
"""Recursively splits the `dst` list onto two parts: left and right.
556+
The left part contains differences on left from common subsequence,
557+
same as the right part by for other side.
558+
559+
To easily understand the process let's take two lists: [0, 1, 2, 3] as
560+
`src` and [1, 2, 4, 5] for `dst`. If we've tried to generate the binary tree
561+
where nodes are common subsequence for both lists, leaves on the left
562+
side are subsequence for `src` list and leaves on the right one for `dst`,
563+
our tree would looks like::
564+
565+
[1, 2]
566+
/ \
567+
[0] []
568+
/ \
569+
[3] [4, 5]
570+
571+
This function generate the similar structure as flat tree, but without
572+
nodes with common subsequences - since we're don't need them - only with
573+
left and right leaves::
574+
575+
[]
576+
/ \
577+
[0] []
578+
/ \
579+
[3] [4, 5]
580+
581+
The `bx` is the absolute range for currently processed subsequence of
582+
`src` list. The `by` means the same, but for the `dst` list.
583+
"""
584+
# Prevent useless comparisons in future
585+
bx = bx if bx[0] != bx[1] else None
586+
by = by if by[0] != by[1] else None
587+
588+
if not src:
589+
return [None, by]
590+
elif not dst:
591+
return [bx, None]
592+
593+
# note that these ranges are relative for processed sublists
594+
x, y = _longest_common_subseq(src, dst)
595+
596+
if x is None or y is None: # no more any common subsequence
597+
return [bx, by]
598+
599+
return [_split_by_common_seq(src[:x[0]], dst[:y[0]],
600+
(bx[0], bx[0] + x[0]),
601+
(by[0], by[0] + y[0])),
602+
_split_by_common_seq(src[x[1]:], dst[y[1]:],
603+
(bx[0] + x[1], bx[0] + len(src)),
604+
(bx[0] + y[1], bx[0] + len(dst)))]
605+
606+
607+
def _compare(path, src, dst, left, right):
    """Yield only the operations produced by :func:`_compare_with_shift`.

    The comparison starts with an index shift of ``0`` and the shift
    value emitted next to every operation is discarded.
    """
    for pair in _compare_with_shift(path, src, dst, left, right, 0):
        yield pair[0]
611+
612+
613+
def _compare_with_shift(path, src, dst, left, right, shift):
614+
"""Recursively compares differences from `left` and `right` sides
615+
from common subsequences.
616+
617+
The `shift` parameter is used to store index shift which caused
618+
by ``add`` and ``remove`` operations.
619+
620+
Yields JSON patch operations and list index shift.
621+
"""
622+
if isinstance(left, list):
623+
for item, shift in _compare_with_shift(path, src, dst, *left,
624+
shift=shift):
625+
yield item, shift
626+
elif left is not None:
627+
for item, shift in _compare_left(path, src, left, shift):
628+
yield item, shift
629+
630+
if isinstance(right, list):
631+
for item, shift in _compare_with_shift(path, src, dst, *right,
632+
shift=shift):
633+
yield item, shift
634+
elif right is not None:
635+
for item, shift in _compare_right(path, dst, right, shift):
636+
yield item, shift
637+
638+
639+
def _compare_left(path, src, left, shift):
640+
"""Yields JSON patch ``remove`` operations for elements that are only
641+
exists in the `src` list."""
642+
start, end = left
643+
if end == -1:
644+
end = len(src)
645+
# we need to `remove` elements from list tail to not deal with index shift
646+
for idx in reversed(range(start + shift, end + shift)):
647+
current = path + [str(idx)]
648+
yield (
649+
{'op': 'remove',
650+
# yes, there should be any value field, but we'll use it
651+
# to apply `move` optimization a bit later and will remove
652+
# it in _optimize function.
653+
'value': src[idx - shift],
654+
'path': '/'.join(current)},
655+
shift - 1
656+
)
657+
shift -= 1
658+
659+
660+
def _compare_right(path, dst, right, shift):
661+
"""Yields JSON patch ``add`` operations for elements that are only
662+
exists in the `dst` list"""
663+
start, end = right
664+
if end == -1:
665+
end = len(dst)
666+
for idx in range(start, end):
667+
current = path + [str(idx)]
668+
yield (
669+
{'op': 'add', 'path': '/'.join(current), 'value': dst[idx]},
670+
shift + 1
671+
)
672+
shift += 1
673+
674+
675+
def _optimize(operations):
676+
"""Optimizes operations which was produced by lists comparison.
677+
678+
Actually it does two kinds of optimizations:
679+
680+
1. Seeks pair of ``remove`` and ``add`` operations against the same path
681+
and replaces them with ``replace`` operation.
682+
2. Seeks pair of ``remove`` and ``add`` operations for the same value
683+
and replaces them with ``move`` operation.
684+
"""
685+
result = []
686+
ops_by_path = {}
687+
ops_by_value = {}
688+
add_remove = set(['add', 'remove'])
689+
for item in operations:
690+
# could we apply "move" optimization for dict values?
691+
hashable_value = not isinstance(item['value'], (dict, list))
692+
if item['path'] in ops_by_path:
693+
_optimize_using_replace(ops_by_path[item['path']], item)
694+
continue
695+
if hashable_value and item['value'] in ops_by_value:
696+
prev_item = ops_by_value[item['value']]
697+
# ensure that we processing pair of add-remove ops
698+
if set([item['op'], prev_item['op']]) == add_remove:
699+
_optimize_using_move(prev_item, item)
700+
ops_by_value.pop(item['value'])
701+
continue
702+
result.append(item)
703+
ops_by_path[item['path']] = item
704+
if hashable_value:
705+
ops_by_value[item['value']] = item
706+
707+
# cleanup
708+
ops_by_path.clear()
709+
ops_by_value.clear()
710+
for item in result:
711+
if item['op'] == 'remove':
712+
item.pop('value') # strip our hack
713+
yield item
714+
715+
716+
def _optimize_using_replace(prev, cur):
717+
"""Optimises JSON patch by using ``replace`` operation instead of
718+
``remove`` and ``add`` against the same path."""
719+
prev['op'] = 'replace'
720+
if cur['op'] == 'add':
721+
prev['value'] = cur['value']
722+
723+
724+
def _optimize_using_move(prev_item, item):
725+
"""Optimises JSON patch by using ``move`` operation instead of
726+
``remove` and ``add`` against the different paths but for the same value."""
727+
prev_item['op'] = 'move'
728+
move_from, move_to = [
729+
(item['path'], prev_item['path']),
730+
(prev_item['path'], item['path']),
731+
][item['op'] == 'add']
732+
if item['op'] == 'add': # first was remove then add
733+
prev_item['from'] = move_from
734+
prev_item['path'] = move_to
735+
else: # first was add then remove
736+
head, move_from = move_from.rsplit('/', 1)
737+
# since add operation was first it incremented
738+
# overall index shift value. we have to fix this
739+
move_from = int(move_from) - 1
740+
prev_item['from'] = head + '/%d' % move_from
741+
prev_item['path'] = move_to

tests.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,43 @@ def test_add_nested(self):
297297
}
298298
self.assertEqual(expected, res)
299299

300+
def test_should_just_add_new_item_not_rebuild_all_list(self):
    """Prepending one item to a list yields a single ``add``."""
    before = {'foo': [1, 2, 3]}
    after = {'foo': [3, 1, 2, 3]}
    operations = list(jsonpatch.make_patch(before, after))
    self.assertEqual(len(operations), 1)
    first = operations[0]
    self.assertEqual(first['op'], 'add')
    # the generated patch must really turn `before` into `after`
    patched = jsonpatch.apply_patch(before, operations)
    self.assertEqual(patched, after)
308+
309+
def test_use_replace_instead_of_remove_add(self):
    """Changing one item in place yields a single ``replace``."""
    before = {'foo': [1, 2, 3]}
    after = {'foo': [3, 2, 3]}
    operations = list(jsonpatch.make_patch(before, after))
    self.assertEqual(len(operations), 1)
    first = operations[0]
    self.assertEqual(first['op'], 'replace')
    # the generated patch must really turn `before` into `after`
    patched = jsonpatch.apply_patch(before, operations)
    self.assertEqual(patched, after)
317+
318+
def test_use_move_instead_of_remove_add(self):
    """Moving the head of a list to its tail yields a single ``move``."""
    before = {'foo': [4, 1, 2, 3]}
    after = {'foo': [1, 2, 3, 4]}
    operations = list(jsonpatch.make_patch(before, after))
    self.assertEqual(len(operations), 1)
    first = operations[0]
    self.assertEqual(first['op'], 'move')
    # the generated patch must really turn `before` into `after`
    patched = jsonpatch.apply_patch(before, operations)
    self.assertEqual(patched, after)
326+
327+
def test_use_move_instead_of_add_remove(self):
    """Moving the tail of a list to its head yields a single ``move``."""
    before = {'foo': [1, 2, 3]}
    after = {'foo': [3, 1, 2]}
    operations = list(jsonpatch.make_patch(before, after))
    self.assertEqual(len(operations), 1)
    first = operations[0]
    self.assertEqual(first['op'], 'move')
    # the generated patch must really turn `before` into `after`
    patched = jsonpatch.apply_patch(before, operations)
    self.assertEqual(patched, after)
335+
336+
300337

301338
class InvalidInputTests(unittest.TestCase):
302339

0 commit comments

Comments
 (0)