Skip to content

Commit faf6c31

Browse files
committed
chore: refactor cleanChild() for clarity & closer match to CHAMP semantics
Closes: https://github.com/ipfs/go-hamt-ipld/issues/56
1 parent ef9df7f commit faf6c31

File tree

2 files changed

+238
-45
lines changed

2 files changed

+238
-45
lines changed

hamt.go

Lines changed: 63 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -448,54 +448,75 @@ func (n *Node) SetRaw(ctx context.Context, k string, raw []byte) error {
448448
return n.modifyValue(ctx, &hashBits{b: n.hash(kb)}, kb, d)
449449
}
450450

451-
// When deleting elements, we need to perform a compaction such that there is
452-
// a single canonical form of any HAMT with a given set of key/value pairs.
453-
// Any node with less than `bucketSize` elements needs to be collapsed into a
454-
// bucket of Pointers in the parent node.
455-
// TODO(rvagg): I don't think the logic here is correct. A compaction should
456-
// occur strictly when: there are no links to child nodes remaining (assuming
457-
// we've cleaned them first and they haven't caused a cascading collapse to
458-
// here) and the number of direct elements (actual k/v pairs) in this node is
459-
// equal o bucketSize+1. Anything less than bucketSize+1 is invalid for a node
460-
// other than the root node (which probably won't have cleanChild() called on
461-
// it). e.g.
462-
// https://github.com/rvagg/iamap/blob/fad95295b013c8b4f0faac6dd5d9be175f6e606c/iamap.js#L333
463-
// If we perform this depth-first, then it's possible to see the collapse
464-
// cascade upward such that we end up with some parent node with a bucket with
465-
// only bucketSize elements. The canonical form of the HAMT requires that
466-
// any node that could be collapsed into a parent bucket is collapsed and.
467-
func (n *Node) cleanChild(chnd *Node, cindex byte) error {
468-
l := len(chnd.Pointers)
469-
switch {
470-
case l == 0:
471-
return fmt.Errorf("incorrectly formed HAMT")
472-
case l == 1:
473-
// TODO: only do this if its a value, cant do this for shards unless pairs requirements are met.
451+
// the number of links to child nodes this node contains
452+
func (n *Node) directChildCount() int {
453+
count := 0
454+
for _, p := range n.Pointers {
455+
if p.isShard() {
456+
count++
457+
}
458+
}
459+
return count
460+
}
474461

475-
ps := chnd.Pointers[0]
476-
if ps.isShard() {
477-
return nil
462+
// the number of KV entries this node contains
463+
func (n *Node) directKVCount() int {
464+
count := 0
465+
for _, p := range n.Pointers {
466+
if !p.isShard() {
467+
count = count + len(p.KVs)
478468
}
469+
}
470+
return count
471+
}
479472

480-
return n.setPointer(cindex, ps)
481-
case l <= bucketSize:
482-
var chvals []*KV
483-
for _, p := range chnd.Pointers {
484-
if p.isShard() {
485-
return nil
486-
}
473+
// This happens after deletes to ensure that we retain canonical form for the
474+
// given set of data this HAMT contains. This is a key part of the CHAMP
475+
// algorithm. Any node that could be represented as a bucket in a parent node
476+
// should be collapsed as such. This collapsing process could continue back up
477+
// the tree as far as necessary to represent the data in the minimal HAMT form.
478+
// This operation is done from a parent perspective, so we clean the child
479+
// below us first and then our parent cleans us.
480+
func (n *Node) cleanChild(chnd *Node, cindex byte) error {
481+
if chnd.directChildCount() != 0 {
482+
// child has its own children, nothing to collapse
483+
return nil
484+
}
487485

488-
for _, sp := range p.KVs {
489-
if len(chvals) == bucketSize {
490-
return nil
491-
}
492-
chvals = append(chvals, sp)
493-
}
494-
}
495-
return n.setPointer(cindex, &Pointer{KVs: chvals})
496-
default:
486+
if chnd.directKVCount() > bucketSize {
487+
// child contains more local elements than could be collapsed
497488
return nil
498489
}
490+
491+
l := len(chnd.Pointers)
492+
if l == 0 {
493+
return fmt.Errorf("incorrectly formed HAMT")
494+
}
495+
496+
if l == 1 {
497+
// The case where the child node has a single bucket, which we know can
498+
// only contain `bucketSize` elements (maximum), so we need to pull that
499+
// bucket up into this node.
500+
// This case should only happen when it bubbles up from the case below
501+
// where a lower child has its elements compacted into a single bucket. We
502+
// shouldn't be able to reach this block unless a delete has been
503+
// performed on a lower block and we are performing a post-delete clean on
504+
// a parent block.
505+
return n.setPointer(cindex, chnd.Pointers[0])
506+
}
507+
508+
// The case where the child node contains enough elements to fit in a
509+
// single bucket and therefore can't justify its existence as a node on its
510+
// own. So we collapse all entries into a single bucket and replace the
511+
// link to the child with that bucket.
512+
// This may cause cascading collapses if this is the only bucket in the
513+
// current node, that case will be handled by our parent node by the l==1
514+
// case above.
515+
var chvals []*KV
516+
for _, p := range chnd.Pointers {
517+
chvals = append(chvals, p.KVs...)
518+
}
519+
return n.setPointer(cindex, &Pointer{KVs: chvals})
499520
}
500521

501522
// Add a new value, update an existing value, or delete a value from the HAMT,

hamt_test.go

Lines changed: 175 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,174 @@ func TestOverflow(t *testing.T) {
113113
}
114114
}
115115

116+
func TestFillAndCollapse(t *testing.T) {
117+
ctx := context.Background()
118+
cs := cbor.NewCborStore(newMockBlocks())
119+
root := NewNode(cs, UseTreeBitWidth(8), UseHashFunction(identityHash))
120+
val := randValue()
121+
122+
// start with a single node and a single full bucket
123+
if err := root.Set(ctx, "AAAAAA11", val); err != nil {
124+
t.Fatal(err)
125+
}
126+
if err := root.Set(ctx, "AAAAAA12", val); err != nil {
127+
t.Fatal(err)
128+
}
129+
if err := root.Set(ctx, "AAAAAA21", val); err != nil {
130+
t.Fatal(err)
131+
}
132+
133+
st := stats(root)
134+
fmt.Println(st)
135+
printHamt(root)
136+
if st.totalNodes != 1 || st.totalKvs != 3 || st.counts[3] != 1 {
137+
t.Fatal("Should be 1 node with 1 bucket")
138+
}
139+
140+
baseCid, err := cs.Put(ctx, root)
141+
if err != nil {
142+
t.Fatal(err)
143+
}
144+
145+
// add a 4th colliding entry that forces a chain of new nodes to accommodate
146+
// in a new node where there aren't collisions (7th byte)
147+
if err := root.Set(ctx, "AAAAAA22", val); err != nil {
148+
t.Fatal(err)
149+
}
150+
151+
st = stats(root)
152+
fmt.Println(st)
153+
printHamt(root)
154+
if st.totalNodes != 7 || st.totalKvs != 4 || st.counts[2] != 2 {
155+
t.Fatal("Should be 7 nodes with 4 buckets")
156+
}
157+
158+
// remove and we should be back to the same structure as before
159+
if err := root.Delete(ctx, "AAAAAA22"); err != nil {
160+
t.Fatal(err)
161+
}
162+
163+
st = stats(root)
164+
fmt.Println(st)
165+
printHamt(root)
166+
if st.totalNodes != 1 || st.totalKvs != 3 || st.counts[3] != 1 {
167+
t.Fatal("Should be 1 node with 1 bucket")
168+
}
169+
170+
c, err := cs.Put(ctx, root)
171+
if err != nil {
172+
t.Fatal(err)
173+
}
174+
if !c.Equals(baseCid) {
175+
t.Fatal("CID mismatch on mutation")
176+
}
177+
178+
// insert elements that collide at the 4th position so push the tree down by
179+
// 3 nodes
180+
if err := root.Set(ctx, "AAA11AA", val); err != nil {
181+
t.Fatal(err)
182+
}
183+
if err := root.Set(ctx, "AAA12AA", val); err != nil {
184+
t.Fatal(err)
185+
}
186+
if err := root.Set(ctx, "AAA13AA", val); err != nil {
187+
t.Fatal(err)
188+
}
189+
st = stats(root)
190+
fmt.Println(st)
191+
printHamt(root)
192+
if st.totalNodes != 4 || st.totalKvs != 6 || st.counts[3] != 2 {
193+
t.Fatal("Should be 4 nodes with 2 buckets of 3")
194+
}
195+
196+
midCid, err := cs.Put(ctx, root)
197+
if err != nil {
198+
t.Fatal(err)
199+
}
200+
201+
// insert an overflow node that pushes the previous 4 into a separate node
202+
if err := root.Set(ctx, "AAA14AA", val); err != nil {
203+
t.Fatal(err)
204+
}
205+
206+
st = stats(root)
207+
fmt.Println(st)
208+
printHamt(root)
209+
if st.totalNodes != 5 || st.totalKvs != 7 || st.counts[1] != 4 || st.counts[3] != 1 {
210+
t.Fatal("Should be 4 node with 2 buckets")
211+
}
212+
213+
// put the colliding 4th back in that will push down to full height
214+
if err := root.Set(ctx, "AAAAAA22", val); err != nil {
215+
t.Fatal(err)
216+
}
217+
218+
st = stats(root)
219+
fmt.Println(st)
220+
printHamt(root)
221+
if st.totalNodes != 8 || st.totalKvs != 8 || st.counts[1] != 4 || st.counts[2] != 2 {
222+
t.Fatal("Should be 7 nodes with 5 buckets")
223+
}
224+
225+
// rewind back one step
226+
if err := root.Delete(ctx, "AAAAAA22"); err != nil {
227+
t.Fatal(err)
228+
}
229+
230+
st = stats(root)
231+
fmt.Println(st)
232+
printHamt(root)
233+
if st.totalNodes != 5 || st.totalKvs != 7 || st.counts[1] != 4 || st.counts[3] != 1 {
234+
t.Fatal("Should be 4 node with 2 buckets")
235+
}
236+
237+
// rewind another step
238+
if err := root.Delete(ctx, "AAA14AA"); err != nil {
239+
t.Fatal(err)
240+
}
241+
st = stats(root)
242+
fmt.Println(st)
243+
printHamt(root)
244+
if st.totalNodes != 4 || st.totalKvs != 6 || st.counts[3] != 2 {
245+
t.Fatal("Should be 4 nodes with 2 buckets of 3")
246+
}
247+
248+
c, err = cs.Put(ctx, root)
249+
if err != nil {
250+
t.Fatal(err)
251+
}
252+
if !c.Equals(midCid) {
253+
t.Fatal("CID mismatch on mutation")
254+
}
255+
256+
// remove the 3 colliding node so we should be back to the initial state
257+
if err := root.Delete(ctx, "AAA11AA"); err != nil {
258+
t.Fatal(err)
259+
}
260+
if err := root.Delete(ctx, "AAA12AA"); err != nil {
261+
t.Fatal(err)
262+
}
263+
if err := root.Delete(ctx, "AAA13AA"); err != nil {
264+
t.Fatal(err)
265+
}
266+
267+
st = stats(root)
268+
fmt.Println(st)
269+
printHamt(root)
270+
if st.totalNodes != 1 || st.totalKvs != 3 || st.counts[3] != 1 {
271+
t.Fatal("Should be 1 node with 1 bucket")
272+
}
273+
274+
// should have the same CID as original
275+
c, err = cs.Put(ctx, root)
276+
if err != nil {
277+
t.Fatal(err)
278+
}
279+
if !c.Equals(baseCid) {
280+
t.Fatal("CID mismatch on mutation")
281+
}
282+
}
283+
116284
func addAndRemoveKeys(t *testing.T, keys []string, extraKeys []string, options ...Option) {
117285
ctx := context.Background()
118286
vals := make(map[string][]byte)
@@ -129,12 +297,12 @@ func addAndRemoveKeys(t *testing.T, keys []string, extraKeys []string, options .
129297
}
130298
}
131299

132-
// fmt.Println("start flush")
133-
// bef := time.Now()
300+
fmt.Println("start flush")
301+
bef := time.Now()
134302
if err := begn.Flush(ctx); err != nil {
135303
t.Fatal(err)
136304
}
137-
// fmt.Println("flush took: ", time.Since(bef))
305+
fmt.Println("flush took: ", time.Since(bef))
138306
c, err := cs.Put(ctx, begn)
139307
if err != nil {
140308
t.Fatal(err)
@@ -249,6 +417,10 @@ type hamtStats struct {
249417
counts map[int]int
250418
}
251419

420+
func (hs hamtStats) String() string {
421+
return fmt.Sprintf("nodes=%d, kvs=%d, counts=%v", hs.totalNodes, hs.totalKvs, hs.counts)
422+
}
423+
252424
func stats(n *Node) *hamtStats {
253425
st := &hamtStats{counts: make(map[int]int)}
254426
statsrec(n, st)

0 commit comments

Comments
 (0)