Skip to content

Commit 83589ca

Browse files
authored
feat: implement a dirty flag (#74)
This patch introduces a dirty flag and uses it to avoid unnecessary writes. 1. It avoids writes when creating new nodes. Instead, it just caches the new nodes and marks them dirty. 2. It only flushes modified nodes instead of all cached nodes. 3. Flush no longer clears the cache, just the dirty flags.
1 parent 2ae22c5 commit 83589ca

File tree

1 file changed

+10
-26
lines changed

1 file changed

+10
-26
lines changed

hamt.go

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -110,16 +110,9 @@ type Pointer struct {
110110
Link cid.Cid `refmt:"l,omitempty"`
111111

112112
// cached node to avoid too many serialization operations
113-
// TODO(rvagg): we should check that this is actually used optimally. Flush()
114-
// performs a save of all of the cached nodes, but both Copy() and loadChild()
115-
// will set them. In the case of loadChild() we're not expecting a mutation so
116-
// a save is likely going to mean we incur unnecessary serialization when
117-
// we've simply inspected the tree. Copy() will only set a cached form if
118-
// it already exists on the source. It's unclear exactly what Flush() is good
119-
// for in its current form. Users may also need an advisory about memory
120-
// usage of large graphs since they don't have control over this outside of
121-
// Flush().
122113
cache *Node
114+
// dirty flag to indicate that the cached node needs to be flushed
115+
dirty bool
123116
}
124117

125118
// KV represents leaf storage within a HAMT node. A Pointer may hold up to
@@ -451,7 +444,7 @@ func (n *Node) checkSize(ctx context.Context) (uint64, error) {
451444
// child pointer exists in cached form.
452445
func (n *Node) Flush(ctx context.Context) error {
453446
for _, p := range n.Pointers {
454-
if p.cache != nil {
447+
if p.cache != nil && p.dirty {
455448
if err := p.cache.Flush(ctx); err != nil {
456449
return err
457450
}
@@ -461,7 +454,7 @@ func (n *Node) Flush(ctx context.Context) error {
461454
return err
462455
}
463456

464-
p.cache = nil
457+
p.dirty = false
465458
p.Link = c
466459
}
467460
}
@@ -613,6 +606,8 @@ func (n *Node) modifyValue(ctx context.Context, hv *hashBits, k []byte, v *cbg.D
613606
return err
614607
}
615608

609+
child.dirty = true
610+
616611
// CHAMP optimization, ensure the HAMT retains its canonical form for the
617612
// current data it contains. This may involve collapsing child nodes if
618613
// they no longer contain enough elements to justify their stand-alone
@@ -659,13 +654,6 @@ func (n *Node) modifyValue(ctx context.Context, hv *hashBits, k []byte, v *cbg.D
659654
if len(child.KVs) >= bucketSize {
660655
// bucket is full, create a child node (shard) with all existing bucket
661656
// elements plus the new one and set it in the place of the bucket
662-
// TODO(rvagg): this all of the modifyValue() calls are going to result
663-
// in a store.Put(), this could be improved by allowing NewNode() to take
664-
// the bulk set of elements, or modifying modifyValue() for the case
665-
// where we know for sure that the elements will go into buckets and
666-
// not cause an overflow - i.e. we just need to take each element, hash it
667-
// and consume the correct number of bytes off the digest and figure out
668-
// where it should be in the new node.
669657
sub := NewNode(n.store)
670658
sub.bitWidth = n.bitWidth
671659
sub.hash = n.hash
@@ -675,18 +663,13 @@ func (n *Node) modifyValue(ctx context.Context, hv *hashBits, k []byte, v *cbg.D
675663
}
676664

677665
for _, p := range child.KVs {
678-
chhv := &hashBits{b: n.hash([]byte(p.Key)), consumed: hv.consumed}
666+
chhv := &hashBits{b: n.hash(p.Key), consumed: hv.consumed}
679667
if err := sub.modifyValue(ctx, chhv, p.Key, p.Value); err != nil {
680668
return err
681669
}
682670
}
683671

684-
c, err := n.store.Put(ctx, sub)
685-
if err != nil {
686-
return err
687-
}
688-
689-
return n.setPointer(cindex, &Pointer{Link: c})
672+
return n.setPointer(cindex, &Pointer{cache: sub, dirty: true})
690673
}
691674

692675
// otherwise insert the new element into the array in order, the ordering is
@@ -768,6 +751,7 @@ func (n *Node) Copy() *Node {
768751
pp := &Pointer{}
769752
if p.cache != nil {
770753
pp.cache = p.cache.Copy()
754+
pp.dirty = p.dirty
771755
}
772756
pp.Link = p.Link
773757
if p.KVs != nil {
@@ -785,7 +769,7 @@ func (n *Node) Copy() *Node {
785769
// Pointers elements can either contain a bucket of local elements or be a
786770
// link to a child node. In the case of a link, isShard() returns true.
787771
func (p *Pointer) isShard() bool {
788-
return p.Link.Defined()
772+
return p.cache != nil || p.Link.Defined()
789773
}
790774

791775
// ForEach recursively calls function f on each k / val pair found in the HAMT.

0 commit comments

Comments
 (0)