feat: custom chunker registry (#1116)

alanshaw · lidel · web-flow · commit e8ef4a2593a0 · 2026-03-10T15:40:18.000+01:00
* feat: custom chunker registry

* doc: update changelog

* fix: protect chunker registry from concurrent access

Register writes to the splitters map and FromString reads from it
without synchronization, which is a data race when the two are called
concurrently.

- chunker/registry.go: add splittersMu sync.RWMutex, Lock in Register
- chunker/parse.go: RLock/RUnlock around map read in FromString

* fix: validate inputs and reject duplicates in chunker Register

Register silently accepted empty names, names with dashes (unmatchable
by FromString which splits on dash), nil constructors, and duplicate
registrations that could overwrite built-ins.

- chunker/registry.go: panic on empty name, dash in name, nil ctor,
  duplicate name (follows database/sql.Register convention)
- chunker/registry_test.go: use unique names per subtest, add panic
  test cases, add comment explaining why not parallel

* refactor: rename CtorFunc to SplitterFunc

CtorFunc is an uncommon abbreviation. SplitterFunc follows Go naming
conventions (like http.HandlerFunc) and clearly communicates purpose.

- chunker/registry.go: rename type and all usages

* docs: improve chunker registry and FromString godocs

FromString doc only listed built-in chunkers and didn't mention custom
ones. Package doc didn't mention extensibility. SplitterFunc had no doc.

- chunker/registry.go: add SplitterFunc doc, rewrite Register doc with
  init-time usage example and panic contract
- chunker/parse.go: rewrite FromString doc to list built-in chunkers
  and mention Register
- chunker/splitting.go: expand package doc to mention extensibility
  via Register and FromString

* test: cover default, unknown, and strict size-{n} parse paths

- chunker/parse_test.go: add TestParseDefault ("", "default"),
  TestParseUnknown, reject "size-123-extra" in TestParseSize

* docs: note size- parsing strictness change in CHANGELOG

The old FromString silently accepted "size-123-extra" by only reading
the first segment after the dash. The refactored parseSizeString now
validates the format and rejects extra parameters.

* fix: use Register in init to apply validation uniformly

Built-in chunkers were written directly to the map, bypassing the
name and duplicate checks in Register. Call Register instead so the
same rules apply to built-ins and custom chunkers alike.

* refactor: use strings.Cut for chunker name extraction

Replaces strings.Index + manual slicing with strings.Cut.

* refactor: rename ctor parameter to fn in Register

ctor was a leftover from the original CtorFunc type name. fn is the
idiomatic Go convention for function-typed parameters.

* docs: clarify SplitterGen vs SplitterFunc godocs

---------

Co-authored-by: Marcin Rataj <lidel@lidel.org>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,8 +16,12 @@ The following emojis are used to highlight certain changes:
 
 ### Added
 
+- `chunker`: added `Register` function to allow custom chunkers to be registered for use with `FromString`.
+
 ### Changed
 
+- `chunker`: `FromString` now rejects malformed `size-` strings with extra parameters (e.g. `size-123-extra` was previously silently accepted).
+
 ### Removed
 
 - `cmd/boxo-migrate`: removed code for go-ipfs migration -- no longer needed.
diff --git a/chunker/parse.go b/chunker/parse.go
@@ -39,35 +39,51 @@ var (
 	ErrSizeMax  = fmt.Errorf("chunker parameters may not exceed the maximum chunk size of %d", ChunkSizeLimit)
 )
 
-// FromString returns a Splitter depending on the given string:
-// it supports "default" (""), "size-{size}", "rabin", "rabin-{blocksize}",
-// "rabin-{min}-{avg}-{max}" and "buzhash".
+// FromString returns a [Splitter] for the given chunker specification string.
+//
+// Built-in chunkers:
+//
+//   - "" or "default" -- fixed-size chunks using [DefaultBlockSize]
+//   - "size-{size}" -- fixed-size chunks of the given byte size
+//   - "rabin" -- Rabin fingerprint chunking with [DefaultBlockSize] average
+//   - "rabin-{avg}" -- Rabin fingerprint chunking with the given average size
+//   - "rabin-{min}-{avg}-{max}" -- Rabin with explicit bounds
+//   - "buzhash" -- Buzhash content-defined chunking
+//
+// Custom chunkers registered via [Register] are also available.
+// The name is everything before the first dash, or the whole string if it has no dash.
 func FromString(r io.Reader, chunker string) (Splitter, error) {
-	switch {
-	case chunker == "" || chunker == "default":
+	if chunker == "" || chunker == "default" {
 		return DefaultSplitter(r), nil
-
-	case strings.HasPrefix(chunker, "size-"):
-		sizeStr := strings.Split(chunker, "-")[1]
-		size, err := strconv.Atoi(sizeStr)
-		if err != nil {
-			return nil, err
-		} else if size <= 0 {
-			return nil, ErrSize
-		} else if size > ChunkSizeLimit {
-			return nil, ErrSizeMax
-		}
-		return NewSizeSplitter(r, int64(size)), nil
-
-	case strings.HasPrefix(chunker, "rabin"):
-		return parseRabinString(r, chunker)
-
-	case chunker == "buzhash":
-		return NewBuzhash(r), nil
-
-	default:
+	}
+	name, _, _ := strings.Cut(chunker, "-")
+	splittersMu.RLock()
+	ctor, ok := splitters[name]
+	splittersMu.RUnlock()
+	if !ok {
 		return nil, fmt.Errorf("unrecognized chunker option: %s", chunker)
 	}
+	return ctor(r, chunker)
+}
+
+func parseSizeString(r io.Reader, chunker string) (Splitter, error) {
+	parts := strings.Split(chunker, "-")
+	if len(parts) != 2 {
+		return nil, errors.New("incorrect chunker string format (expected size-{size})")
+	}
+	size, err := strconv.Atoi(parts[1])
+	if err != nil {
+		return nil, err
+	} else if size <= 0 {
+		return nil, ErrSize
+	} else if size > ChunkSizeLimit {
+		return nil, ErrSizeMax
+	}
+	return NewSizeSplitter(r, int64(size)), nil
+}
+
+func parseBuzhashString(r io.Reader, _ string) (Splitter, error) {
+	return NewBuzhash(r), nil
 }
 
 func parseRabinString(r io.Reader, chunker string) (Splitter, error) {
diff --git a/chunker/parse_test.go b/chunker/parse_test.go
@@ -102,4 +102,43 @@ func TestParseSize(t *testing.T) {
 	if err != ErrSizeMax {
 		t.Fatalf("Expected 'ErrSizeMax', got: %#v", err)
 	}
+
+	_, err = FromString(r, "size-123-extra")
+	if err == nil {
+		t.Fatal("expected error for size string with extra parameters")
+	}
+}
+
+func TestParseDefault(t *testing.T) {
+	t.Parallel()
+
+	r := bytes.NewReader(randBuf(t, 1000))
+
+	s, err := FromString(r, "")
+	if err != nil {
+		t.Fatalf("expected success for empty string, got: %v", err)
+	}
+	if s == nil {
+		t.Fatal("expected non-nil splitter for empty string")
+	}
+
+	r.Reset(randBuf(t, 1000))
+	s, err = FromString(r, "default")
+	if err != nil {
+		t.Fatalf("expected success for \"default\", got: %v", err)
+	}
+	if s == nil {
+		t.Fatal("expected non-nil splitter for \"default\"")
+	}
+}
+
+func TestParseUnknown(t *testing.T) {
+	t.Parallel()
+
+	r := bytes.NewReader(randBuf(t, 1000))
+
+	_, err := FromString(r, "unknown-chunker")
+	if err == nil {
+		t.Fatal("expected error for unregistered chunker")
+	}
 }
diff --git a/chunker/registry.go b/chunker/registry.go
@@ -0,0 +1,62 @@
+package chunk
+
+import (
+	"io"
+	"strings"
+	"sync"
+)
+
+// SplitterFunc creates a [Splitter] from a reader and a specification
+// string such as "mychunker-param1-param2". It is used to register
+// custom chunkers via [Register] so they become available globally
+// through [FromString]. The function is responsible for parsing and
+// validating any parameters encoded in the string.
+type SplitterFunc func(r io.Reader, chunker string) (Splitter, error)
+
+var (
+	splittersMu sync.RWMutex
+	splitters   = map[string]SplitterFunc{}
+)
+
+func init() {
+	Register("size", parseSizeString)
+	Register("rabin", parseRabinString)
+	Register("buzhash", parseBuzhashString)
+}
+
+// Register makes a custom chunker available to [FromString] under the given
+// name. The name is matched against the portion of the chunker string before
+// the first dash. For example, passing "mychunker-128" to [FromString]
+// selects the chunker registered as "mychunker", and the [SplitterFunc]
+// receives the full string "mychunker-128" so it can parse its own parameters.
+//
+// Register is typically called from an init function:
+//
+//	func init() {
+//	    chunk.Register("mychunker", func(r io.Reader, s string) (chunk.Splitter, error) {
+//	        // parse parameters from s, return a Splitter
+//	    })
+//	}
+//
+// Register panics if name is empty, contains a dash, fn is nil, or a
+// chunker with the same name is already registered. This follows the
+// convention established by [database/sql.Register].
+//
+// Register is safe for concurrent use.
+func Register(name string, fn SplitterFunc) {
+	splittersMu.Lock()
+	defer splittersMu.Unlock()
+	if name == "" {
+		panic("chunk: Register name is empty")
+	}
+	if strings.Contains(name, "-") {
+		panic("chunk: Register name must not contain a dash: " + name)
+	}
+	if fn == nil {
+		panic("chunk: Register fn is nil")
+	}
+	if _, dup := splitters[name]; dup {
+		panic("chunk: Register called twice for chunker " + name)
+	}
+	splitters[name] = fn
+}
diff --git a/chunker/registry_test.go b/chunker/registry_test.go
@@ -0,0 +1,113 @@
+package chunk_test
+
+import (
+	"bytes"
+	"io"
+	"testing"
+
+	chunk "github.com/ipfs/boxo/chunker"
+)
+
+type noSplits struct {
+	r       io.Reader
+	drained bool
+}
+
+func (ns *noSplits) Reader() io.Reader {
+	return ns.r
+}
+
+func (ns *noSplits) NextBytes() ([]byte, error) {
+	if ns.drained {
+		return nil, io.EOF
+	}
+	ns.drained = true
+	return io.ReadAll(ns.r)
+}
+
+// TestRegister is not parallel because Register mutates package-level state
+// and panics on duplicate names.
+func TestRegister(t *testing.T) {
+	t.Run("name only", func(t *testing.T) {
+		chunk.Register("mockplain", func(r io.Reader, _ string) (chunk.Splitter, error) {
+			return &noSplits{r: r}, nil
+		})
+		r := bytes.NewReader([]byte{1, 2, 3})
+		s, err := chunk.FromString(r, "mockplain")
+		if err != nil {
+			t.Fatal(err)
+		}
+		if _, ok := s.(*noSplits); !ok {
+			t.Fatal("unexpected splitter type")
+		}
+	})
+
+	t.Run("name and params", func(t *testing.T) {
+		const chunkerStr = "mockparams-123"
+		chunk.Register("mockparams", func(r io.Reader, c string) (chunk.Splitter, error) {
+			if c != chunkerStr {
+				t.Fatalf("expected chunker string %q, got %q", chunkerStr, c)
+			}
+			return &noSplits{r: r}, nil
+		})
+		r := bytes.NewReader([]byte{1, 2, 3})
+		s, err := chunk.FromString(r, chunkerStr)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if _, ok := s.(*noSplits); !ok {
+			t.Fatal("unexpected splitter type")
+		}
+	})
+
+	t.Run("unregistered name", func(t *testing.T) {
+		r := bytes.NewReader([]byte{1, 2, 3})
+		_, err := chunk.FromString(r, "nonexistent")
+		if err == nil {
+			t.Fatal("expected error for unregistered chunker")
+		}
+	})
+
+	t.Run("panic on empty name", func(t *testing.T) {
+		defer func() {
+			if r := recover(); r == nil {
+				t.Fatal("expected panic for empty name")
+			}
+		}()
+		chunk.Register("", func(r io.Reader, _ string) (chunk.Splitter, error) {
+			return nil, nil
+		})
+	})
+
+	t.Run("panic on name with dash", func(t *testing.T) {
+		defer func() {
+			if r := recover(); r == nil {
+				t.Fatal("expected panic for name with dash")
+			}
+		}()
+		chunk.Register("my-format", func(r io.Reader, _ string) (chunk.Splitter, error) {
+			return nil, nil
+		})
+	})
+
+	t.Run("panic on nil ctor", func(t *testing.T) {
+		defer func() {
+			if r := recover(); r == nil {
+				t.Fatal("expected panic for nil ctor")
+			}
+		}()
+		chunk.Register("nilctor", nil)
+	})
+
+	t.Run("panic on duplicate name", func(t *testing.T) {
+		defer func() {
+			if r := recover(); r == nil {
+				t.Fatal("expected panic for duplicate name")
+			}
+		}()
+		// "mockplain" was already registered in "name only" subtest.
+		chunk.Register("mockplain", func(r io.Reader, _ string) (chunk.Splitter, error) {
+			return nil, nil
+		})
+	})
+}
diff --git a/chunker/splitting.go b/chunker/splitting.go
@@ -1,7 +1,11 @@
 // Package chunk implements streaming block splitters.
-// Splitters read data from a reader and provide byte slices (chunks)
-// The size and contents of these slices depend on the splitting method
-// used.
+//
+// Splitters read data from a reader and produce byte slices (chunks).
+// The size and contents of these slices depend on the splitting method.
+//
+// Built-in methods include fixed-size, Rabin fingerprint, and Buzhash
+// content-defined chunking. Additional methods can be registered with
+// [Register] and instantiated through [FromString].
 package chunk
 
 import (
@@ -26,16 +30,19 @@ type Splitter interface {
 	NextBytes() ([]byte, error)
 }
 
-// SplitterGen is a splitter generator, given a reader.
+// SplitterGen creates a [Splitter] from a reader.
+// It is used at runtime by callers that already know which chunking
+// strategy and parameters they want (e.g. "fixed-size at 256 KiB").
+// See [SizeSplitterGen] for a convenient way to build one.
 type SplitterGen func(r io.Reader) Splitter
 
 // DefaultSplitter returns a SizeSplitter with the DefaultBlockSize.
 func DefaultSplitter(r io.Reader) Splitter {
 	return NewSizeSplitter(r, DefaultBlockSize)
 }
 
-// SizeSplitterGen returns a SplitterGen function which will create
-// a splitter with the given size when called.
+// SizeSplitterGen returns a [SplitterGen] that creates a fixed-size
+// [Splitter] with the given block size.
 func SizeSplitterGen(size int64) SplitterGen {
 	return func(r io.Reader) Splitter {
 		return NewSizeSplitter(r, size)