cockroachdb
diff --git a/‎.github/workflows/ci.yaml‎
Lines changed: 34 additions & 5 deletions b/‎.github/workflows/ci.yaml‎
Lines changed: 34 additions & 5 deletions
diff --git a/‎crsync/counters.go‎
Lines changed: 164 additions & 0 deletions b/‎crsync/counters.go‎
Lines changed: 164 additions & 0 deletions
diff --git a/‎crsync/counters_test.go‎
Lines changed: 143 additions & 0 deletions b/‎crsync/counters_test.go‎
Lines changed: 143 additions & 0 deletions
@@ -12,7 +12,6 @@ on:
 jobs:
 
   linux:
-    name: go-linux
     strategy:
       matrix:
         go: ["1.22", "1.23"]
@@ -30,7 +29,6 @@ jobs:
       - run: go vet ./...
 
   linux-32bit:
-    name: go-linux-32bit
     strategy:
       matrix:
         go: ["1.22"]
@@ -47,7 +45,6 @@ jobs:
       - run: go test ./...
 
   darwin:
-    name: go-macos
     strategy:
       matrix:
         go: ["1.22"]
@@ -64,7 +61,6 @@ jobs:
       - run: go test ./...
 
   linux-stress:
-    name: go-linux-stress
     strategy:
       matrix:
         go: ["1.22"]
@@ -82,7 +78,6 @@ jobs:
       - run: go test ./... -exec 'stress -p 2 -maxruns 1000' -v
 
   linux-stress-race:
-    name: go-linux-stress-race
     strategy:
       matrix:
         go: ["1.22"]
@@ -98,3 +93,37 @@ jobs:
       - run: go install github.com/cockroachdb/stress@latest
       - run: go test -tags crlib_invariants ./... -race -exec 'stress -p 1 -maxruns 100' -v
       - run: go test ./... -race -exec 'stress -p 1 -maxruns 100' -v
+
+  linux-cockroach-go:
+    runs-on: ubuntu-latest
+    env:
+      GO_BRANCH: cockroach-go1.23.12
+
+    steps:
+      - uses: actions/checkout@v4
+
+      # Step 1: Fetch the branch tip SHA for cache key
+      - name: Get cockroachdb/go commit hash
+        id: go-sha
+        run: |
+          SHA=$(git ls-remote https://github.com/cockroachdb/go.git refs/heads/$GO_BRANCH | cut -f1)
+          echo "GO_SHA=$SHA" >> $GITHUB_ENV
+
+      # Step 2: Restore cache (per branch + commit SHA)
+      - name: Cache custom Go toolchain
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/cockroachdb-go/${{ env.GO_SHA }}
+          key: cockroachdb-${{ env.GO_SHA }}
+
+      # Step 3: Install bootstrap Go (needed to build fork)
+      - name: Install bootstrap Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: "1.23.x"
+
+      # Step 4: Run tests with custom Go
+      - run: ./scripts/run-tests-with-custom-go.sh ./...
+      - run: ./scripts/run-tests-with-custom-go.sh -tags crlib_invariants ./...
+      - run: ./scripts/run-tests-with-custom-go.sh -race ./...
+
@@ -0,0 +1,164 @@
+// Copyright 2025 The Cockroach Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+
+package crsync
+
+import (
+	"sync/atomic"
+	"unsafe"
+)
+
+// Counter is a single logical counter backed by a sharded implementation
+// (Counters) under the hood.
+//
+// Properties:
+//   - Thread-safe increments: Add() can be called concurrently from many
+//     goroutines.
+//   - Low write contention: Writes are sharded to minimize cache-line
+//     ping‑pong.
+//   - Simple reads: Get() aggregates across shards to return the current value.
+//   - Construction: Use MakeCounter(). The zero value is NOT ready to use.
+//   - Performance: Add is O(1) with low contention; Get is O(NumShards()).
+//   - Consistency: Reads are best-effort snapshots without global locking. Each
+//     shard is read atomically, but the aggregation is not linearizable with
+//     respect to concurrent Add calls. This is typically acceptable for metrics
+//     and counters.
+//
+// Example:
+//
+//	c := MakeCounter()
+//	c.Add(1)
+//	c.Add(41)
+//	fmt.Println(c.Get()) // 42
+type Counter struct {
+	c Counters
+}
+
+// MakeCounter initializes a new Counter.
+func MakeCounter() Counter {
+	return Counter{
+		c: MakeCounters(1),
+	}
+}
+
+// Add atomically adds delta to the counter. It is safe for concurrent use by
+// multiple goroutines; delta may be negative (decrement).
+//
+// Add is very efficient: a single atomic increment on a mostly uncontended
+// cache line.
+func (c *Counter) Add(delta int64) {
+	c.c.Add(0, delta)
+}
+
+// Get the current value of the counter.
+//
+// It is safe to call Get() while there are concurrent Add() calls (but there is no
+// guarantee wrt which of those are reflected).
+//
+// Get is O(NumShards()) so it is more expensive than Add().
+func (c *Counter) Get() int64 {
+	return c.c.Get(0)
+}
+
+// Counters is a sharded set of logical counters that can be incremented
+// concurrently with low contention.
+//
+// Use when you need N independent counters that are updated from many
+// goroutines (e.g., metrics like hits/misses/errors, per-state tallies).
+//
+// Properties:
+//   - Thread-safe increments: Add() can be called concurrently from many
+//     goroutines.
+//   - Low write contention: Writes are sharded to minimize cache-line
+//     ping‑pong.
+//   - Simple reads: Get() aggregates across shards to return the current value.
+//   - Construction: Use MakeCounters(). The zero value is NOT ready to use.
+//   - Performance: Add is O(1) with low contention; Get is O(NumShards()).
+//   - Consistency: Reads are best-effort snapshots without global locking. Each
+//     shard is read atomically, but the aggregation is not linearizable with
+//     respect to concurrent Add calls. This is typically acceptable for metrics
+//     and counters.
+type Counters struct {
+	numShards uint32
+	// shardSize is the number of counters per shard.
+	shardSize   uint32
+	counters    []atomic.Int64
+	numCounters int
+}
+
+// Number of counters per cache line. We assume the typical 64-byte cache line.
+// Must be a power of 2.
+const countersPerCacheLine = 8
+
+// MakeCounters creates a new Counters with the specified number of counters.
+func MakeCounters(numCounters int) Counters {
+	return makeCounters(NumShards(), numCounters)
+}
+
+func makeCounters(numShards, numCounters int) Counters {
+	// Round up to the nearest cacheline size, to avoid false sharing.
+	shardSize := (numCounters + countersPerCacheLine - 1) &^ (countersPerCacheLine - 1)
+	counters := make([]atomic.Int64, shardSize*numShards+countersPerCacheLine)
+	// Align the slice to a cache line.
+	if r := (uintptr(unsafe.Pointer(&counters[0])) / 8) & (countersPerCacheLine - 1); r != 0 {
+		counters = counters[countersPerCacheLine-r:]
+	}
+	return Counters{
+		numShards:   uint32(numShards),
+		shardSize:   uint32(shardSize),
+		counters:    counters,
+		numCounters: numCounters,
+	}
+}
+
+// Add atomically adds delta to the specified counter. It is safe for concurrent
+// use by multiple goroutines; delta may be negative (decrement).
+//
+// Add is very efficient: a single atomic increment on a mostly uncontended
+// cache line.
+func (c *Counters) Add(counter int, delta int64) {
+	shard := uint32(CPUBiasedInt()) % c.numShards
+	c.counters[shard*c.shardSize+uint32(counter)].Add(delta)
+}
+
+// Get the current value of the specified counter.
+//
+// It is safe to call Get() while there are concurrent Add() calls (but there is no
+// guarantee wrt which of those are reflected).
+//
+// Get is O(NumShards()) so it is more expensive than Add().
+func (c *Counters) Get(counter int) int64 {
+	var res int64
+	for shard := range c.numShards {
+		res += c.counters[shard*c.shardSize+uint32(counter)].Load()
+	}
+	return res
+}
+
+// All returns the current values of all counters.
+//   - Length of the returned slice equals the number of logical counters
+//     passed to MakeCounters.
+//   - Safe for concurrent use.
+//   - Complexity is O(NumShards() * numCounters).
+//   - Snapshot semantics: no ordering guarantees w.r.t. concurrent updates.
+func (c *Counters) All() []int64 {
+	res := make([]int64, c.numCounters)
+	for i := range c.numShards {
+		start := int(i * c.shardSize)
+		for j := range res {
+			res[j] += c.counters[start+j].Load()
+		}
+	}
+	return res
+}
@@ -0,0 +1,143 @@
+// Copyright 2025 The Cockroach Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+
+package crsync
+
+import (
+	"fmt"
+	"math/rand/v2"
+	"runtime"
+	"sync"
+	"sync/atomic"
+	"testing"
+)
+
+// BenchmarkCounters compares the performance of Counters against simple atomic
+// counters, and against sharded counters with 4*P shards and random shard
+// choice. There are two Counters versions (crsync and crsync-cr), depending on
+// whether the CockroachDB Go runtime (and cockroach_go tag) is used.
+//
+// # Benchmark results
+//
+// ## Apple M1 Pro (10 core)
+//
+// benchmark      simple       randshards   crsync        crsync-cr
+// c=1/p=1-10     6.96ns ± 0%  9.69ns ± 0%  12.02ns ± 0%  7.34ns ± 1%
+// c=1/p=4-10      169ns ±10%    53ns ± 4%     21ns ±39%    13ns ±26%
+// c=1/p=10-10     752ns ± 3%   125ns ± 0%     51ns ±20%    51ns ± 7% *
+// c=1/p=40-10    3.04µs ± 2%  0.67µs ±13%   0.26µs ±31%  0.29µs ±16%
+// c=10/p=1-10    4.49ns ± 1%  9.72ns ± 0%  12.39ns ± 0%  4.95ns ± 0%
+// c=10/p=4-10     147ns ± 5%    49ns ± 3%     27ns ±33%     6ns ± 1%
+// c=10/p=10-10    790ns ± 6%   106ns ± 0%     47ns ±20%     8ns ± 4% *
+// c=10/p=40-10   3.24µs ± 2%  0.61µs ±11%   0.24µs ± 9%  0.11µs ±28%
+// c=100/p=1-10   4.33ns ± 0%  9.76ns ± 0%  12.41ns ± 0%  4.82ns ± 0%
+// c=100/p=4-10   73.9ns ± 4%  46.0ns ± 5%   21.9ns ±22%   6.2ns ± 6%
+// c=100/p=10-10   197ns ± 1%    94ns ±10%     53ns ±17%    11ns ± 1% *
+// c=100/p=40-10   893ns ± 6%   524ns ± 7%    249ns ±19%   125ns ± 8%
+// .                                              * one worker per core
+//
+// ## Intel(R) Xeon(R) CPU @ 2.80GHz (24 core, n2-custom-24-32768 on GCE)
+//
+// benchmark      simple       randshards   crsync       crsync-cr
+// c=1/p=1-24     14.1ns ± 0%  21.0ns ± 1%  37.7ns ± 0%  13.5ns ± 0%
+// c=1/p=4-24     92.9ns ± 1%  50.1ns ± 1%  63.8ns ±29%  13.3ns ± 0%
+// c=1/p=24-24     487ns ±18%  178ns ±105%   144ns ±39%    57ns ±60% *
+// c=1/p=96-24    1.84µs ± 2%  0.59µs ± 3%  0.52µs ± 6%  0.29µs ± 7%
+// c=10/p=1-24    13.8ns ± 0%  21.2ns ± 1%  38.0ns ± 1%  14.1ns ± 3%
+// c=10/p=4-24    91.0ns ± 3%  48.6ns ± 1%  63.8ns ±16%  14.0ns ± 2%
+// c=10/p=24-24    461ns ± 8%   176ns ±53%   146ns ±36%   110ns ±84% *
+// c=10/p=96-24   1.79µs ± 1%  0.55µs ± 8%  0.52µs ± 6%  0.31µs ± 5%
+// c=100/p=1-24   13.7ns ± 0%  22.0ns ± 2%  38.0ns ± 0%  14.1ns ±10%
+// c=100/p=4-24   63.5ns ± 1%  46.4ns ± 2%  66.7ns ±30%  14.2ns ± 5%
+// c=100/p=24-24   295ns ±27%    87ns ± 1%   121ns ±24%    44ns ±71% *
+// c=100/p=96-24  1.11µs ± 2%  0.53µs ± 4%  0.52µs ± 8%  0.31µs ± 5%
+// .                                             * one worker per core
+func BenchmarkCounters(b *testing.B) {
+	forEach := func(b *testing.B, fn func(b *testing.B, c, p int)) {
+		for _, c := range []int{1, 10, 100} {
+			for _, p := range []int{1, 4, runtime.GOMAXPROCS(0), 4 * runtime.GOMAXPROCS(0)} {
+				b.Run(fmt.Sprintf("c=%d/p=%d", c, p), func(b *testing.B) {
+					fn(b, c, p)
+				})
+			}
+		}
+	}
+
+	// simple uses non-sharded atomic counters.
+	b.Run("simple", func(b *testing.B) {
+		forEach(b, func(b *testing.B, c, p int) {
+			counters := make([]atomic.Int64, c)
+			incCounter := func(counter int) {
+				counters[counter].Add(1)
+			}
+			runCountersBenchmark(b, c, p, incCounter)
+		})
+	})
+
+	// randshards uses 4*P shards (P = GOMAXPROCS) with random shard choice.
+	b.Run("randshards", func(b *testing.B) {
+		forEach(b, func(b *testing.B, c, p int) {
+			counters := makeCounters(runtime.GOMAXPROCS(0)*4, c)
+			incCounter := func(counter int) {
+				shard := rand.Uint32N(counters.numShards)
+				counters.counters[shard*counters.shardSize+uint32(counter)].Add(1)
+			}
+			runCountersBenchmark(b, c, p, incCounter)
+		})
+	})
+
+	name := "crsync"
+	if usingCockroachGo {
+		name += "-cr"
+	}
+	b.Run(name, func(b *testing.B) {
+		forEach(b, func(b *testing.B, c, p int) {
+			counters := MakeCounters(c)
+			incCounter := func(counter int) {
+				counters.Add(counter, 1)
+			}
+			runCountersBenchmark(b, c, p, incCounter)
+		})
+	})
+}
+
+func runCountersBenchmark(
+	b *testing.B, numCounters, parallelism int, incCounter func(counter int),
+) {
+	const batchSize = 1000
+	// Each element of ch corresponds to a batch of operations to be performed.
+	ch := make(chan int, 1+b.N/batchSize)
+
+	var wg sync.WaitGroup
+	for range parallelism {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+
+			rng := rand.New(rand.NewPCG(rand.Uint64(), rand.Uint64()))
+			for numOps := range ch {
+				for range numOps {
+					incCounter(rng.IntN(numCounters))
+				}
+			}
+		}()
+	}
+
+	numOps := int64(b.N) * int64(parallelism)
+	for i := int64(0); i < numOps; i += batchSize {
+		ch <- int(min(batchSize, numOps-i))
+	}
+	close(ch)
+	wg.Wait()
+}