Skip to content

Commit 16e2453

Browse files
committed
workload/ycsb: use crrand.Perm64 for pseudorandom keys
This commit reworks the random ycsb key generation to use a pseudorandom permutation of 64-bit integers constructed deterministically from the seed, rather than hashing the key index. The pseudorandom permutation is provided by the crrand.Perm64 type. The per-key computation is likely lighter weight than the previous fnv hash, but more importantly, it avoids duplicates. This ensures that a large IMPORT of random keys will not fail due to a key uniqueness violation. This commit also accordingly renames the --insert-hash flag to --insert-random (leaving an alias with the old name). Epic: none Release note: none
1 parent a5ea3f4 commit 16e2453

File tree

3 files changed

+35
-50
lines changed

3 files changed

+35
-50
lines changed

pkg/ccl/workloadccl/allccl/all_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ func TestDeterministicInitialData(t *testing.T) {
296296
`startrek`: 0xa0249fbdf612734c,
297297
`tpcc`: 0xccfecd06eed59975,
298298
`tpch`: 0xcd2abbd021ed895d,
299-
`ycsb`: 0x0e6012ee6491a0fb,
299+
`ycsb`: 0xa00a7efc9d3b8532,
300300
}
301301

302302
var a bufalloc.ByteAllocator

pkg/workload/ycsb/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ go_library(
2121
"//pkg/workload/histogram",
2222
"//pkg/workload/workloadimpl",
2323
"@com_github_cockroachdb_cockroach_go_v2//crdb/crdbpgxv5",
24+
"@com_github_cockroachdb_crlib//crrand",
2425
"@com_github_cockroachdb_errors//:errors",
2526
"@com_github_jackc_pgx_v5//:pgx",
2627
"@com_github_jackc_pgx_v5//pgconn",

pkg/workload/ycsb/ycsb.go

Lines changed: 33 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,7 @@ package ycsb
88

99
import (
1010
"context"
11-
"encoding/binary"
1211
"fmt"
13-
"hash"
14-
"hash/fnv"
1512
"math"
1613
"math/rand/v2"
1714
"strings"
@@ -26,6 +23,7 @@ import (
2623
"github.com/cockroachdb/cockroach/pkg/workload"
2724
"github.com/cockroachdb/cockroach/pkg/workload/histogram"
2825
"github.com/cockroachdb/cockroach/pkg/workload/workloadimpl"
26+
"github.com/cockroachdb/crlib/crrand"
2927
"github.com/cockroachdb/errors"
3028
"github.com/jackc/pgx/v5"
3129
"github.com/jackc/pgx/v5/pgconn"
@@ -89,17 +87,17 @@ type ycsb struct {
8987
flags workload.Flags
9088
connFlags *workload.ConnFlags
9189

92-
timeString bool
93-
insertHash bool
94-
zeroPadding int
95-
insertStart int
96-
insertCount int
97-
recordCount int
98-
json bool
99-
families bool
100-
rmwInTxn bool
101-
sfu bool
102-
splits int
90+
timeString bool
91+
insertRandom bool
92+
zeroPadding int
93+
insertStart int
94+
insertCount int
95+
recordCount int
96+
json bool
97+
families bool
98+
rmwInTxn bool
99+
sfu bool
100+
splits int
103101

104102
workload string
105103
requestDistribution string
@@ -131,9 +129,15 @@ var ycsbMeta = workload.Meta{
131129
`scan-freq`: {RuntimeOnly: true},
132130
`read-modify-write-freq`: {RuntimeOnly: true},
133131
}
132+
g.flags.SetNormalizeFunc(func(flags *pflag.FlagSet, name string) pflag.NormalizedName {
133+
if name == `insert-hash` {
134+
name = `insert-random`
135+
}
136+
return pflag.NormalizedName(name)
137+
})
134138
g.flags.BoolVar(&g.timeString, `time-string`, false, `Prepend field[0-9] data with current time in microsecond precision.`)
135-
g.flags.BoolVar(&g.insertHash, `insert-hash`, true, `Key to be hashed or ordered.`)
136-
g.flags.IntVar(&g.zeroPadding, `zero-padding`, 1, `Key using "insert-hash=false" has zeros padded to left to make this length of digits.`)
139+
g.flags.BoolVar(&g.insertRandom, `insert-random`, true, `Key to be pseduorandom or ordered.`)
140+
g.flags.IntVar(&g.zeroPadding, `zero-padding`, 1, `Key has zeros padded to left to make this length of digits.`)
137141
g.flags.IntVar(&g.insertStart, `insert-start`, 0, `Key to start initial sequential insertions from. (default 0)`)
138142
g.flags.IntVar(&g.insertCount, `insert-count`, 10000, `Number of rows to sequentially insert before beginning workload.`)
139143
g.flags.IntVar(&g.recordCount, `record-count`, 0, `Key to start workload insertions from. Must be >= insert-start + insert-count. (Default: insert-start + insert-count)`)
@@ -328,9 +332,13 @@ func (g *ycsb) Tables() []workload.Table {
328332
Splits: workload.Tuples(
329333
g.splits,
330334
func(splitIdx int) []interface{} {
335+
w := ycsbWorker{
336+
config: g,
337+
prngPerm: crrand.MakePerm64(RandomSeed.Seed()),
338+
}
331339
step := math.MaxUint64 / uint64(g.splits+1)
332340
return []interface{}{
333-
keyNameFromHash(step * uint64(splitIdx+1)),
341+
w.buildKeyName(step * uint64(splitIdx+1)),
334342
}
335343
},
336344
),
@@ -342,7 +350,7 @@ func (g *ycsb) Tables() []workload.Table {
342350
func(rowIdx int) []interface{} {
343351
w := ycsbWorker{
344352
config: g,
345-
hashFunc: fnv.New64(),
353+
prngPerm: crrand.MakePerm64(RandomSeed.Seed()),
346354
}
347355
key := w.buildKeyName(uint64(g.insertStart + rowIdx))
348356
// TODO(peter): Need to fill in FIELD here, rather than an empty JSONB
@@ -359,12 +367,6 @@ func (g *ycsb) Tables() []workload.Table {
359367
const batchSize = 1000
360368
usertable.InitialRows = workload.BatchedTuples{
361369
NumBatches: (g.insertCount + batchSize - 1) / batchSize,
362-
// If the key sequence is hashed, duplicates are possible. Hash
363-
// collisions are inevitable at large insert counts (they're at
364-
// least inevitable at ~1b rows). Marking that the keys may contain
365-
// duplicates will cause the data loader to use INSERT ... ON
366-
// CONFLICT DO NOTHING statements.
367-
MayContainDuplicates: !g.insertHash,
368370
FillBatch: func(batchIdx int, cb coldata.Batch, _ *bufalloc.ByteAllocator) {
369371
rowBegin, rowEnd := batchIdx*batchSize, (batchIdx+1)*batchSize
370372
if rowEnd > g.insertCount {
@@ -385,7 +387,7 @@ func (g *ycsb) Tables() []workload.Table {
385387

386388
w := ycsbWorker{
387389
config: g,
388-
hashFunc: fnv.New64(),
390+
prngPerm: crrand.MakePerm64(RandomSeed.Seed()),
389391
}
390392
rng := rand.NewPCG(RandomSeed.Seed(), uint64(batchIdx))
391393

@@ -577,7 +579,7 @@ func (g *ycsb) Ops(
577579
requestGen: requestGen,
578580
scanLengthGen: scanLengthGen,
579581
rng: rng,
580-
hashFunc: fnv.New64(),
582+
prngPerm: crrand.MakePerm64(RandomSeed.Seed()),
581583
}
582584
ql.WorkerFns = append(ql.WorkerFns, w.run)
583585
}
@@ -622,8 +624,7 @@ type ycsbWorker struct {
622624
requestGen randGenerator // used to generate random keys for requests
623625
scanLengthGen randGenerator // used to generate length of scan operations
624626
rng *rand.Rand // used to generate random strings for the values
625-
hashFunc hash.Hash64
626-
hashBuf [binary.MaxVarintLen64]byte
627+
prngPerm crrand.Perm64 // used to map the key index to a pseudorandom key
627628
}
628629

629630
func (yw *ycsbWorker) run(ctx context.Context) error {
@@ -691,29 +692,12 @@ const (
691692
readModifyWriteOp operation = `readModifyWrite`
692693
)
693694

694-
func (yw *ycsbWorker) hashKey(key uint64) uint64 {
695-
yw.hashBuf = [binary.MaxVarintLen64]byte{} // clear hashBuf
696-
binary.PutUvarint(yw.hashBuf[:], key)
697-
yw.hashFunc.Reset()
698-
if _, err := yw.hashFunc.Write(yw.hashBuf[:]); err != nil {
699-
panic(err)
700-
}
701-
return yw.hashFunc.Sum64()
702-
}
703-
704695
func (yw *ycsbWorker) buildKeyName(keynum uint64) string {
705-
if yw.config.insertHash {
706-
return keyNameFromHash(yw.hashKey(keynum))
696+
if yw.config.insertRandom {
697+
// Use prngPerm to map the key index to a pseudorandom key.
698+
keynum = yw.prngPerm.At(keynum)
707699
}
708-
return keyNameFromOrder(keynum, yw.config.zeroPadding)
709-
}
710-
711-
func keyNameFromHash(hashedKey uint64) string {
712-
return fmt.Sprintf("user%d", hashedKey)
713-
}
714-
715-
func keyNameFromOrder(keynum uint64, zeroPadding int) string {
716-
return fmt.Sprintf("user%0*d", zeroPadding, keynum)
700+
return fmt.Sprintf("user%0*d", yw.config.zeroPadding, keynum)
717701
}
718702

719703
// Keys are chosen by first drawing from a Zipf distribution, hashing the drawn

0 commit comments

Comments
 (0)