diff --git a/xxhash_simulation/fnv.log b/xxhash_simulation/fnv.log new file mode 100644 index 00000000000..878f477ae48 --- /dev/null +++ b/xxhash_simulation/fnv.log @@ -0,0 +1,119 @@ +FNV-1a Collision Rate Simulation +================================= + +NOTE: We only count TRUE collisions (different inputs → same hash) +Duplicate inputs (same input → same hash) are NOT collisions! + + +========== FNV-1a 32-bit Tests ========== + +=== Random Strings (10K, length=20) [FNV-1a 32-bit] === +Number of inputs: 10000 +Unique hashes: 10000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0116403680 (1.1640e-02) +Duration: 7.167041ms +✓ No true collisions detected + +=== Random Strings (100K, length=20) [FNV-1a 32-bit] === +Number of inputs: 100000 +Unique hashes: 99997 +TRUE collisions: 3 (different inputs, same hash) +Collision rate: 0.0000300000 (3.0000e-05) +Expected collisions: 1.1641415767 (1.1641e+00) +Duration: 77.920439ms +⚠️ WARNING: TRUE COLLISIONS DETECTED! 
+ +=== Sequential Inputs (100K) [FNV-1a 32-bit] === +Number of inputs: 100000 +Unique hashes: 100000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 1.1641415767 (1.1641e+00) +Duration: 50.019091ms +✓ No true collisions detected + +=== With Duplicate Inputs (50K, 20% duplicates) [FNV-1a 32-bit] === +Number of inputs: 50000 +Unique hashes: 48895 +TRUE collisions: 0 (different inputs, same hash) +Duplicate inputs: 1105 (same input, same hash - NOT collisions) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.2910324838 (2.9103e-01) +Duration: 44.62614ms +✓ No true collisions detected + +=== Random Strings (10M, length=20) [FNV-1a 32-bit] === +Number of inputs: 10000000 +Unique hashes: 9988181 +TRUE collisions: 11819 (different inputs, same hash) +Collision rate: 0.0011819000 (1.1819e-03) +Expected collisions: 11641.5310185403 (1.1642e+04) +Duration: 12.749422404s +⚠️ WARNING: TRUE COLLISIONS DETECTED! + +========== FNV-1a 64-bit Tests ========== + +=== Random Strings (100K, length=20) [FNV-1a 64-bit] === +Number of inputs: 100000 +Unique hashes: 100000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0000000003 (2.7105e-10) +Duration: 50.356156ms +✓ No true collisions detected + +=== Random Strings (1M, length=20) [FNV-1a 64-bit] === +Number of inputs: 1000000 +Unique hashes: 1000000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0000000271 (2.7105e-08) +Duration: 786.467599ms +✓ No true collisions detected + +=== Sequential Inputs (1M) [FNV-1a 64-bit] === +Number of inputs: 1000000 +Unique hashes: 1000000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0000000271 (2.7105e-08) +Duration: 738.625184ms +✓ No true collisions detected + +=== Random Strings (10M, length=20) [FNV-1a 
64-bit] === +Number of inputs: 10000000 +Unique hashes: 10000000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0000027105 (2.7105e-06) +Duration: 10.981820092s +✓ No true collisions detected + +=== Rangom Strings (100M, length=20) [FNV-1a 64-bit] === +Number of inputs: 100000000 +Unique hashes: 100000000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0002710505 (2.7105e-04) +Duration: 2m10.32865214s +✓ No true collisions detected + +=== Summary === + +FNV-1a 32-bit (32-bit output): +- Hash space size: 2^32 = 4,294,967,296 +- Expected first collision after ~77,000 inputs (birthday paradox) +- Collision probability for 100K inputs: ~0.001 (0.1%) +- You WILL see collisions with 100K+ inputs + +FNV-1a 64-bit (64-bit output): +- Hash space size: 2^64 = 18,446,744,073,709,551,616 +- Expected first collision after ~5 billion inputs (birthday paradox) +- Collision probability for 1M inputs: ~0.00000000000005 +- You should NOT see collisions with < 100M inputs + +Key Distinction: +✓ TRUE COLLISION: Different inputs produce the same hash (BAD) +✓ DUPLICATE INPUT: Same input produces the same hash (EXPECTED) diff --git a/xxhash_simulation/fnv_collision.go b/xxhash_simulation/fnv_collision.go new file mode 100644 index 00000000000..0518745fc95 --- /dev/null +++ b/xxhash_simulation/fnv_collision.go @@ -0,0 +1,378 @@ +package main + +import ( + "fmt" + "hash/fnv" + "math" + "math/rand" + "time" +) + +// CollisionStats holds the results of a collision test +type CollisionStats struct { + HashType string + NumInputs int + UniqueHashes int + Collisions int + CollisionRate float64 + ExpectedCollisions float64 + Duration time.Duration + DuplicateInputs int +} + +// GenerateRandomString creates a random string of given length +func GenerateRandomString(length int, r *rand.Rand) string { + const charset = 
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + b := make([]byte, length) + for i := range b { + b[i] = charset[r.Intn(len(charset))] + } + return string(b) +} + +// ComputeFNV32a computes FNV-1a 32-bit hash +func ComputeFNV32a(data string) uint32 { + h := fnv.New32a() + h.Write([]byte(data)) + return h.Sum32() +} + +// ComputeFNV64a computes FNV-1a 64-bit hash +func ComputeFNV64a(data string) uint64 { + h := fnv.New64a() + h.Write([]byte(data)) + return h.Sum64() +} + +// TestCollisionRateFNV32 tests FNV-1a 32-bit collision rate with random inputs +func TestCollisionRateFNV32(numInputs int, stringLength int) CollisionStats { + start := time.Now() + + // Map hash to original input to detect TRUE collisions + hashToInput := make(map[uint32]string) + collisions := 0 + duplicateInputs := 0 + r := rand.New(rand.NewSource(time.Now().UnixNano())) + + for i := 0; i < numInputs; i++ { + // Generate random input + data := GenerateRandomString(stringLength, r) + + // Compute FNV-1a 32-bit hash + hash := ComputeFNV32a(data) + + if originalInput, exists := hashToInput[hash]; exists { + // Hash collision detected - check if it's a TRUE collision + if originalInput != data { + collisions++ + fmt.Printf("FNV32 TRUE COLLISION FOUND!\n") + fmt.Printf(" Input 1: %s (hash: %d)\n", originalInput, hash) + fmt.Printf(" Input 2: %s (hash: %d)\n", data, hash) + } else { + duplicateInputs++ + } + } else { + hashToInput[hash] = data + } + } + + duration := time.Since(start) + collisionRate := float64(collisions) / float64(numInputs) + + // Calculate expected collisions using birthday paradox approximation + hashBits := 32.0 + expectedCollisions := (float64(numInputs) * float64(numInputs-1)) / (2.0 * math.Pow(2, hashBits)) + + if duplicateInputs > 0 { + fmt.Printf("Note: %d duplicate inputs were generated (not counted as collisions)\n", duplicateInputs) + } + + return CollisionStats{ + HashType: "FNV-1a 32-bit", + NumInputs: numInputs, + UniqueHashes: len(hashToInput), + 
Collisions: collisions, + CollisionRate: collisionRate, + ExpectedCollisions: expectedCollisions, + Duration: duration, + DuplicateInputs: duplicateInputs, + } +} + +// TestCollisionRateFNV64 tests FNV-1a 64-bit collision rate with random inputs +func TestCollisionRateFNV64(numInputs int, stringLength int) CollisionStats { + start := time.Now() + + // Map hash to original input to detect TRUE collisions + hashToInput := make(map[uint64]string) + collisions := 0 + duplicateInputs := 0 + r := rand.New(rand.NewSource(time.Now().UnixNano())) + + for i := 0; i < numInputs; i++ { + // Generate random input + data := GenerateRandomString(stringLength, r) + + // Compute FNV-1a 64-bit hash + hash := ComputeFNV64a(data) + + if originalInput, exists := hashToInput[hash]; exists { + // Hash collision detected - check if it's a TRUE collision + if originalInput != data { + collisions++ + fmt.Printf("FNV64 TRUE COLLISION FOUND!\n") + fmt.Printf(" Input 1: %s (hash: %d)\n", originalInput, hash) + fmt.Printf(" Input 2: %s (hash: %d)\n", data, hash) + } else { + duplicateInputs++ + } + } else { + hashToInput[hash] = data + } + } + + duration := time.Since(start) + collisionRate := float64(collisions) / float64(numInputs) + + // Calculate expected collisions using birthday paradox approximation + hashBits := 64.0 + expectedCollisions := (float64(numInputs) * float64(numInputs-1)) / (2.0 * math.Pow(2, hashBits)) + + if duplicateInputs > 0 { + fmt.Printf("Note: %d duplicate inputs were generated (not counted as collisions)\n", duplicateInputs) + } + + return CollisionStats{ + HashType: "FNV-1a 64-bit", + NumInputs: numInputs, + UniqueHashes: len(hashToInput), + Collisions: collisions, + CollisionRate: collisionRate, + ExpectedCollisions: expectedCollisions, + Duration: duration, + DuplicateInputs: duplicateInputs, + } +} + +// TestSequentialInputsFNV32 tests collision rate with sequential numeric inputs (32-bit) +func TestSequentialInputsFNV32(numInputs int) CollisionStats { + start 
:= time.Now() + + hashToInput := make(map[uint32]string) + collisions := 0 + + for i := 0; i < numInputs; i++ { + data := fmt.Sprintf("input_%d", i) + hash := ComputeFNV32a(data) + + if originalInput, exists := hashToInput[hash]; exists { + if originalInput != data { + collisions++ + fmt.Printf("FNV32 TRUE COLLISION FOUND!\n") + fmt.Printf(" Input 1: %s (hash: %d)\n", originalInput, hash) + fmt.Printf(" Input 2: %s (hash: %d)\n", data, hash) + } + } else { + hashToInput[hash] = data + } + } + + duration := time.Since(start) + collisionRate := float64(collisions) / float64(numInputs) + hashBits := 32.0 + expectedCollisions := (float64(numInputs) * float64(numInputs-1)) / (2.0 * math.Pow(2, hashBits)) + + return CollisionStats{ + HashType: "FNV-1a 32-bit", + NumInputs: numInputs, + UniqueHashes: len(hashToInput), + Collisions: collisions, + CollisionRate: collisionRate, + ExpectedCollisions: expectedCollisions, + Duration: duration, + DuplicateInputs: 0, + } +} + +// TestSequentialInputsFNV64 tests collision rate with sequential numeric inputs (64-bit) +func TestSequentialInputsFNV64(numInputs int) CollisionStats { + start := time.Now() + + hashToInput := make(map[uint64]string) + collisions := 0 + + for i := 0; i < numInputs; i++ { + data := fmt.Sprintf("input_%d", i) + hash := ComputeFNV64a(data) + + if originalInput, exists := hashToInput[hash]; exists { + if originalInput != data { + collisions++ + fmt.Printf("FNV64 TRUE COLLISION FOUND!\n") + fmt.Printf(" Input 1: %s (hash: %d)\n", originalInput, hash) + fmt.Printf(" Input 2: %s (hash: %d)\n", data, hash) + } + } else { + hashToInput[hash] = data + } + } + + duration := time.Since(start) + collisionRate := float64(collisions) / float64(numInputs) + hashBits := 64.0 + expectedCollisions := (float64(numInputs) * float64(numInputs-1)) / (2.0 * math.Pow(2, hashBits)) + + return CollisionStats{ + HashType: "FNV-1a 64-bit", + NumInputs: numInputs, + UniqueHashes: len(hashToInput), + Collisions: collisions, + 
CollisionRate: collisionRate, + ExpectedCollisions: expectedCollisions, + Duration: duration, + DuplicateInputs: 0, + } +} + +// TestWithDuplicateInputsFNV32 demonstrates the difference between duplicate inputs and collisions +func TestWithDuplicateInputsFNV32(numInputs int, duplicatePercent float64) CollisionStats { + start := time.Now() + + hashToInput := make(map[uint32]string) + collisions := 0 + duplicateInputs := 0 + r := rand.New(rand.NewSource(time.Now().UnixNano())) + + // Generate a pool of unique strings + uniqueStrings := make([]string, int(float64(numInputs)*(1.0-duplicatePercent))) + for i := range uniqueStrings { + uniqueStrings[i] = GenerateRandomString(20, r) + } + + for i := 0; i < numInputs; i++ { + var data string + + if r.Float64() < duplicatePercent && len(uniqueStrings) > 0 { + data = uniqueStrings[r.Intn(len(uniqueStrings))] + } else { + data = GenerateRandomString(20, r) + } + + hash := ComputeFNV32a(data) + + if originalInput, exists := hashToInput[hash]; exists { + if originalInput != data { + collisions++ + fmt.Printf("FNV32 TRUE COLLISION FOUND!\n") + fmt.Printf(" Input 1: %s (hash: %d)\n", originalInput, hash) + fmt.Printf(" Input 2: %s (hash: %d)\n", data, hash) + } else { + duplicateInputs++ + } + } else { + hashToInput[hash] = data + } + } + + duration := time.Since(start) + collisionRate := float64(collisions) / float64(numInputs) + hashBits := 32.0 + expectedCollisions := (float64(numInputs) * float64(numInputs-1)) / (2.0 * math.Pow(2, hashBits)) + + return CollisionStats{ + HashType: "FNV-1a 32-bit", + NumInputs: numInputs, + UniqueHashes: len(hashToInput), + Collisions: collisions, + CollisionRate: collisionRate, + ExpectedCollisions: expectedCollisions, + Duration: duration, + DuplicateInputs: duplicateInputs, + } +} + +// PrintStats displays collision statistics +func PrintStats(testName string, stats CollisionStats) { + fmt.Printf("\n=== %s [%s] ===\n", testName, stats.HashType) + fmt.Printf("Number of inputs: %d\n", 
stats.NumInputs) + fmt.Printf("Unique hashes: %d\n", stats.UniqueHashes) + fmt.Printf("TRUE collisions: %d (different inputs, same hash)\n", stats.Collisions) + if stats.DuplicateInputs > 0 { + fmt.Printf("Duplicate inputs: %d (same input, same hash - NOT collisions)\n", stats.DuplicateInputs) + } + fmt.Printf("Collision rate: %.10f (%.4e)\n", stats.CollisionRate, stats.CollisionRate) + fmt.Printf("Expected collisions: %.10f (%.4e)\n", stats.ExpectedCollisions, stats.ExpectedCollisions) + fmt.Printf("Duration: %v\n", stats.Duration) + + if stats.Collisions > 0 { + fmt.Printf("⚠️ WARNING: TRUE COLLISIONS DETECTED!\n") + } else { + fmt.Printf("✓ No true collisions detected\n") + } +} + +func main() { + fmt.Println("FNV-1a Collision Rate Simulation") + fmt.Println("=================================") + fmt.Println("\nNOTE: We only count TRUE collisions (different inputs → same hash)") + fmt.Print("Duplicate inputs (same input → same hash) are NOT collisions!\n\n") + + fmt.Println("\n========== FNV-1a 32-bit Tests ==========") + + // FNV32 Test 1: Small scale (10K inputs - collisions unlikely, ~0.012 expected) + stats1 := TestCollisionRateFNV32(10_000, 20) + PrintStats("Random Strings (10K, length=20)", stats1) + + // FNV32 Test 2: Medium scale (100K inputs - collisions likely, ~1.16 expected) + stats2 := TestCollisionRateFNV32(100_000, 20) + PrintStats("Random Strings (100K, length=20)", stats2) + + // FNV32 Test 3: Sequential inputs (100K) + stats3 := TestSequentialInputsFNV32(100_000) + PrintStats("Sequential Inputs (100K)", stats3) + + // FNV32 Test 4: With duplicate inputs + stats4 := TestWithDuplicateInputsFNV32(50_000, 0.20) + PrintStats("With Duplicate Inputs (50K, 20% duplicates)", stats4) + + stats21 := TestCollisionRateFNV32(10_000_000, 20) + PrintStats("Random Strings (10M, length=20)", stats21) + + fmt.Println("\n========== FNV-1a 64-bit Tests ==========") + + // FNV64 Test 1: Small scale (100K inputs) + stats5 := TestCollisionRateFNV64(100_000, 20) + PrintStats("Random 
Strings (100K, length=20)", stats5) + + // FNV64 Test 2: Medium scale (1M inputs) + stats6 := TestCollisionRateFNV64(1_000_000, 20) + PrintStats("Random Strings (1M, length=20)", stats6) + + // FNV64 Test 3: Sequential inputs (1M) + stats7 := TestSequentialInputsFNV64(1_000_000) + PrintStats("Sequential Inputs (1M)", stats7) + + stats51 := TestCollisionRateFNV64(10_000_000, 20) + PrintStats("Random Strings (10M, length=20)", stats51) + + stats52 := TestCollisionRateFNV64(100_000_000, 20) + PrintStats("Random Strings (100M, length=20)", stats52) + + fmt.Println("\n=== Summary ===") + fmt.Println("\nFNV-1a 32-bit (32-bit output):") + fmt.Println("- Hash space size: 2^32 = 4,294,967,296") + fmt.Println("- ~50% chance of at least one collision after ~77,000 inputs (birthday paradox)") + fmt.Println("- Collision probability for 100K inputs: ~0.69 (69%)") + fmt.Println("- Collisions are likely with 100K+ inputs and near-certain by 1M") + + fmt.Println("\nFNV-1a 64-bit (64-bit output):") + fmt.Println("- Hash space size: 2^64 = 18,446,744,073,709,551,616") + fmt.Println("- ~50% chance of at least one collision after ~5 billion inputs (birthday paradox)") + fmt.Println("- Collision probability for 1M inputs: ~0.000000027 (2.7e-08)") + fmt.Println("- You should NOT see collisions with < 100M inputs") + + fmt.Println("\nKey Distinction:") + fmt.Println("✓ TRUE COLLISION: Different inputs produce the same hash (BAD)") + fmt.Println("✓ DUPLICATE INPUT: Same input produces the same hash (EXPECTED)") + +} diff --git a/xxhash_simulation/xxhash.log b/xxhash_simulation/xxhash.log new file mode 100644 index 00000000000..71d54283e8c --- /dev/null +++ b/xxhash_simulation/xxhash.log @@ -0,0 +1,91 @@ +xxHash64 Collision Rate Simulation +=================================== + +NOTE: We only count TRUE collisions (different inputs → same hash) +Duplicate inputs (same input → same hash) are NOT collisions! 
+ + +=== Random Strings (100K, length=20) === +Number of inputs: 100000 +Unique hashes: 100000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0000000003 (2.7105e-10) +Duration: 61.093329ms +✓ No true collisions detected + +=== Random Strings (1M, length=20) === +Number of inputs: 1000000 +Unique hashes: 1000000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0000000271 (2.7105e-08) +Duration: 811.683204ms +✓ No true collisions detected + +=== Random Strings (10M, length=20) === +Number of inputs: 10000000 +Unique hashes: 10000000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0000027105 (2.7105e-06) +Duration: 9.192752766s +✓ No true collisions detected + +=== Sequential Inputs (1M) === +Number of inputs: 1000000 +Unique hashes: 1000000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0000000271 (2.7105e-08) +Duration: 622.331864ms +✓ No true collisions detected + +=== Similar Strings (1M) === +Number of inputs: 1000000 +Unique hashes: 1000000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0000000271 (2.7105e-08) +Duration: 683.829445ms +✓ No true collisions detected +Note: 588 duplicate inputs were generated (not counted as collisions) + +=== Short Random Strings (1M, length=5) === +Number of inputs: 1000000 +Unique hashes: 999412 +TRUE collisions: 0 (different inputs, same hash) +Duplicate inputs: 588 (same input, same hash - NOT collisions) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0000000271 (2.7105e-08) +Duration: 518.838767ms +✓ No true collisions detected + +=== With Duplicate Inputs (100K, 20% duplicates) === +Number of inputs: 100000 +Unique hashes: 97748 +TRUE collisions: 0 (different 
inputs, same hash) +Duplicate inputs: 2252 (same input, same hash - NOT collisions) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0000000003 (2.7105e-10) +Duration: 74.503253ms +✓ No true collisions detected + +=== Random String (100M, length=20) === +Number of inputs: 100000000 +Unique hashes: 100000000 +TRUE collisions: 0 (different inputs, same hash) +Collision rate: 0.0000000000 (0.0000e+00) +Expected collisions: 0.0002710505 (2.7105e-04) +Duration: 2m7.181218892s +✓ No true collisions detected + +=== Summary === +For xxHash64 (64-bit output): +- Expected first collision after ~5 billion inputs (birthday paradox) +- Collision probability for 1M inputs: ~0.00000000000005 +- Hash space size: 2^64 = 18,446,744,073,709,551,616 + +Key Distinction: +✓ TRUE COLLISION: Different inputs produce the same hash (BAD) +✓ DUPLICATE INPUT: Same input produces the same hash (EXPECTED) diff --git a/xxhash_simulation/xxhash_collision.go b/xxhash_simulation/xxhash_collision.go new file mode 100644 index 00000000000..bd3a486f5b4 --- /dev/null +++ b/xxhash_simulation/xxhash_collision.go @@ -0,0 +1,306 @@ +package main + +import ( + "fmt" + "math" + "math/rand" + "time" + + "github.com/cespare/xxhash/v2" +) + +// CollisionStats holds the results of a collision test +type CollisionStats struct { + NumInputs int + UniqueHashes int + Collisions int + CollisionRate float64 + ExpectedCollisions float64 + Duration time.Duration + DuplicateInputs int +} + +// GenerateRandomString creates a random string of given length +func GenerateRandomString(length int, r *rand.Rand) string { + const charset = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + b := make([]byte, length) + for i := range b { + b[i] = charset[r.Intn(len(charset))] + } + return string(b) +} + +// TestCollisionRate tests xxHash collision rate with random inputs +func TestCollisionRate(numInputs int, stringLength int) CollisionStats { + start := time.Now() + + // Map hash to original 
input to detect TRUE collisions + hashToInput := make(map[uint64]string) + collisions := 0 + duplicateInputs := 0 + r := rand.New(rand.NewSource(time.Now().UnixNano())) + + for i := 0; i < numInputs; i++ { + // Generate random input + data := GenerateRandomString(stringLength, r) + + // Compute xxHash64 + hash := xxhash.Sum64String(data) + + if originalInput, exists := hashToInput[hash]; exists { + // Hash collision detected - check if it's a TRUE collision + if originalInput != data { + collisions++ + fmt.Printf("TRUE COLLISION FOUND!\n") + fmt.Printf(" Input 1: %s (hash: %d)\n", originalInput, hash) + fmt.Printf(" Input 2: %s (hash: %d)\n", data, hash) + } else { + duplicateInputs++ + } + } else { + hashToInput[hash] = data + } + } + + duration := time.Since(start) + collisionRate := float64(collisions) / float64(numInputs) + + // Calculate expected collisions using birthday paradox approximation + // E[collisions] ≈ n² / (2 * 2^bits) + hashBits := 64.0 + expectedCollisions := (float64(numInputs) * float64(numInputs-1)) / (2.0 * math.Pow(2, hashBits)) + + if duplicateInputs > 0 { + fmt.Printf("Note: %d duplicate inputs were generated (not counted as collisions)\n", duplicateInputs) + } + + return CollisionStats{ + NumInputs: numInputs, + UniqueHashes: len(hashToInput), + Collisions: collisions, + CollisionRate: collisionRate, + ExpectedCollisions: expectedCollisions, + Duration: duration, + DuplicateInputs: duplicateInputs, + } +} + +// TestSequentialInputs tests collision rate with sequential numeric inputs +func TestSequentialInputs(numInputs int) CollisionStats { + start := time.Now() + + // Map hash to original input to detect TRUE collisions + hashToInput := make(map[uint64]string) + collisions := 0 + + for i := 0; i < numInputs; i++ { + // Generate sequential input + data := fmt.Sprintf("input_%d", i) + + // Compute xxHash64 + hash := xxhash.Sum64String(data) + + if originalInput, exists := hashToInput[hash]; exists { + // Hash collision detected - check if 
it's a TRUE collision + if originalInput != data { + collisions++ + fmt.Printf("TRUE COLLISION FOUND!\n") + fmt.Printf(" Input 1: %s (hash: %d)\n", originalInput, hash) + fmt.Printf(" Input 2: %s (hash: %d)\n", data, hash) + } + } else { + hashToInput[hash] = data + } + } + + duration := time.Since(start) + collisionRate := float64(collisions) / float64(numInputs) + + hashBits := 64.0 + expectedCollisions := (float64(numInputs) * float64(numInputs-1)) / (2.0 * math.Pow(2, hashBits)) + + return CollisionStats{ + NumInputs: numInputs, + UniqueHashes: len(hashToInput), + Collisions: collisions, + CollisionRate: collisionRate, + ExpectedCollisions: expectedCollisions, + Duration: duration, + DuplicateInputs: 0, + } +} + +// TestSimilarInputs tests collision rate with similar strings +func TestSimilarInputs(numInputs int, baseString string) CollisionStats { + start := time.Now() + + // Map hash to original input to detect TRUE collisions + hashToInput := make(map[uint64]string) + collisions := 0 + + for i := 0; i < numInputs; i++ { + // Generate similar input by appending number + data := fmt.Sprintf("%s_%d", baseString, i) + + // Compute xxHash64 + hash := xxhash.Sum64String(data) + + if originalInput, exists := hashToInput[hash]; exists { + // Hash collision detected - check if it's a TRUE collision + if originalInput != data { + collisions++ + fmt.Printf("TRUE COLLISION FOUND!\n") + fmt.Printf(" Input 1: %s (hash: %d)\n", originalInput, hash) + fmt.Printf(" Input 2: %s (hash: %d)\n", data, hash) + } + } else { + hashToInput[hash] = data + } + } + + duration := time.Since(start) + collisionRate := float64(collisions) / float64(numInputs) + + hashBits := 64.0 + expectedCollisions := (float64(numInputs) * float64(numInputs-1)) / (2.0 * math.Pow(2, hashBits)) + + return CollisionStats{ + NumInputs: numInputs, + UniqueHashes: len(hashToInput), + Collisions: collisions, + CollisionRate: collisionRate, + ExpectedCollisions: expectedCollisions, + Duration: duration, + 
DuplicateInputs: 0, + } +} + +// TestWithDuplicateInputs demonstrates the difference between duplicate inputs and collisions +func TestWithDuplicateInputs(numInputs int, duplicatePercent float64) CollisionStats { + start := time.Now() + + hashToInput := make(map[uint64]string) + collisions := 0 + duplicateInputs := 0 + r := rand.New(rand.NewSource(time.Now().UnixNano())) + + // Generate a pool of unique strings + uniqueStrings := make([]string, int(float64(numInputs)*(1.0-duplicatePercent))) + for i := range uniqueStrings { + uniqueStrings[i] = GenerateRandomString(20, r) + } + + for i := 0; i < numInputs; i++ { + var data string + + // Randomly decide if we should use a duplicate input + if r.Float64() < duplicatePercent && len(uniqueStrings) > 0 { + // Use an existing string (duplicate input) + data = uniqueStrings[r.Intn(len(uniqueStrings))] + } else { + // Generate new string + data = GenerateRandomString(20, r) + } + + hash := xxhash.Sum64String(data) + + if originalInput, exists := hashToInput[hash]; exists { + if originalInput != data { + collisions++ + fmt.Printf("TRUE COLLISION FOUND!\n") + fmt.Printf(" Input 1: %s (hash: %d)\n", originalInput, hash) + fmt.Printf(" Input 2: %s (hash: %d)\n", data, hash) + } else { + duplicateInputs++ + } + } else { + hashToInput[hash] = data + } + } + + duration := time.Since(start) + collisionRate := float64(collisions) / float64(numInputs) + + hashBits := 64.0 + expectedCollisions := (float64(numInputs) * float64(numInputs-1)) / (2.0 * math.Pow(2, hashBits)) + + return CollisionStats{ + NumInputs: numInputs, + UniqueHashes: len(hashToInput), + Collisions: collisions, + CollisionRate: collisionRate, + ExpectedCollisions: expectedCollisions, + Duration: duration, + DuplicateInputs: duplicateInputs, + } +} + +// PrintStats displays collision statistics +func PrintStats(testName string, stats CollisionStats) { + fmt.Printf("\n=== %s ===\n", testName) + fmt.Printf("Number of inputs: %d\n", stats.NumInputs) + 
fmt.Printf("Unique hashes: %d\n", stats.UniqueHashes) + fmt.Printf("TRUE collisions: %d (different inputs, same hash)\n", stats.Collisions) + if stats.DuplicateInputs > 0 { + fmt.Printf("Duplicate inputs: %d (same input, same hash - NOT collisions)\n", stats.DuplicateInputs) + } + fmt.Printf("Collision rate: %.10f (%.4e)\n", stats.CollisionRate, stats.CollisionRate) + fmt.Printf("Expected collisions: %.10f (%.4e)\n", stats.ExpectedCollisions, stats.ExpectedCollisions) + fmt.Printf("Duration: %v\n", stats.Duration) + + if stats.Collisions > 0 { + fmt.Printf("⚠️ WARNING: TRUE COLLISIONS DETECTED!\n") + } else { + fmt.Printf("✓ No true collisions detected\n") + } +} + +func main() { + fmt.Println("xxHash64 Collision Rate Simulation") + fmt.Println("===================================") + fmt.Println("\nNOTE: We only count TRUE collisions (different inputs → same hash)") + fmt.Print("Duplicate inputs (same input → same hash) are NOT collisions!\n\n") + + // Test 1: Small scale random test (100K inputs) + stats1 := TestCollisionRate(100_000, 20) + PrintStats("Random Strings (100K, length=20)", stats1) + + // Test 2: Medium scale random test (1M inputs) + stats2 := TestCollisionRate(1_000_000, 20) + PrintStats("Random Strings (1M, length=20)", stats2) + + // Test 3: Large scale random test (10M inputs) + stats3 := TestCollisionRate(10_000_000, 20) + PrintStats("Random Strings (10M, length=20)", stats3) + + // Test 4: Sequential inputs + stats4 := TestSequentialInputs(1_000_000) + PrintStats("Sequential Inputs (1M)", stats4) + + // Test 5: Similar strings + stats5 := TestSimilarInputs(1_000_000, "user_data") + PrintStats("Similar Strings (1M)", stats5) + + // Test 6: Short strings + stats6 := TestCollisionRate(1_000_000, 5) + PrintStats("Short Random Strings (1M, length=5)", stats6) + + // Test 7: Demonstration with duplicate inputs (20% duplicates) + stats7 := TestWithDuplicateInputs(100_000, 0.20) + PrintStats("With Duplicate Inputs (100K, 20% duplicates)", stats7) + + 
stats8 := TestCollisionRate(100_000_000, 20) + PrintStats("Random String (100M, length=20)", stats8) + + //stats9 := TestCollisionRate(1_000_000_000, 20) + //PrintStats("Random String (1B, length=20)", stats9) + + fmt.Println("\n=== Summary ===") + fmt.Println("For xxHash64 (64-bit output):") + fmt.Println("- ~50% chance of at least one collision after ~5 billion inputs (birthday paradox)") + fmt.Println("- Collision probability for 1M inputs: ~0.000000027 (2.7e-08)") + fmt.Println("- Hash space size: 2^64 = 18,446,744,073,709,551,616") + fmt.Println("\nKey Distinction:") + fmt.Println("✓ TRUE COLLISION: Different inputs produce the same hash (BAD)") + fmt.Println("✓ DUPLICATE INPUT: Same input produces the same hash (EXPECTED)") +}