Skip to content
2 changes: 1 addition & 1 deletion packages/core/src/domain/contexts/userContext.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ describe('user context', () => {
findTrackedSession: () =>
({
anonymousId: 'device-123',
}) as SessionContext<string>,
}) as SessionContext,
}

beforeEach(() => {
Expand Down
87 changes: 87 additions & 0 deletions packages/core/src/domain/sampler.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import { isSampled, resetSampleDecisionCache, sampleUsingKnuthFactor } from './sampler'

// UUID known to yield a low hash value using the Knuth formula, making it more likely to be sampled.
// The specs below expect it to be sampled at rates as low as 0.0001%, but not at 0.0000000001%.
const LOW_HASH_UUID = '29a4b5e3-9859-4290-99fa-4bc4a1a348b9'
// UUID known to yield a high hash value using the Knuth formula, making it less likely to be
// sampled. The specs below expect it to be rejected at rates as high as 99.9999%, but sampled at
// 99.9999999999%.
const HIGH_HASH_UUID = '5321b54a-d6ec-4b24-996d-dd70c617e09a'

// UUID chosen arbitrarily, to be used when the test outcome does not depend on the hash value
// (i.e. with the 0% and 100% sample rates).
const ARBITRARY_UUID = '1ff81c8c-6e32-473b-869b-55af08048323'

describe('isSampled', () => {
  // Computes a decision starting from an empty cache, so that consecutive expectations inside a
  // single spec never observe a decision memoized by a previous expectation.
  function decideFromCleanCache(sessionId: string, sampleRate: number) {
    resetSampleDecisionCache()
    return isSampled(sessionId, sampleRate)
  }

  beforeEach(resetSampleDecisionCache)

  it('returns true when sampleRate is 100', () => {
    const decision = isSampled(ARBITRARY_UUID, 100)
    expect(decision).toBeTrue()
  })

  it('returns false when sampleRate is 0', () => {
    const decision = isSampled(ARBITRARY_UUID, 0)
    expect(decision).toBeFalse()
  })

  describe('deterministic sampling', () => {
    it('a session id with a low hash value should be sampled with a rate close to 0%', () => {
      for (const sampleRate of [0.1, 0.01, 0.001, 0.0001]) {
        expect(decideFromCleanCache(LOW_HASH_UUID, sampleRate)).withContext(`sampleRate=${sampleRate}`).toBeTrue()
      }

      // Below some threshold, even a low hash is no longer sampled. This is expected: a UUID with
      // an even lower hash could probably be found.
      expect(decideFromCleanCache(LOW_HASH_UUID, 0.0000000001)).toBeFalse()
    })

    it('a session id with a high hash value should not be sampled even if the rate is close to 100%', () => {
      for (const sampleRate of [99.9, 99.99, 99.999, 99.9999]) {
        expect(decideFromCleanCache(HIGH_HASH_UUID, sampleRate)).withContext(`sampleRate=${sampleRate}`).toBeFalse()
      }

      // Above some threshold, even a high hash ends up sampled. This is expected: a UUID with an
      // even higher hash could probably be found.
      expect(decideFromCleanCache(HIGH_HASH_UUID, 99.9999999999)).toBeTrue()
    })
  })
})

describe('sampleUsingKnuthFactor', () => {
  it('sampling should be based on the trace id', () => {
    // Generated using the dd-trace-go implementation with the following program: https://go.dev/play/p/CUrDJtze8E_e
    // Each entry is [identifier, sampleRate (percentage), expected decision].
    const inputs: Array<[bigint, number, boolean]> = [
      [BigInt('5577006791947779410'), 94.0509, true],
      [BigInt('15352856648520921629'), 43.7714, true],
      [BigInt('3916589616287113937'), 68.6823, true],
      [BigInt('894385949183117216'), 30.0912, true],
      [BigInt('12156940908066221323'), 46.889, true],

      [BigInt('9828766684487745566'), 15.6519, false],
      [BigInt('4751997750760398084'), 81.364, false],
      [BigInt('11199607447739267382'), 38.0657, false],
      [BigInt('6263450610539110790'), 21.8553, false],
      [BigInt('1874068156324778273'), 36.0871, false],
    ]

    for (const [identifier, sampleRate, expected] of inputs) {
      expect(sampleUsingKnuthFactor(identifier, sampleRate))
        .withContext(`identifier=${identifier}, sampleRate=${sampleRate}`)
        .toBe(expected)
    }
  })
})

// This spec exercises `isSampled`, not `sampleUsingKnuthFactor`, so it lives in its own suite.
// `isSampled` memoizes decisions in a module-level cache: reset it before each spec so the outcome
// does not depend on which specs ran earlier.
describe('isSampled decision cache', () => {
  beforeEach(() => {
    resetSampleDecisionCache()
  })

  it('should cache sampling decision per sampling rate', () => {
    // For the same session id, a decision memoized for one sample rate must not be reused for
    // another rate (e.g. trace should not reuse profiling decisions and vice versa).
    expect(isSampled(HIGH_HASH_UUID, 99.9999999999)).toBeTrue()
    expect(isSampled(HIGH_HASH_UUID, 0.0000001)).toBeFalse()
    expect(isSampled(HIGH_HASH_UUID, 99.9999999999)).toBeTrue()
  })
})
60 changes: 60 additions & 0 deletions packages/core/src/domain/sampler.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Memoized sampling decisions, keyed by sample rate. Each entry holds a single session id along
// with the decision that was computed for it at that rate.
const sampleDecisionCache = new Map<number, { sessionId: string; decision: boolean }>()

/**
 * Decide whether a session is sampled at the given rate. The decision is deterministic: it is
 * derived from the session id, and the latest decision is memoized per sample rate.
 *
 * @param sessionId - Session id in UUID form. Only its fifth dash-separated group is used, parsed
 * as a hexadecimal integer; `BigInt()` throws a `SyntaxError` when that group is missing or is
 * not valid hexadecimal.
 * @param sampleRate - The sample rate in percentage between 0 and 100.
 * @returns `true` when the session is sampled, `false` otherwise.
 */
export function isSampled(sessionId: string, sampleRate: number) {
  // Fast paths for the two most common rates. They are not required for correctness, but they
  // skip the BigInt arithmetic entirely for customers keeping everything (or nothing).
  if (sampleRate === 100 || sampleRate === 0) {
    return sampleRate === 100
  }

  // Reuse the memoized decision only if it was computed for this very session.
  const memoized = sampleDecisionCache.get(sampleRate)
  if (memoized !== undefined && memoized.sessionId === sessionId) {
    return memoized.decision
  }

  const lastUuidGroup = sessionId.split('-')[4]
  const identifier = BigInt(`0x${lastUuidGroup}`)
  const decision = sampleUsingKnuthFactor(identifier, sampleRate)
  sampleDecisionCache.set(sampleRate, { sessionId, decision })
  return decision
}

/**
 * Drop every memoized sampling decision. Exported for tests, so that each spec can start from an
 * empty cache.
 */
export function resetSampleDecisionCache(): void {
  sampleDecisionCache.clear()
}

/**
 * Perform sampling using the Knuth factor method. This method offers consistent sampling results
 * based on the provided identifier: the same identifier and sample rate always yield the same
 * decision.
 *
 * @param identifier - The identifier to use for sampling. Expected to be non-negative: the BigInt
 * remainder keeps the sign of its dividend, so a negative identifier would produce a negative hash.
 * @param sampleRate - The sample rate in percentage between 0 and 100.
 * @returns `true` when the identifier is sampled at the given rate, `false` otherwise. A rate of
 * 0 (or below) never samples.
 */
export function sampleUsingKnuthFactor(identifier: bigint, sampleRate: number): boolean {
  // A rate of 0% must never sample. Without this guard, an identifier whose hash is exactly 0
  // (e.g. identifier 0) would pass the inclusive comparison below, since 0 <= 0.
  if (sampleRate <= 0) {
    return false
  }

  // The formula is:
  //
  //   (identifier * knuthFactor) % 2^64 <= (sampleRate / 100) * 2^64
  //
  // Because JavaScript numbers are 64-bit floats, we can't represent 64-bit integers, and the
  // modulo would be incorrect. Thus, we are using BigInts here.
  //
  // Implementation in other languages:
  // * Go     https://github.com/DataDog/dd-trace-go/blob/ec6fbb1f2d517b7b8e69961052adf7136f3af773/ddtrace/tracer/sampler.go#L86-L91
  // * Python https://github.com/DataDog/dd-trace-py/blob/0cee2f066fb6e79aa15947c1514c0f406dea47c5/ddtrace/sampling_rule.py#L197
  // * Ruby   https://github.com/DataDog/dd-trace-rb/blob/1a6e255cdcb7e7e22235ea5955f90f6dfa91045d/lib/datadog/tracing/sampling/rate_sampler.rb#L42
  // * C++    https://github.com/DataDog/dd-trace-cpp/blob/159629edc438ae45f2bb318eb7bd51abd05e94b5/src/datadog/trace_sampler.cpp#L58
  // * Java   https://github.com/DataDog/dd-trace-java/blob/896dd6b380533216e0bdee59614606c8272d313e/dd-trace-core/src/main/java/datadog/trace/common/sampling/DeterministicSampler.java#L48
  //
  // Note: All implementations have slight variations. Some of them use '<=' instead of '<', and
  // use `sampleRate * 2^64 - 1` instead of `sampleRate * 2^64`. Comparing against 2^64 keeps the
  // BigInt arithmetic minimal, and the final comparison is done on floating point numbers anyway,
  // where Number(2n**64n-1n) === Number(2n**64n). The inclusive '<=' guarantees that a 100% rate
  // samples every identifier, including those whose hash rounds up to 2^64.
  const knuthFactor = BigInt('1111111111111111111')
  const twoPow64 = BigInt('0x10000000000000000') // 2n ** 64n
  const hash = (identifier * knuthFactor) % twoPow64
  return Number(hash) <= (sampleRate / 100) * Number(twoPow64)
}
Loading