Skip to content

Commit 37eee51

Browse files
samwillis and kevin-dp
authored
d2mini: remove almost all hashing (#83)
* remove almost all hashing
* tidy
* fixes and formatting
* tidy tests
* Prefix string IDs such that they can't clash with null, undefined, or objects.
* Rename TieBreakerTaggedValue to TaggedValue

---------

Co-authored-by: Kevin De Porre <[email protected]>
1 parent e38b2d5 commit 37eee51

15 files changed

+1490
-1086
lines changed

packages/d2mini/src/indexes.ts

Lines changed: 17 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,19 @@
11
import { MultiSet } from './multiset.js'
2-
import { DefaultMap, hash } from './utils.js'
2+
import { DefaultMap } from './utils.js'
33

44
/**
55
* A map from a difference collection trace's keys -> (value, multiplicities) that changed.
66
* Used in operations like join and reduce where the operation needs to
77
* exploit the key-value structure of the data to run efficiently.
88
*/
99
export class Index<K, V> {
10-
#inner: DefaultMap<K, DefaultMap<string, [V, number]>>
10+
#inner: DefaultMap<K, Map<V, number>>
1111

1212
constructor() {
13-
this.#inner = new DefaultMap<K, DefaultMap<string, [V, number]>>(
14-
() =>
15-
new DefaultMap<string, [V, number]>(() => [undefined as any as V, 0]),
16-
)
17-
// #inner is as map of:
13+
this.#inner = new DefaultMap<K, Map<V, number>>(() => new Map<V, number>())
14+
// #inner is a map of:
1815
// {
19-
// [key]: {
20-
// [hash(value)]: [value, multiplicity]
21-
// }
16+
// [key]: Map<V, number> // Direct value-to-multiplicity mapping
2217
// }
2318
}
2419

@@ -32,14 +27,12 @@ export class Index<K, V> {
3227

3328
get(key: K): [V, number][] {
3429
const valueMap = this.#inner.get(key)
35-
return [...valueMap.values()]
30+
return [...valueMap.entries()]
3631
}
3732

3833
getMultiplicity(key: K, value: V): number {
3934
const valueMap = this.#inner.get(key)
40-
const valueHash = hash(value)
41-
const [, multiplicity] = valueMap.get(valueHash)
42-
return multiplicity
35+
return valueMap.get(value) ?? 0
4336
}
4437

4538
entries() {
@@ -61,31 +54,28 @@ export class Index<K, V> {
6154
addValue(key: K, value: [V, number]): void {
6255
const [val, multiplicity] = value
6356
const valueMap = this.#inner.get(key)
64-
const valueHash = hash(val)
65-
const [, existingMultiplicity] = valueMap.get(valueHash)
57+
const existingMultiplicity = valueMap.get(val) ?? 0
6658
const newMultiplicity = existingMultiplicity + multiplicity
59+
6760
if (multiplicity !== 0) {
6861
if (newMultiplicity === 0) {
69-
valueMap.delete(valueHash)
62+
valueMap.delete(val)
7063
} else {
71-
valueMap.set(valueHash, [val, newMultiplicity])
64+
valueMap.set(val, newMultiplicity)
7265
}
7366
}
7467
}
7568

7669
append(other: Index<K, V>): void {
7770
for (const [key, otherValueMap] of other.entries()) {
7871
const thisValueMap = this.#inner.get(key)
79-
for (const [
80-
valueHash,
81-
[value, multiplicity],
82-
] of otherValueMap.entries()) {
83-
const [, existingMultiplicity] = thisValueMap.get(valueHash)
72+
for (const [value, multiplicity] of otherValueMap.entries()) {
73+
const existingMultiplicity = thisValueMap.get(value) ?? 0
8474
const newMultiplicity = existingMultiplicity + multiplicity
8575
if (newMultiplicity === 0) {
86-
thisValueMap.delete(valueHash)
76+
thisValueMap.delete(value)
8777
} else {
88-
thisValueMap.set(valueHash, [value, newMultiplicity])
78+
thisValueMap.set(value, newMultiplicity)
8979
}
9080
}
9181
}
@@ -100,7 +90,7 @@ export class Index<K, V> {
10090
for (const [key, valueMap] of this.entries()) {
10191
if (!other.has(key)) continue
10292
const otherValues = other.get(key)
103-
for (const [val1, mul1] of valueMap.values()) {
93+
for (const [val1, mul1] of valueMap.entries()) {
10494
for (const [val2, mul2] of otherValues) {
10595
if (mul1 !== 0 && mul2 !== 0) {
10696
result.push([[key, [val1, val2]], mul1 * mul2])
@@ -112,7 +102,7 @@ export class Index<K, V> {
112102
for (const [key, otherValueMap] of other.entries()) {
113103
if (!this.has(key)) continue
114104
const values = this.get(key)
115-
for (const [val2, mul2] of otherValueMap.values()) {
105+
for (const [val2, mul2] of otherValueMap.entries()) {
116106
for (const [val1, mul1] of values) {
117107
if (mul1 !== 0 && mul2 !== 0) {
118108
result.push([[key, [val1, val2]], mul1 * mul2])

packages/d2mini/src/multiset.ts

Lines changed: 101 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,9 @@
1-
import { DefaultMap, chunkedArrayPush, hash } from './utils.js'
1+
import {
2+
DefaultMap,
3+
chunkedArrayPush,
4+
hash,
5+
globalObjectIdGenerator,
6+
} from './utils.js'
27

38
export type MultiSetArray<T> = [T, number][]
49
export type KeyedData<T> = [key: string, value: T]
@@ -66,6 +71,101 @@ export class MultiSet<T> {
6671
* (record, multiplicity) pair.
6772
*/
6873
consolidate(): MultiSet<T> {
74+
// Check if this looks like a keyed multiset (first item is a tuple of length 2)
75+
if (this.#inner.length > 0) {
76+
const firstItem = this.#inner[0][0]
77+
if (Array.isArray(firstItem) && firstItem.length === 2) {
78+
return this.#consolidateKeyed()
79+
}
80+
}
81+
82+
// Fall back to original method for unkeyed data
83+
return this.#consolidateUnkeyed()
84+
}
85+
86+
/**
87+
* Private method for consolidating keyed multisets where keys are strings/numbers
88+
* and values are compared by reference equality.
89+
*
90+
* This method provides significant performance improvements over the hash-based approach
91+
* by using WeakMap for object reference tracking and avoiding expensive serialization.
92+
*
93+
* Special handling for join operations: When values are tuples of length 2 (common in joins),
94+
* we unpack them and compare each element individually to maintain proper equality semantics.
95+
*/
96+
#consolidateKeyed(): MultiSet<T> {
97+
const consolidated = new Map<string, number>()
98+
const values = new Map<string, T>()
99+
100+
// Use global object ID generator for consistent reference equality
101+
102+
/**
103+
* Special handler for tuples (arrays of length 2) commonly produced by join operations.
104+
* Unpacks the tuple and generates an ID based on both elements to ensure proper
105+
* consolidation of join results like ['A', null] and [null, 'X'].
106+
*/
107+
const getTupleId = (tuple: any[]): string => {
108+
if (tuple.length !== 2) {
109+
throw new Error('Expected tuple of length 2')
110+
}
111+
const [first, second] = tuple
112+
return `${globalObjectIdGenerator.getStringId(first)}|${globalObjectIdGenerator.getStringId(second)}`
113+
}
114+
115+
// Process each item in the multiset
116+
for (const [data, multiplicity] of this.#inner) {
117+
// Verify this is still a keyed item (should be [key, value] pair)
118+
if (!Array.isArray(data) || data.length !== 2) {
119+
// Found non-keyed item, fall back to unkeyed consolidation
120+
return this.#consolidateUnkeyed()
121+
}
122+
123+
const [key, value] = data
124+
125+
// Verify key is string or number as expected for keyed multisets
126+
if (typeof key !== 'string' && typeof key !== 'number') {
127+
// Found non-string/number key, fall back to unkeyed consolidation
128+
return this.#consolidateUnkeyed()
129+
}
130+
131+
// Generate value ID with special handling for join tuples
132+
let valueId: string
133+
if (Array.isArray(value) && value.length === 2) {
134+
// Special case: value is a tuple from join operations
135+
valueId = getTupleId(value)
136+
} else {
137+
// Regular case: use reference/value equality
138+
valueId = globalObjectIdGenerator.getStringId(value)
139+
}
140+
141+
// Create composite key and consolidate
142+
const compositeKey = key + '|' + valueId
143+
consolidated.set(
144+
compositeKey,
145+
(consolidated.get(compositeKey) || 0) + multiplicity,
146+
)
147+
148+
// Store the original data for the first occurrence
149+
if (!values.has(compositeKey)) {
150+
values.set(compositeKey, data as T)
151+
}
152+
}
153+
154+
// Build result array, filtering out zero multiplicities
155+
const result: MultiSetArray<T> = []
156+
for (const [compositeKey, multiplicity] of consolidated) {
157+
if (multiplicity !== 0) {
158+
result.push([values.get(compositeKey)!, multiplicity])
159+
}
160+
}
161+
162+
return new MultiSet(result)
163+
}
164+
165+
/**
166+
* Private method for consolidating unkeyed multisets using the original approach.
167+
*/
168+
#consolidateUnkeyed(): MultiSet<T> {
69169
const consolidated = new DefaultMap<string | number, number>(() => 0)
70170
const values = new Map<string, any>()
71171

packages/d2mini/src/operators/reduce.ts

Lines changed: 21 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,6 @@ import {
77
import { StreamBuilder } from '../d2.js'
88
import { MultiSet } from '../multiset.js'
99
import { Index } from '../indexes.js'
10-
import { hash } from '../utils.js'
1110

1211
/**
1312
* Base operator for reduction operations (version-free)
@@ -45,73 +44,52 @@ export class ReduceOperator<K, V1, V2> extends UnaryOperator<[K, V1], [K, V2]> {
4544
const currOut = this.#indexOut.get(key)
4645
const out = this.#f(curr)
4746

48-
// Create maps for current and previous outputs
49-
const newOutputMap = new Map<
50-
string,
51-
{ value: V2; multiplicity: number }
52-
>()
53-
const oldOutputMap = new Map<
54-
string,
55-
{ value: V2; multiplicity: number }
56-
>()
47+
// Create maps for current and previous outputs using values directly as keys
48+
const newOutputMap = new Map<V2, number>()
49+
const oldOutputMap = new Map<V2, number>()
5750

5851
// Process new output
5952
for (const [value, multiplicity] of out) {
60-
const valueKey = hash(value)
61-
if (newOutputMap.has(valueKey)) {
62-
newOutputMap.get(valueKey)!.multiplicity += multiplicity
63-
} else {
64-
newOutputMap.set(valueKey, { value, multiplicity })
65-
}
53+
const existing = newOutputMap.get(value) ?? 0
54+
newOutputMap.set(value, existing + multiplicity)
6655
}
6756

6857
// Process previous output
6958
for (const [value, multiplicity] of currOut) {
70-
const valueKey = hash(value)
71-
if (oldOutputMap.has(valueKey)) {
72-
oldOutputMap.get(valueKey)!.multiplicity += multiplicity
73-
} else {
74-
oldOutputMap.set(valueKey, { value, multiplicity })
75-
}
59+
const existing = oldOutputMap.get(value) ?? 0
60+
oldOutputMap.set(value, existing + multiplicity)
7661
}
7762

78-
const commonKeys = new Set<string>()
79-
8063
// First, emit removals for old values that are no longer present
81-
for (const [valueKey, { value, multiplicity }] of oldOutputMap) {
82-
const newEntry = newOutputMap.get(valueKey)
83-
if (!newEntry) {
64+
for (const [value, multiplicity] of oldOutputMap) {
65+
if (!newOutputMap.has(value)) {
8466
// Remove the old value entirely
8567
result.push([[key, value], -multiplicity])
8668
this.#indexOut.addValue(key, [value, -multiplicity])
87-
} else {
88-
commonKeys.add(valueKey)
8969
}
9070
}
9171

9272
// Then, emit additions for new values that are not present in old
93-
for (const [valueKey, { value, multiplicity }] of newOutputMap) {
94-
const oldEntry = oldOutputMap.get(valueKey)
95-
if (!oldEntry) {
73+
for (const [value, multiplicity] of newOutputMap) {
74+
if (!oldOutputMap.has(value)) {
9675
// Add the new value only if it has non-zero multiplicity
9776
if (multiplicity !== 0) {
9877
result.push([[key, value], multiplicity])
9978
this.#indexOut.addValue(key, [value, multiplicity])
10079
}
101-
} else {
102-
commonKeys.add(valueKey)
10380
}
10481
}
10582

106-
// Then, emit multiplicity changes for values that were present and are still present
107-
for (const valueKey of commonKeys) {
108-
const newEntry = newOutputMap.get(valueKey)
109-
const oldEntry = oldOutputMap.get(valueKey)
110-
const delta = newEntry!.multiplicity - oldEntry!.multiplicity
111-
// Only emit actual changes, i.e. non-zero deltas
112-
if (delta !== 0) {
113-
result.push([[key, newEntry!.value], delta])
114-
this.#indexOut.addValue(key, [newEntry!.value, delta])
83+
// Finally, emit multiplicity changes for values that were present and are still present
84+
for (const [value, newMultiplicity] of newOutputMap) {
85+
const oldMultiplicity = oldOutputMap.get(value)
86+
if (oldMultiplicity !== undefined) {
87+
const delta = newMultiplicity - oldMultiplicity
88+
// Only emit actual changes, i.e. non-zero deltas
89+
if (delta !== 0) {
90+
result.push([[key, value], delta])
91+
this.#indexOut.addValue(key, [value, delta])
92+
}
11593
}
11694
}
11795
}

0 commit comments

Comments
 (0)