From 4f1dcbcfff7af35b2a41309698aaaad89a78f7e4 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Sat, 13 Sep 2025 17:35:24 +0100 Subject: [PATCH 01/10] impliment prefixed index to remove hashing from joins initial load --- packages/db-ivm/src/hashIndex.ts | 94 --------- packages/db-ivm/src/indexes.ts | 318 +++++++++++++++++++++--------- packages/db-ivm/src/valueIndex.ts | 78 -------- 3 files changed, 220 insertions(+), 270 deletions(-) delete mode 100644 packages/db-ivm/src/hashIndex.ts delete mode 100644 packages/db-ivm/src/valueIndex.ts diff --git a/packages/db-ivm/src/hashIndex.ts b/packages/db-ivm/src/hashIndex.ts deleted file mode 100644 index cc9df1b0c..000000000 --- a/packages/db-ivm/src/hashIndex.ts +++ /dev/null @@ -1,94 +0,0 @@ -import { DefaultMap } from "./utils.js" -import { hash } from "./hashing/index.js" -import type { Hash } from "./hashing/index.js" - -/** - * A map from a difference collection trace's keys -> (value, multiplicities) that changed. - * Used in operations like join and reduce where the operation needs to - * exploit the key-value structure of the data to run efficiently. - */ -export class HashIndex { - #inner: DefaultMap> - - constructor() { - this.#inner = new DefaultMap>( - () => new DefaultMap(() => [undefined as any as V, 0]) - ) - // #inner is as map of: - // { - // [key]: { - // [hash(value)]: [value, multiplicity] - // } - // } - } - - toString(indent = false): string { - return `HashIndex(${JSON.stringify( - [...this.#inner].map(([k, valueMap]) => [k, [...valueMap]]), - undefined, - indent ? 2 : undefined - )})` - } - - get(key: K): Array<[V, number]> { - const valueMap = this.#inner.get(key) - return [...valueMap.values()] - } - - getMultiplicity(key: K, value: V): number { - const valueMap = this.#inner.get(key) - const valueHash = hash(value) - const [, multiplicity] = valueMap.get(valueHash) - return multiplicity - } - - entries() { - return this.#inner.entries() - } - - *entriesIterator(): Generator<[K, [V, number]]> { - for (const [key, valueMap] of this.#inner.entries()) { - for (const [_valueHash, [value, multiplicity]] of valueMap.entries()) { - yield [key, [value, multiplicity]] - } - } - } - - has(key: K): boolean { - return this.#inner.has(key) - } - - delete(key: K): void { - this.#inner.delete(key) - } - - get size(): number { - return this.#inner.size - } - - /** - * Adds a value to the index and does not return anything - * except if the addition caused the value to be removed - * and the key to be left with only a single value. - * In that case, we return the single remaining value. - */ - addValue(key: K, value: [V, number]): [V, number] | void { - const [val, multiplicity] = value - const valueMap = this.#inner.get(key) - const valueHash = hash(val) - const [, existingMultiplicity] = valueMap.get(valueHash) - const newMultiplicity = existingMultiplicity + multiplicity - if (multiplicity !== 0) { - if (newMultiplicity === 0) { - valueMap.delete(valueHash) - if (valueMap.size === 1) { - // Signal that the key only has a single remaining value - return valueMap.entries().next().value![1] - } - } else { - valueMap.set(valueHash, [val, newMultiplicity]) - } - } - this.#inner.set(key, valueMap) - } -} diff --git a/packages/db-ivm/src/indexes.ts b/packages/db-ivm/src/indexes.ts index 27131fc29..16be2a57b 100644 --- a/packages/db-ivm/src/indexes.ts +++ b/packages/db-ivm/src/indexes.ts @@ -1,60 +1,77 @@ import { MultiSet } from "./multiset.js" -import { HashIndex } from "./hashIndex.js" -import { ValueIndex } from "./valueIndex.js" -import { concatIterable, mapIterable } from "./utils.js" - -/** - * A map from a difference collection trace's keys -> (value, multiplicities) that changed. - * Used in operations like join and reduce where the operation needs to - * exploit the key-value structure of the data to run efficiently. - */ -export class Index { - /* - * This is a hybrid Index that composes a ValueIndex and a HashIndex. - * Keys that have only one value are stored in the ValueIndex. - * Keys that have multiple values are stored in the HashIndex, the hash distinguishes between the values. - * This reduces the amount of hashes we need to compute since often times only a small portion of the keys are updated - * so we don't have to hash the keys that are never updated. - * - * Note: The `valueIndex` and `hashIndex` have disjoint keys. - * When a key that has only one value gets a new distinct value, - * it is added to the `hashIndex` and removed from the `valueIndex` and vice versa. - */ - #valueIndex: ValueIndex - #hashIndex: HashIndex +import { hash } from "./hashing/index.js" + +const NO_PREFIX = Symbol(`NO_PREFIX`) +type NO_PREFIX = typeof NO_PREFIX + +type Hash = number +type SingleValue = [TValue, number] +type IndexMap = Map< + TKey, + SingleValue | PrefixMap +> +type PrefixMap = Map< + TPrefix | NO_PREFIX, + SingleValue | ValueMap +> +type ValueMap = Map + +export class Index { + #inner: IndexMap constructor() { - this.#valueIndex = new ValueIndex() - this.#hashIndex = new HashIndex() + this.#inner = new Map() } toString(indent = false): string { - return `Index(\n ${this.#valueIndex.toString(indent)},\n ${this.#hashIndex.toString(indent)}\n)` + return `Index(${JSON.stringify( + [...this.entries()], + undefined, + indent ? 2 : undefined + )})` } - get(key: K): Array<[V, number]> { - if (this.#valueIndex.has(key)) { - return [this.#valueIndex.get(key)!] - } - return this.#hashIndex.get(key) + get size(): number { + return this.#inner.size } - getMultiplicity(key: K, value: V): number { - if (this.#valueIndex.has(key)) { - return this.#valueIndex.getMultiplicity(key) + has(key: TKey): boolean { + return this.#inner.has(key) + } + + get(key: TKey): Array<[TValue, number]> { + return [...this.getIterator(key)] + } + + *getIterator(key: TKey): Iterable<[TValue, number]> { + const prefixMapOrSingleValue = this.#inner.get(key) + if (isSingleValue(prefixMapOrSingleValue)) { + yield prefixMapOrSingleValue + } else if (prefixMapOrSingleValue === undefined) { + return + } else { + for (const singleValueOrValueMap of prefixMapOrSingleValue.values()) { + if (isSingleValue(singleValueOrValueMap)) { + yield singleValueOrValueMap + } else { + for (const valueTuple of singleValueOrValueMap.values()) { + yield valueTuple + } + } + } } - return this.#hashIndex.getMultiplicity(key, value) } /** * This returns an iterator that iterates over all key-value pairs. * @returns An iterable of all key-value pairs (and their multiplicities) in the index. */ - #entries(): Iterable<[K, [V, number]]> { - return concatIterable( - this.#valueIndex.entries(), - this.#hashIndex.entriesIterator() - ) + *entries(): Iterable<[TKey, [TValue, number]]> { + for (const key of this.#inner.keys()) { + for (const valueTuple of this.getIterator(key)) { + yield [key, valueTuple] + } + } } /** @@ -63,86 +80,168 @@ export class Index { * It returns an iterator that you can use if you need to iterate over the values for a given key. * @returns An iterator of all *keys* in the index and their corresponding value iterator. */ - *#entriesIterators(): Iterable<[K, Iterable<[V, number]>]> { - for (const [key, [value, multiplicity]] of this.#valueIndex.entries()) { - yield [key, new Map([[value, multiplicity]])] - } - for (const [key, valueMap] of this.#hashIndex.entries()) { - yield [ - key, - mapIterable(valueMap, ([_hash, [value, multiplicity]]) => [ - value, - multiplicity, - ]), - ] + *#entriesIterators(): Iterable<[TKey, Iterable<[TValue, number]>]> { + for (const key of this.#inner.keys()) { + yield [key, this.getIterator(key)] } } - has(key: K): boolean { - return this.#valueIndex.has(key) || this.#hashIndex.has(key) - } + addValue(key: TKey, valueTuple: SingleValue) { + const [value, multiplicity] = valueTuple + // If the multiplicity is 0, do nothing + if (multiplicity === 0) return - get size(): number { - return this.#valueIndex.size + this.#hashIndex.size - } + const prefixMapOrSingleValue = this.#inner.get(key) - addValue(key: K, value: [V, number]): void { - const containedInValueIndex = this.#valueIndex.has(key) - const containedInHashIndex = this.#hashIndex.has(key) + if (prefixMapOrSingleValue === undefined) { + // This is the first time we see a value for this key we just insert it + // into the index as a single value tuple + this.#inner.set(key, valueTuple) + return + } - if (containedInHashIndex && containedInValueIndex) { - throw new Error( - `Key ${key} is contained in both the value index and the hash index. This should never happen because they should have disjoint keysets.` - ) + const [currentSingleValueForKey, prefixMap] = isSingleValue( + prefixMapOrSingleValue + ) + ? [prefixMapOrSingleValue, undefined] + : [undefined, prefixMapOrSingleValue] + + if (currentSingleValueForKey) { + const [currentValue, currentMultiplicity] = currentSingleValueForKey + // We have a single value for this key, lets check if this is the same value + // and if so we just update the multiplicity. This is a check if its the same + // literal value or object reference. + if (currentValue === value) { + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + this.#inner.delete(key) + } else { + this.#inner.set(key, [value, newMultiplicity]) + } + return + } } - if (!containedInValueIndex && !containedInHashIndex) { - // This is the first time we see the key - // Add it to the value index - this.#valueIndex.addValue(key, value) - return + // Get the prefix of the new value + const [prefix, suffix] = getPrefix(value) + + if (currentSingleValueForKey) { + const [currentValue, currentMultiplicity] = currentSingleValueForKey + const [currentPrefix, currentSuffix] = getPrefix( + currentValue + ) + if ( + currentPrefix === prefix && + (currentSuffix === suffix || hash(currentSuffix) === hash(suffix)) + ) { + // They are the same value, so we just update the multiplicity + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + this.#inner.delete(key) + } else { + this.#inner.set(key, [value, newMultiplicity]) + } + return + } else { + // They are different values, so we need to move the current value to a + // new prefix map + const newPrefixMap = new Map< + TPrefix | NO_PREFIX, + SingleValue | ValueMap + >() + this.#inner.set(key, newPrefixMap) + + if (currentPrefix === prefix) { + // They have the same prefix but different suffixes, so we need to add a + // value map for this suffix to the prefix map + const valueMap = new Map() + valueMap.set(hash(currentSuffix), currentSingleValueForKey) + valueMap.set(hash(suffix), valueTuple) + newPrefixMap.set(currentPrefix, valueMap) + } else { + // They have different prefixes, so we can add then as singe values to the + // prefix map + newPrefixMap.set(currentPrefix, currentSingleValueForKey) + newPrefixMap.set(prefix, valueTuple) + } + return + } } - if (containedInValueIndex) { - // This key is already in the value index - // It could be that it's the same value or a different one - // If it's a different value we will need to remove the key from the value index - // and add the key and its two values to the hash index - try { - this.#valueIndex.addValue(key, value) - } catch { - // This is a different value, need to move the key to the hash index - const existingValue = this.#valueIndex.get(key)! - this.#valueIndex.delete(key) - this.#hashIndex.addValue(key, existingValue) - this.#hashIndex.addValue(key, value) + // At this point there is a prefix map for this key, we need the value map or + // single value for this prefix + const valueMapOrSingleValue = prefixMap.get(prefix) + + const [valueMap, currentSingleValueForPrefix] = isSingleValue( + valueMapOrSingleValue + ) + ? [undefined, valueMapOrSingleValue] + : [valueMapOrSingleValue, undefined] + + if (currentSingleValueForPrefix) { + const [currentValue, currentMultiplicity] = currentSingleValueForPrefix + const [currentPrefix, currentSuffix] = getPrefix( + currentValue + ) + if (currentPrefix !== prefix) { + throw new Error(`Mismatching prefixes, this should never happen`) + } + if (currentSuffix === suffix || hash(currentSuffix) === hash(suffix)) { + // They are the same value, so we just update the multiplicity + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + prefixMap.delete(prefix) + } else { + prefixMap.set(prefix, [value, newMultiplicity]) + } + return + } else { + // They have different suffixes, so we need to add a value map for this suffix + // to the prefix map + const valueMap = new Map() + valueMap.set(hash(currentSuffix), currentSingleValueForPrefix) + valueMap.set(hash(suffix), valueTuple) + prefixMap.set(prefix, valueMap) + return } + } + + // At this point there was no single value for the prefix, there *may* be + // a value map for this prefix. If there is not, we can just add the new value + // as a single value to the prefix map + if (!valueMap) { + prefixMap.set(prefix, valueTuple) return } - if (containedInHashIndex) { - // This key is already in the hash index so it already has two or more values. - // However, this new value and multiplicity could cause an existing value to be removed - // and lead to the key having only a single value in which case we need to move it back to the value index - const singleRemainingValue = this.#hashIndex.addValue(key, value) - if (singleRemainingValue) { - // The key only has a single remaining value so we need to move it back to the value index - this.#hashIndex.delete(key) - this.#valueIndex.addValue(key, singleRemainingValue) + // We now know there is a value map for this prefix, we need see if there is a + // current value for the suffix. If there is, we update the multiplicity, otherwise + // we add the new value as a single value to the value map + const suffixHash = hash(suffix) + const currentValueForSuffix = valueMap.get(suffixHash) + if (currentValueForSuffix) { + const [, currentMultiplicity] = currentValueForSuffix + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + valueMap.delete(suffixHash) + } else { + valueMap.set(suffixHash, [value, newMultiplicity]) } - return + } else { + valueMap.set(suffixHash, valueTuple) } } - append(other: Index): void { - for (const [key, value] of other.#entries()) { + append(other: Index): void { + for (const [key, value] of other.entries()) { this.addValue(key, value) } } - join(other: Index): MultiSet<[K, [V, V2]]> { - const result: Array<[[K, [V, V2]], number]> = [] - + join( + other: Index + ): MultiSet<[TKey, [TValue, TValue2]]> { + const result: Array<[[TKey, [TValue, TValue2]], number]> = [] // We want to iterate over the smaller of the two indexes to reduce the // number of operations we need to do. if (this.size <= other.size) { @@ -174,3 +273,26 @@ export class Index { return new MultiSet(result) } } + +function getPrefix( + value: TValue +): [TPrefix | NO_PREFIX, TValue] { + // If the value is an array of two elements and the first element is a string + // or number, then the first element is the prefix. This is used to distinguish + // between values without the need for hashing unless there are multiple values + // for the same prefix. + if ( + Array.isArray(value) && + value.length === 2 && + (typeof value[0] === `string` || typeof value[0] === `number`) + ) { + return [value[0] as TPrefix, value[1] as TValue] + } + return [NO_PREFIX, value] +} + +function isSingleValue( + value: SingleValue | unknown +): value is SingleValue { + return Array.isArray(value) +} diff --git a/packages/db-ivm/src/valueIndex.ts b/packages/db-ivm/src/valueIndex.ts deleted file mode 100644 index 2470e7aa8..000000000 --- a/packages/db-ivm/src/valueIndex.ts +++ /dev/null @@ -1,78 +0,0 @@ -import { hash } from "./hashing/index.js" - -/** - * A map from a difference collection trace's keys -> (value, multiplicities) that changed. - * Used in operations like join and reduce where the operation needs to - * exploit the key-value structure of the data to run efficiently. - */ -export class ValueIndex { - #inner: Map // Maps key to the value and its multiplicity - - constructor() { - this.#inner = new Map() - } - - toString(indent = false): string { - return `ValueIndex(${JSON.stringify( - [...this.#inner.entries()], - undefined, - indent ? 2 : undefined - )})` - } - - get(key: K): [V, number] | undefined { - return this.#inner.get(key) - } - - getMultiplicity(key: K): number { - return this.get(key)?.[1] ?? 0 - } - - entries() { - return this.#inner.entries() - } - - has(key: K): boolean { - return this.#inner.has(key) - } - - delete(key: K): void { - this.#inner.delete(key) - } - - get size(): number { - return this.#inner.size - } - - addValue(key: K, v: [V, number]): void { - const [value, multiplicity] = v - - if (multiplicity === 0) { - return - } - - if (this.has(key)) { - const [currValue, currMultiplicity] = this.get(key)! - if (hash(value) === hash(currValue)) { - // Update the multiplicity - this.#setMultiplicity(key, value, currMultiplicity + multiplicity) - return - } - // Different value, not allowed. - // ValueIndex only supports one value per key. - throw new Error( - `Cannot add value for key ${key} because it already exists in ValueIndex with a different value` - ) - } - - this.#inner.set(key, [value, multiplicity]) - } - - #setMultiplicity(key: K, value: V, multiplicity: number): void { - if (multiplicity === 0) { - this.#inner.delete(key) - } else { - this.#inner.set(key, [value, multiplicity]) - } - } -} From a6b2ea9932014fc914a3fd721af4ece8f1bfa913 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Sat, 13 Sep 2025 19:24:42 +0100 Subject: [PATCH 02/10] comments --- packages/db-ivm/src/indexes.ts | 79 ++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/packages/db-ivm/src/indexes.ts b/packages/db-ivm/src/indexes.ts index 16be2a57b..f3e11541a 100644 --- a/packages/db-ivm/src/indexes.ts +++ b/packages/db-ivm/src/indexes.ts @@ -1,28 +1,57 @@ import { MultiSet } from "./multiset.js" import { hash } from "./hashing/index.js" +import type { Hash } from "./hashing/index.js" +// We use a symbol to represent the absence of a prefix, unprefixed values a stored +// against this key. const NO_PREFIX = Symbol(`NO_PREFIX`) type NO_PREFIX = typeof NO_PREFIX -type Hash = number +// A single value is a tuple of the value and the multiplicity. type SingleValue = [TValue, number] + +// Base map type for the index. Stores single values or prefix maps against a key. type IndexMap = Map< TKey, SingleValue | PrefixMap > + +// Second level map type for the index, stores single values or value maps against a prefix. type PrefixMap = Map< TPrefix | NO_PREFIX, SingleValue | ValueMap > + +// Third level map type for the index, stores single values or value maps against a hash. type ValueMap = Map +/** + * A map from a difference collection trace's keys -> (value, multiplicities) that changed. + * Used in operations like join and reduce where the operation needs to + * exploit the key-value structure of the data to run efficiently. + */ export class Index { + /* + * This index maintains a nested map of keys -> (value, multiplicities), where: + * - initially the values are stored against the key as a single value tuple + * - when a key gets additional values, the values are stored against the key in a + * prefix map + * - the prefix is extract where possible from values that are structured as + * [rowPrimaryKey, rowValue], as they are in the Tanstack DB query pipeline. + * - only when there are multiple values for a given prefix do we fall back to a + * hash to identify identical values, storing them in a third level value map. + */ #inner: IndexMap constructor() { this.#inner = new Map() } + /** + * This method returns a string representation of the index. + * @param indent - Whether to indent the string representation. + * @returns A string representation of the index. + */ toString(indent = false): string { return `Index(${JSON.stringify( [...this.entries()], @@ -31,18 +60,36 @@ export class Index { )})` } + /** + * The size of the index. + */ get size(): number { return this.#inner.size } + /** + * This method checks if the index has a given key. + * @param key - The key to check. + * @returns True if the index has the key, false otherwise. + */ has(key: TKey): boolean { return this.#inner.has(key) } + /** + * This method returns all values for a given key. + * @param key - The key to get the values for. + * @returns An array of value tuples [value, multiplicity]. + */ get(key: TKey): Array<[TValue, number]> { return [...this.getIterator(key)] } + /** + * This method returns an iterator over all values for a given key. + * @param key - The key to get the values for. + * @returns An iterator of value tuples [value, multiplicity]. + */ *getIterator(key: TKey): Iterable<[TValue, number]> { const prefixMapOrSingleValue = this.#inner.get(key) if (isSingleValue(prefixMapOrSingleValue)) { @@ -80,12 +127,17 @@ export class Index { * It returns an iterator that you can use if you need to iterate over the values for a given key. * @returns An iterator of all *keys* in the index and their corresponding value iterator. */ - *#entriesIterators(): Iterable<[TKey, Iterable<[TValue, number]>]> { + *entriesIterators(): Iterable<[TKey, Iterable<[TValue, number]>]> { for (const key of this.#inner.keys()) { yield [key, this.getIterator(key)] } } + /** + * This method adds a value to the index. + * @param key - The key to add the value to. + * @param valueTuple - The value tuple [value, multiplicity] to add to the index. + */ addValue(key: TKey, valueTuple: SingleValue) { const [value, multiplicity] = valueTuple // If the multiplicity is 0, do nothing @@ -232,12 +284,21 @@ export class Index { } } + /** + * This method appends another index to the current index. + * @param other - The index to append to the current index. + */ append(other: Index): void { for (const [key, value] of other.entries()) { this.addValue(key, value) } } + /** + * This method joins two indexes. + * @param other - The index to join with the current index. + * @returns A multiset of the joined values. + */ join( other: Index ): MultiSet<[TKey, [TValue, TValue2]]> { @@ -245,7 +306,7 @@ export class Index { // We want to iterate over the smaller of the two indexes to reduce the // number of operations we need to do. if (this.size <= other.size) { - for (const [key, valueIt] of this.#entriesIterators()) { + for (const [key, valueIt] of this.entriesIterators()) { if (!other.has(key)) continue const otherValues = other.get(key) for (const [val1, mul1] of valueIt) { @@ -257,7 +318,7 @@ export class Index { } } } else { - for (const [key, otherValueIt] of other.#entriesIterators()) { + for (const [key, otherValueIt] of other.entriesIterators()) { if (!this.has(key)) continue const values = this.get(key) for (const [val2, mul2] of otherValueIt) { @@ -274,6 +335,11 @@ export class Index { } } +/** + * This function extracts the prefix from a value. + * @param value - The value to extract the prefix from. + * @returns The prefix and the suffix. + */ function getPrefix( value: TValue ): [TPrefix | NO_PREFIX, TValue] { @@ -291,6 +357,11 @@ function getPrefix( return [NO_PREFIX, value] } +/** + * This function checks if a value is a single value. + * @param value - The value to check. + * @returns True if the value is a single value, false otherwise. + */ function isSingleValue( value: SingleValue | unknown ): value is SingleValue { From d583ced5f0672e6de6f8d6744e544aa5f2f1ad24 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Sat, 13 Sep 2025 19:27:31 +0100 Subject: [PATCH 03/10] changeset --- .changeset/odd-mangos-pick.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/odd-mangos-pick.md diff --git a/.changeset/odd-mangos-pick.md b/.changeset/odd-mangos-pick.md new file mode 100644 index 000000000..f9f839817 --- /dev/null +++ b/.changeset/odd-mangos-pick.md @@ -0,0 +1,5 @@ +--- +"@tanstack/db-ivm": patch +--- + +Change the ivm indexes to use a three level `key->prefix->hash->value` structure, only falling back to structural hashing when there are multiple values for a single prefix. This removes all hashing during the initial run of a query delivering a 2-3x speedup. From 0b2785cfe5b2ea38f8345f5e847fb3ac5b7fafc1 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Thu, 18 Sep 2025 12:57:55 +0100 Subject: [PATCH 04/10] Allow ValueMap for a key without a PrefixMap --- packages/db-ivm/src/indexes.ts | 124 ++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 39 deletions(-) diff --git a/packages/db-ivm/src/indexes.ts b/packages/db-ivm/src/indexes.ts index f3e11541a..5415186a4 100644 --- a/packages/db-ivm/src/indexes.ts +++ b/packages/db-ivm/src/indexes.ts @@ -10,20 +10,20 @@ type NO_PREFIX = typeof NO_PREFIX // A single value is a tuple of the value and the multiplicity. type SingleValue = [TValue, number] -// Base map type for the index. Stores single values or prefix maps against a key. +// Base map type for the index. Stores single values, prefix maps, or value maps against a key. type IndexMap = Map< TKey, - SingleValue | PrefixMap + SingleValue | PrefixMap | ValueMap > // Second level map type for the index, stores single values or value maps against a prefix. -type PrefixMap = Map< +class PrefixMap extends Map< TPrefix | NO_PREFIX, SingleValue | ValueMap -> +> {} // Third level map type for the index, stores single values or value maps against a hash. -type ValueMap = Map +class ValueMap extends Map {} /** * A map from a difference collection trace's keys -> (value, multiplicities) that changed. @@ -91,13 +91,19 @@ export class Index { * @returns An iterator of value tuples [value, multiplicity]. */ *getIterator(key: TKey): Iterable<[TValue, number]> { - const prefixMapOrSingleValue = this.#inner.get(key) - if (isSingleValue(prefixMapOrSingleValue)) { - yield prefixMapOrSingleValue - } else if (prefixMapOrSingleValue === undefined) { + const mapOrSingleValue = this.#inner.get(key) + if (isSingleValue(mapOrSingleValue)) { + yield mapOrSingleValue + } else if (mapOrSingleValue === undefined) { return + } else if (mapOrSingleValue instanceof ValueMap) { + // Direct ValueMap - all values have NO_PREFIX + for (const valueTuple of mapOrSingleValue.values()) { + yield valueTuple + } } else { - for (const singleValueOrValueMap of prefixMapOrSingleValue.values()) { + // PrefixMap - iterate through all prefixes + for (const singleValueOrValueMap of mapOrSingleValue.values()) { if (isSingleValue(singleValueOrValueMap)) { yield singleValueOrValueMap } else { @@ -143,23 +149,17 @@ export class Index { // If the multiplicity is 0, do nothing if (multiplicity === 0) return - const prefixMapOrSingleValue = this.#inner.get(key) + const mapOrSingleValue = this.#inner.get(key) - if (prefixMapOrSingleValue === undefined) { + if (mapOrSingleValue === undefined) { // This is the first time we see a value for this key we just insert it // into the index as a single value tuple this.#inner.set(key, valueTuple) return } - const [currentSingleValueForKey, prefixMap] = isSingleValue( - prefixMapOrSingleValue - ) - ? [prefixMapOrSingleValue, undefined] - : [undefined, prefixMapOrSingleValue] - - if (currentSingleValueForKey) { - const [currentValue, currentMultiplicity] = currentSingleValueForKey + if (isSingleValue(mapOrSingleValue)) { + const [currentValue, currentMultiplicity] = mapOrSingleValue // We have a single value for this key, lets check if this is the same value // and if so we just update the multiplicity. This is a check if its the same // literal value or object reference. @@ -172,16 +172,13 @@ export class Index { } return } - } - - // Get the prefix of the new value - const [prefix, suffix] = getPrefix(value) - if (currentSingleValueForKey) { - const [currentValue, currentMultiplicity] = currentSingleValueForKey + // Get the prefix of both values + const [prefix, suffix] = getPrefix(value) const [currentPrefix, currentSuffix] = getPrefix( currentValue ) + if ( currentPrefix === prefix && (currentSuffix === suffix || hash(currentSuffix) === hash(suffix)) @@ -194,34 +191,74 @@ export class Index { this.#inner.set(key, [value, newMultiplicity]) } return + } + + // They are different values - decide between ValueMap or PrefixMap + if (currentPrefix === NO_PREFIX && prefix === NO_PREFIX) { + // Both values have NO_PREFIX, use ValueMap directly + const valueMap = new ValueMap() + valueMap.set(hash(currentSuffix), mapOrSingleValue) + valueMap.set(hash(suffix), valueTuple) + this.#inner.set(key, valueMap) + return } else { - // They are different values, so we need to move the current value to a - // new prefix map - const newPrefixMap = new Map< - TPrefix | NO_PREFIX, - SingleValue | ValueMap - >() + // At least one has a prefix, use PrefixMap + const newPrefixMap = new PrefixMap() this.#inner.set(key, newPrefixMap) if (currentPrefix === prefix) { // They have the same prefix but different suffixes, so we need to add a // value map for this suffix to the prefix map - const valueMap = new Map() - valueMap.set(hash(currentSuffix), currentSingleValueForKey) + const valueMap = new ValueMap() + valueMap.set(hash(currentSuffix), mapOrSingleValue) valueMap.set(hash(suffix), valueTuple) newPrefixMap.set(currentPrefix, valueMap) } else { - // They have different prefixes, so we can add then as singe values to the + // They have different prefixes, so we can add then as single values to the // prefix map - newPrefixMap.set(currentPrefix, currentSingleValueForKey) + newPrefixMap.set(currentPrefix, mapOrSingleValue) newPrefixMap.set(prefix, valueTuple) } return } } - // At this point there is a prefix map for this key, we need the value map or - // single value for this prefix + // At this point we have either a ValueMap or PrefixMap + const [prefix, suffix] = getPrefix(value) + + if (mapOrSingleValue instanceof ValueMap) { + // Direct ValueMap - all values have NO_PREFIX + if (prefix !== NO_PREFIX) { + // This value has a prefix but existing values don't - need to convert to PrefixMap + const newPrefixMap = new PrefixMap() + newPrefixMap.set(NO_PREFIX, mapOrSingleValue) + newPrefixMap.set(prefix, valueTuple) + this.#inner.set(key, newPrefixMap) + return + } + + // Both existing and new values have NO_PREFIX, add to ValueMap + const suffixHash = hash(suffix) + const currentValueForSuffix = mapOrSingleValue.get(suffixHash) + if (currentValueForSuffix) { + const [, currentMultiplicity] = currentValueForSuffix + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + mapOrSingleValue.delete(suffixHash) + if (mapOrSingleValue.size === 0) { + this.#inner.delete(key) + } + } else { + mapOrSingleValue.set(suffixHash, [value, newMultiplicity]) + } + } else { + mapOrSingleValue.set(suffixHash, valueTuple) + } + return + } + + // PrefixMap case + const prefixMap = mapOrSingleValue const valueMapOrSingleValue = prefixMap.get(prefix) const [valueMap, currentSingleValueForPrefix] = isSingleValue( @@ -243,6 +280,9 @@ export class Index { const newMultiplicity = currentMultiplicity + multiplicity if (newMultiplicity === 0) { prefixMap.delete(prefix) + if (prefixMap.size === 0) { + this.#inner.delete(key) + } } else { prefixMap.set(prefix, [value, newMultiplicity]) } @@ -250,7 +290,7 @@ export class Index { } else { // They have different suffixes, so we need to add a value map for this suffix // to the prefix map - const valueMap = new Map() + const valueMap = new ValueMap() valueMap.set(hash(currentSuffix), currentSingleValueForPrefix) valueMap.set(hash(suffix), valueTuple) prefixMap.set(prefix, valueMap) @@ -276,6 +316,12 @@ export class Index { const newMultiplicity = currentMultiplicity + multiplicity if (newMultiplicity === 0) { valueMap.delete(suffixHash) + if (valueMap.size === 0) { + prefixMap.delete(prefix) + if (prefixMap.size === 0) { + this.#inner.delete(key) + } + } } else { valueMap.set(suffixHash, [value, newMultiplicity]) } From c0674b13ba42a8e7911257423a1f77e53611e0a0 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Thu, 18 Sep 2025 14:39:13 +0100 Subject: [PATCH 05/10] refactor --- packages/db-ivm/src/indexes.ts | 358 ++++++++++++++++++--------------- 1 file changed, 197 insertions(+), 161 deletions(-) diff --git a/packages/db-ivm/src/indexes.ts b/packages/db-ivm/src/indexes.ts index 5415186a4..3c52614eb 100644 --- a/packages/db-ivm/src/indexes.ts +++ b/packages/db-ivm/src/indexes.ts @@ -1,3 +1,38 @@ +/** + * # Optimized Index Data Structure + * + * Multi-level index that adapts storage strategy based on data patterns to minimize memory + * usage, eliminate wasteful lookups, and avoid hashing whenever possible. + * + * ## Storage Strategy + * + * **Single value**: `IndexMap['key'] → [value, multiplicity]` (no hashing needed) + * + * **Multiple unprefixed values**: Direct ValueMap (avoids NO_PREFIX lookup) + * ``` + * IndexMap['key'] → ValueMap { hash(value1) → [value1, mult1], ... } + * ``` + * + * **Values with prefixes**: PrefixMap uses prefix keys directly (no hashing) + * ``` + * IndexMap['key'] → PrefixMap { 'prefix1' → [value1, mult1], NO_PREFIX → ValueMap{...} } + * ``` + * + * **Multiple values per prefix**: ValueMap within PrefixMap (hash only suffixes) + * ``` + * PrefixMap['prefix'] → ValueMap { hash(suffix1) → [full_value1, mult1], ... } + * ``` + * + * ## Dynamic Evolution + * + * Structure automatically evolves as data is added: + * - Single → ValueMap (when both values unprefixed) + * - Single → PrefixMap (when at least one prefixed) + * - ValueMap → PrefixMap (adding prefixed value to unprefixed) + * + * Prefixes extracted from array values: `['prefix', 'suffix']` → prefix='prefix' + */ + import { MultiSet } from "./multiset.js" import { hash } from "./hashing/index.js" import type { Hash } from "./hashing/index.js" @@ -20,10 +55,83 @@ type IndexMap = Map< class PrefixMap extends Map< TPrefix | NO_PREFIX, SingleValue | ValueMap -> {} +> { + /** + * Add a value to the PrefixMap. Returns true if the map becomes empty after the operation. + */ + addValue(value: TValue, multiplicity: number): boolean { + if (multiplicity === 0) return this.size === 0 + + const prefix = getPrefix(value) + const valueMapOrSingleValue = this.get(prefix) + + if (isSingleValue(valueMapOrSingleValue)) { + const [currentValue, currentMultiplicity] = valueMapOrSingleValue + const currentPrefix = getPrefix(currentValue) + + if (currentPrefix !== prefix) { + throw new Error(`Mismatching prefixes, this should never happen`) + } + + if (currentValue === value || hash(currentValue) === hash(value)) { + // Same value, update multiplicity + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + this.delete(prefix) + } else { + this.set(prefix, [value, newMultiplicity]) + } + } else { + // Different suffixes, need to create ValueMap + const valueMap = new ValueMap() + valueMap.set(hash(currentValue), valueMapOrSingleValue) + valueMap.set(hash(value), [value, multiplicity]) + this.set(prefix, valueMap) + } + } else if (valueMapOrSingleValue === undefined) { + // No existing value for this prefix + this.set(prefix, [value, multiplicity]) + } else { + // Existing ValueMap + const isEmpty = valueMapOrSingleValue.addValue(value, multiplicity) + if (isEmpty) { + this.delete(prefix) + } + } + + return this.size === 0 + } +} // Third level map type for the index, stores single values or value maps against a hash. -class ValueMap extends Map {} +class ValueMap extends Map { + /** + * Add a value to the ValueMap. Returns true if the map becomes empty after the operation. + * @param value - The full value to store + * @param multiplicity - The multiplicity to add + * @param hashKey - Optional hash key to use instead of hashing the full value (used when in PrefixMap context) + */ + addValue(value: TValue, multiplicity: number): boolean { + if (multiplicity === 0) return this.size === 0 + + const key = hash(value) + const currentValue = this.get(key) + + if (currentValue) { + const [, currentMultiplicity] = currentValue + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + this.delete(key) + } else { + this.set(key, [value, newMultiplicity]) + } + } else { + this.set(key, [value, multiplicity]) + } + + return this.size === 0 + } +} /** * A map from a difference collection trace's keys -> (value, multiplicities) that changed. @@ -152,181 +260,111 @@ export class Index { const mapOrSingleValue = this.#inner.get(key) if (mapOrSingleValue === undefined) { - // This is the first time we see a value for this key we just insert it - // into the index as a single value tuple + // First value for this key this.#inner.set(key, valueTuple) return } if (isSingleValue(mapOrSingleValue)) { - const [currentValue, currentMultiplicity] = mapOrSingleValue - // We have a single value for this key, lets check if this is the same value - // and if so we just update the multiplicity. This is a check if its the same - // literal value or object reference. - if (currentValue === value) { - const newMultiplicity = currentMultiplicity + multiplicity - if (newMultiplicity === 0) { - this.#inner.delete(key) - } else { - this.#inner.set(key, [value, newMultiplicity]) - } - return - } - - // Get the prefix of both values - const [prefix, suffix] = getPrefix(value) - const [currentPrefix, currentSuffix] = getPrefix( - currentValue + // Handle transition from single value to map + this.#handleSingleValueTransition( + key, + mapOrSingleValue, + value, + multiplicity ) + return + } - if ( - currentPrefix === prefix && - (currentSuffix === suffix || hash(currentSuffix) === hash(suffix)) - ) { - // They are the same value, so we just update the multiplicity - const newMultiplicity = currentMultiplicity + multiplicity - if (newMultiplicity === 0) { + if (mapOrSingleValue instanceof ValueMap) { + // Handle existing ValueMap + const prefix = getPrefix(value) + if (prefix !== NO_PREFIX) { + // Convert ValueMap to PrefixMap since we have a prefixed value + const prefixMap = new PrefixMap() + prefixMap.set(NO_PREFIX, mapOrSingleValue) + prefixMap.set(prefix, valueTuple) + this.#inner.set(key, prefixMap) + } else { + // Add to existing ValueMap + const isEmpty = mapOrSingleValue.addValue(value, multiplicity) + if (isEmpty) { this.#inner.delete(key) - } else { - this.#inner.set(key, [value, newMultiplicity]) } - return } - - // They are different values - decide between ValueMap or PrefixMap - if (currentPrefix === NO_PREFIX && prefix === NO_PREFIX) { - // Both values have NO_PREFIX, use ValueMap directly - const valueMap = new ValueMap() - valueMap.set(hash(currentSuffix), mapOrSingleValue) - valueMap.set(hash(suffix), valueTuple) - this.#inner.set(key, valueMap) - return - } else { - // At least one has a prefix, use PrefixMap - const newPrefixMap = new PrefixMap() - this.#inner.set(key, newPrefixMap) - - if (currentPrefix === prefix) { - // They have the same prefix but different suffixes, so we need to add a - // value map for this suffix to the prefix map - const valueMap = new ValueMap() - valueMap.set(hash(currentSuffix), mapOrSingleValue) - valueMap.set(hash(suffix), valueTuple) - newPrefixMap.set(currentPrefix, valueMap) - } else { - // They have different prefixes, so we can add then as single values to the - // prefix map - newPrefixMap.set(currentPrefix, mapOrSingleValue) - newPrefixMap.set(prefix, valueTuple) - } - return + } else { + // Handle existing PrefixMap + const isEmpty = mapOrSingleValue.addValue(value, multiplicity) + if (isEmpty) { + this.#inner.delete(key) } } + } - // At this point we have either a ValueMap or PrefixMap - const [prefix, suffix] = getPrefix(value) - - if (mapOrSingleValue instanceof ValueMap) { - // Direct ValueMap - all values have NO_PREFIX - if (prefix !== NO_PREFIX) { - // This value has a prefix but existing values don't - need to convert to PrefixMap - const newPrefixMap = new PrefixMap() - newPrefixMap.set(NO_PREFIX, mapOrSingleValue) - newPrefixMap.set(prefix, valueTuple) - this.#inner.set(key, newPrefixMap) - return - } + /** + * Handle the transition from a single value to either a ValueMap or PrefixMap + */ + #handleSingleValueTransition( + key: TKey, + currentSingleValue: SingleValue, + newValue: TValue, + multiplicity: number + ) { + const [currentValue, currentMultiplicity] = currentSingleValue - // Both existing and new values have NO_PREFIX, add to ValueMap - const suffixHash = hash(suffix) - const currentValueForSuffix = mapOrSingleValue.get(suffixHash) - if (currentValueForSuffix) { - const [, currentMultiplicity] = currentValueForSuffix - const newMultiplicity = currentMultiplicity + multiplicity - if (newMultiplicity === 0) { - mapOrSingleValue.delete(suffixHash) - if (mapOrSingleValue.size === 0) { - this.#inner.delete(key) - } - } else { - mapOrSingleValue.set(suffixHash, [value, newMultiplicity]) - } + // Check for exact same value (reference equality) + if (currentValue === newValue) { + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + this.#inner.delete(key) } else { - mapOrSingleValue.set(suffixHash, valueTuple) + this.#inner.set(key, [newValue, newMultiplicity]) } return } - // PrefixMap case - const prefixMap = mapOrSingleValue - const valueMapOrSingleValue = prefixMap.get(prefix) + // Get prefixes for both values + const newPrefix = getPrefix(newValue) + const currentPrefix = getPrefix(currentValue) - const [valueMap, currentSingleValueForPrefix] = isSingleValue( - valueMapOrSingleValue - ) - ? [undefined, valueMapOrSingleValue] - : [valueMapOrSingleValue, undefined] - - if (currentSingleValueForPrefix) { - const [currentValue, currentMultiplicity] = currentSingleValueForPrefix - const [currentPrefix, currentSuffix] = getPrefix( - currentValue - ) - if (currentPrefix !== prefix) { - throw new Error(`Mismatching prefixes, this should never happen`) - } - if (currentSuffix === suffix || hash(currentSuffix) === hash(suffix)) { - // They are the same value, so we just update the multiplicity - const newMultiplicity = currentMultiplicity + multiplicity - if (newMultiplicity === 0) { - prefixMap.delete(prefix) - if (prefixMap.size === 0) { - this.#inner.delete(key) - } - } else { - prefixMap.set(prefix, [value, newMultiplicity]) - } - return + // Check if they're the same value by prefix/suffix comparison + if ( + currentPrefix === newPrefix && + (currentValue === newValue || hash(currentValue) === hash(newValue)) + ) { + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + this.#inner.delete(key) } else { - // They have different suffixes, so we need to add a value map for this suffix - // to the prefix map - const valueMap = new ValueMap() - valueMap.set(hash(currentSuffix), currentSingleValueForPrefix) - valueMap.set(hash(suffix), valueTuple) - prefixMap.set(prefix, valueMap) - return + this.#inner.set(key, [newValue, newMultiplicity]) } - } - - // At this point there was no single value for the prefix, there *may* be - // a value map for this prefix. If there is not, we can just add the new value - // as a single value to the prefix map - if (!valueMap) { - prefixMap.set(prefix, valueTuple) return } - // We now know there is a value map for this prefix, we need see if there is a - // current value for the suffix. If there is, we update the multiplicity, otherwise - // we add the new value as a single value to the value map - const suffixHash = hash(suffix) - const currentValueForSuffix = valueMap.get(suffixHash) - if (currentValueForSuffix) { - const [, currentMultiplicity] = currentValueForSuffix - const newMultiplicity = currentMultiplicity + multiplicity - if (newMultiplicity === 0) { - valueMap.delete(suffixHash) - if (valueMap.size === 0) { - prefixMap.delete(prefix) - if (prefixMap.size === 0) { - this.#inner.delete(key) - } - } + // Different values - choose appropriate map type + if (currentPrefix === NO_PREFIX && newPrefix === NO_PREFIX) { + // Both have NO_PREFIX, use ValueMap directly + const valueMap = new ValueMap() + valueMap.set(hash(currentValue), currentSingleValue) + valueMap.set(hash(newValue), [newValue, multiplicity]) + this.#inner.set(key, valueMap) + } else { + // At least one has a prefix, use PrefixMap + const prefixMap = new PrefixMap() + + if (currentPrefix === newPrefix) { + // Same prefix, different suffixes - need ValueMap within PrefixMap + const valueMap = new ValueMap() + valueMap.set(hash(currentValue), currentSingleValue) + valueMap.set(hash(newValue), [newValue, multiplicity]) + prefixMap.set(currentPrefix, valueMap) } else { - valueMap.set(suffixHash, [value, newMultiplicity]) + // Different prefixes - store as separate single values + prefixMap.set(currentPrefix, currentSingleValue) + prefixMap.set(newPrefix, [newValue, multiplicity]) } - } else { - valueMap.set(suffixHash, valueTuple) + + this.#inner.set(key, prefixMap) } } @@ -386,21 +424,19 @@ export class Index { * @param value - The value to extract the prefix from. * @returns The prefix and the suffix. */ -function getPrefix( - value: TValue -): [TPrefix | NO_PREFIX, TValue] { - // If the value is an array of two elements and the first element is a string - // or number, then the first element is the prefix. This is used to distinguish - // between values without the need for hashing unless there are multiple values - // for the same prefix. +function getPrefix(value: TValue): TPrefix | NO_PREFIX { + // If the value is an array and the first element is a string or number, then the + // first element is the prefix. This is used to distinguish between values without + // the need for hashing unless there are multiple values for the same prefix. if ( Array.isArray(value) && - value.length === 2 && - (typeof value[0] === `string` || typeof value[0] === `number`) + (typeof value[0] === `string` || + typeof value[0] === `number` || + typeof value[0] === `bigint`) ) { - return [value[0] as TPrefix, value[1] as TValue] + return value[0] as TPrefix } - return [NO_PREFIX, value] + return NO_PREFIX } /** From 9de8956af6fc6dc24a56a2b235ead09fb1d74924 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Fri, 19 Sep 2025 11:51:32 +0100 Subject: [PATCH 06/10] first refactor --- packages/db-ivm/src/operators/join.ts | 306 +++++++++++++----------- packages/db/src/query/compiler/joins.ts | 2 - 2 files changed, 167 insertions(+), 141 deletions(-) diff --git a/packages/db-ivm/src/operators/join.ts b/packages/db-ivm/src/operators/join.ts index 259cfbc05..a8d77b5e9 100644 --- a/packages/db-ivm/src/operators/join.ts +++ b/packages/db-ivm/src/operators/join.ts @@ -2,9 +2,6 @@ import { BinaryOperator, DifferenceStreamWriter } from "../graph.js" import { StreamBuilder } from "../d2.js" import { MultiSet } from "../multiset.js" import { Index } from "../indexes.js" -import { negate } from "./negate.js" -import { map } from "./map.js" -import { concat } from "./concat.js" import type { DifferenceStreamReader } from "../graph.js" import type { IStreamBuilder, KeyValue, PipedOperator } from "../types.js" @@ -14,66 +11,174 @@ import type { IStreamBuilder, KeyValue, PipedOperator } from "../types.js" export type JoinType = `inner` | `left` | `right` | `full` | `anti` /** - * Operator that joins two input streams + * Helper to build delta index and mass map from messages + */ +function buildDelta(messages: Array): [Index, Map] { + const delta = new Index() + const deltaMass = new Map() + + for (const message of messages) { + const multiSetMessage = message as unknown as MultiSet<[K, V]> + for (const [item, multiplicity] of multiSetMessage.getInner()) { + const [key, value] = item + delta.addValue(key, [value, multiplicity]) + deltaMass.set(key, (deltaMass.get(key) || 0) + multiplicity) + } + } + + return [delta, deltaMass] +} + +/** + * Check if a key has presence (non-zero mass) + */ +function hasPresence(mass: Map, key: K): boolean { + return (mass.get(key) || 0) !== 0 +} + +/** + * Operator that joins two input streams using direct join algorithms */ export class JoinOperator extends BinaryOperator< - [K, V1] | [K, V2] | [K, [V1, V2]] + [K, V1] | [K, V2] | [K, [V1, V2]] | [K, [V1 | null, V2 | null]] > { #indexA = new Index() #indexB = new Index() + #massA = new Map() // sum of multiplicities per key on side A + #massB = new Map() // sum of multiplicities per key on side B + #mode: JoinType constructor( id: number, inputA: DifferenceStreamReader<[K, V1]>, inputB: DifferenceStreamReader<[K, V2]>, - output: DifferenceStreamWriter<[K, [V1, V2]]> + output: DifferenceStreamWriter, + mode: JoinType = 'inner' ) { super(id, inputA, inputB, output) + this.#mode = mode } run(): void { - const deltaA = new Index() - const deltaB = new Index() + const start = performance.now() + // 1) Ingest messages and build deltas (no state mutation yet) + const [deltaA, deltaMassA] = buildDelta(this.inputAMessages()) + const [deltaB, deltaMassB] = buildDelta(this.inputBMessages()) - // Process input A - process ALL messages, not just the first one - const messagesA = this.inputAMessages() - for (const message of messagesA) { - const multiSetMessage = message as unknown as MultiSet<[K, V1]> - for (const [item, multiplicity] of multiSetMessage.getInner()) { - const [key, value] = item - deltaA.addValue(key, [value, multiplicity]) - } + const results = new MultiSet() + + // 2) INNER part (used by inner/left/right/full, but NOT anti) + if (this.#mode === 'inner' || this.#mode === 'left' || this.#mode === 'right' || this.#mode === 'full') { + // Emit deltaA ⋈ indexB + results.extend(deltaA.join(this.#indexB)) + + // Create logical indexA ⊎ deltaA and join with deltaB + const tempIndexA = new Index() + tempIndexA.append(this.#indexA) + tempIndexA.append(deltaA) + results.extend(tempIndexA.join(deltaB)) } - // Process input B - process ALL messages, not just the first one - const messagesB = this.inputBMessages() - for (const message of messagesB) { - const multiSetMessage = message as unknown as MultiSet<[K, V2]> - for (const [item, multiplicity] of multiSetMessage.getInner()) { - const [key, value] = item - deltaB.addValue(key, [value, multiplicity]) + // 3) OUTER/ANTI specifics + + // LEFT side nulls or anti-left (depend only on B's presence) + if (this.#mode === 'left' || this.#mode === 'full' || this.#mode === 'anti') { + // 3a) New/deleted left rows that are currently unmatched + // For initial state, check final presence after applying deltaB + for (const [key, valueIterator] of deltaA.entriesIterators()) { + const finalMassB = (this.#massB.get(key) || 0) + (deltaMassB.get(key) || 0) + if (finalMassB === 0) { + for (const [value, multiplicity] of valueIterator) { + if (multiplicity !== 0) { + results.extend([[[key, [value, null]], multiplicity]]) + } + } + } + } + + // 3b) Right-side presence transitions flip match status for *current* left rows + for (const key of deltaMassB.keys()) { + const wasEmpty = !hasPresence(this.#massB, key) + const currentMass = this.#massB.get(key) || 0 + const deltaMass = deltaMassB.get(key) || 0 + const willEmpty = (currentMass + deltaMass) === 0 + + if (wasEmpty && !willEmpty) { + // B: 0 -> >0 — retract previously unmatched left-at-k + for (const [value, multiplicity] of this.#indexA.getIterator(key)) { + if (multiplicity !== 0) { + results.extend([[[key, [value, null]], -multiplicity]]) + } + } + } else if (!wasEmpty && willEmpty) { + // B: >0 -> 0 — emit left-at-k as unmatched + for (const [value, multiplicity] of this.#indexA.getIterator(key)) { + if (multiplicity !== 0) { + results.extend([[[key, [value, null]], multiplicity]]) + } + } + } } } - // Process results - const results = new MultiSet<[K, [V1, V2]]>() + // RIGHT side nulls (depend only on A's presence) + if (this.#mode === 'right' || this.#mode === 'full') { + // 3a) New/deleted right rows that are currently unmatched + // For initial state, check final presence after applying deltaA + for (const [key, valueIterator] of deltaB.entriesIterators()) { + const finalMassA = (this.#massA.get(key) || 0) + (deltaMassA.get(key) || 0) + if (finalMassA === 0) { + for (const [value, multiplicity] of valueIterator) { + if (multiplicity !== 0) { + results.extend([[[key, [null, value]], multiplicity]]) + } + } + } + } - // Join deltaA with existing indexB - results.extend(deltaA.join(this.#indexB)) + // 3b) Left-side presence transitions flip match status for *current* right rows + for (const key of deltaMassA.keys()) { + const wasEmpty = !hasPresence(this.#massA, key) + const currentMass = this.#massA.get(key) || 0 + const deltaMass = deltaMassA.get(key) || 0 + const willEmpty = (currentMass + deltaMass) === 0 - // Append deltaA to indexA - this.#indexA.append(deltaA) + if (wasEmpty && !willEmpty) { + // A: 0 -> >0 — retract previously unmatched right-at-k + for (const [value, multiplicity] of this.#indexB.getIterator(key)) { + if (multiplicity !== 0) { + results.extend([[[key, [null, value]], -multiplicity]]) + } + } + } else if (!wasEmpty && willEmpty) { + // A: >0 -> 0 — emit right-at-k as unmatched + for (const [value, multiplicity] of this.#indexB.getIterator(key)) { + if (multiplicity !== 0) { + results.extend([[[key, [null, value]], multiplicity]]) + } + } + } + } + } - // Join existing indexA with deltaB - results.extend(this.#indexA.join(deltaB)) + // 4) Commit — update state + this.#indexA.append(deltaA) + this.#indexB.append(deltaB) + + // Update masses + for (const [key, deltaMass] of deltaMassA) { + this.#massA.set(key, (this.#massA.get(key) || 0) + deltaMass) + } + for (const [key, deltaMass] of deltaMassB) { + this.#massB.set(key, (this.#massB.get(key) || 0) + deltaMass) + } // Send results if (results.getInner().length > 0) { this.output.sendData(results) } - - // Append deltaB to indexB - this.#indexB.append(deltaB) + const end = performance.now() + console.log(`join took ${end - start}ms`) } } @@ -91,62 +196,20 @@ export function join< other: IStreamBuilder>, type: JoinType = `inner` ): PipedOperator> { - switch (type) { - case `inner`: - return innerJoin(other) as unknown as PipedOperator< - T, - KeyValue - > - case `anti`: - return antiJoin(other) as unknown as PipedOperator< - T, - KeyValue - > - case `left`: - return leftJoin(other) as unknown as PipedOperator< - T, - KeyValue - > - case `right`: - return rightJoin(other) as unknown as PipedOperator< - T, - KeyValue - > - case `full`: - return fullJoin(other) as unknown as PipedOperator< - T, - KeyValue - > - default: - throw new Error(`Join type ${type} is invalid`) - } -} - -/** - * Joins two input streams - * @param other - The other stream to join with - */ -export function innerJoin< - K, - V1 extends T extends KeyValue ? VT : never, - V2, - T, ->( - other: IStreamBuilder> -): PipedOperator> { - return (stream: IStreamBuilder): IStreamBuilder> => { + return (stream: IStreamBuilder): IStreamBuilder> => { if (stream.graph !== other.graph) { throw new Error(`Cannot join streams from different graphs`) } - const output = new StreamBuilder>( + const output = new StreamBuilder>( stream.graph, - new DifferenceStreamWriter>() + new DifferenceStreamWriter>() ) const operator = new JoinOperator( stream.graph.getNextOperatorId(), stream.connectReader() as DifferenceStreamReader>, other.connectReader(), - output.writer + output.writer, + type ) stream.graph.addOperator(operator) stream.graph.addStream(output.connectReader()) @@ -155,7 +218,22 @@ export function innerJoin< } /** - * Joins two input streams + * Joins two input streams (inner join) + * @param other - The other stream to join with + */ +export function innerJoin< + K, + V1 extends T extends KeyValue ? VT : never, + V2, + T, +>( + other: IStreamBuilder> +): PipedOperator> { + return join(other, 'inner') as unknown as PipedOperator> +} + +/** + * Joins two input streams (anti join) * @param other - The other stream to join with */ export function antiJoin< @@ -166,24 +244,11 @@ export function antiJoin< >( other: IStreamBuilder> ): PipedOperator> { - return ( - stream: IStreamBuilder - ): IStreamBuilder> => { - const matchedLeft = stream.pipe( - innerJoin(other), - map(([key, [valueLeft, _valueRight]]) => [key, valueLeft]) - ) - const anti = stream.pipe( - concat(matchedLeft.pipe(negate())), - // @ts-ignore TODO: fix this - map(([key, value]) => [key, [value, null]]) - ) - return anti as IStreamBuilder> - } + return join(other, 'anti') as unknown as PipedOperator> } /** - * Joins two input streams + * Joins two input streams (left join) * @param other - The other stream to join with */ export function leftJoin< @@ -194,21 +259,11 @@ export function leftJoin< >( other: IStreamBuilder> ): PipedOperator> { - return ( - stream: IStreamBuilder - ): IStreamBuilder> => { - const left = stream - const right = other - const inner = left.pipe(innerJoin(right)) - const anti = left.pipe(antiJoin(right)) - return inner.pipe(concat(anti)) as IStreamBuilder< - KeyValue - > - } + return join(other, 'left') as unknown as PipedOperator> } /** - * Joins two input streams + * Joins two input streams (right join) * @param other - The other stream to join with */ export function rightJoin< @@ -219,24 +274,11 @@ export function rightJoin< >( other: IStreamBuilder> ): PipedOperator> { - return ( - stream: IStreamBuilder - ): IStreamBuilder> => { - const left = stream as IStreamBuilder> - const right = other - const inner = left.pipe(innerJoin(right)) - const anti = right.pipe( - antiJoin(left), - map(([key, [a, b]]) => [key, [b, a]]) - ) - return inner.pipe(concat(anti)) as IStreamBuilder< - KeyValue - > - } + return join(other, 'right') as unknown as PipedOperator> } /** - * Joins two input streams + * Joins two input streams (full join) * @param other - The other stream to join with */ export function fullJoin< @@ -247,19 +289,5 @@ export function fullJoin< >( other: IStreamBuilder> ): PipedOperator> { - return ( - stream: IStreamBuilder - ): IStreamBuilder> => { - const left = stream as IStreamBuilder> - const right = other - const inner = left.pipe(innerJoin(right)) - const antiLeft = left.pipe(antiJoin(right)) - const antiRight = right.pipe( - antiJoin(left), - map(([key, [a, b]]) => [key, [b, a]]) - ) - return inner.pipe(concat(antiLeft), concat(antiRight)) as IStreamBuilder< - KeyValue - > - } + return join(other, 'full') as unknown as PipedOperator> } diff --git a/packages/db/src/query/compiler/joins.ts b/packages/db/src/query/compiler/joins.ts index 961937ecd..38ae04993 100644 --- a/packages/db/src/query/compiler/joins.ts +++ b/packages/db/src/query/compiler/joins.ts @@ -1,5 +1,4 @@ import { - consolidate, filter, join as joinOperator, map, @@ -296,7 +295,6 @@ function processJoin( return mainPipeline.pipe( joinOperator(joinedPipeline, joinClause.type as JoinType), - consolidate(), processJoinResults(joinClause.type) ) } From 709e9ac89641fb18251da42df0ab266bfb9e69ef Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Fri, 19 Sep 2025 12:00:31 +0100 Subject: [PATCH 07/10] second pass --- packages/db-ivm/src/multiset.ts | 6 ++ packages/db-ivm/src/operators/join.ts | 132 +++++++++++++++----------- 2 files changed, 83 insertions(+), 55 deletions(-) diff --git a/packages/db-ivm/src/multiset.ts b/packages/db-ivm/src/multiset.ts index 1e793345e..44ba297ed 100644 --- a/packages/db-ivm/src/multiset.ts +++ b/packages/db-ivm/src/multiset.ts @@ -209,6 +209,12 @@ export class MultiSet { chunkedArrayPush(this.#inner, otherArray) } + add(item: T, multiplicity: number): void { + if (multiplicity !== 0) { + this.#inner.push([item, multiplicity]) + } + } + getInner(): MultiSetArray { return this.#inner } diff --git a/packages/db-ivm/src/operators/join.ts b/packages/db-ivm/src/operators/join.ts index a8d77b5e9..6982e1fec 100644 --- a/packages/db-ivm/src/operators/join.ts +++ b/packages/db-ivm/src/operators/join.ts @@ -13,27 +13,22 @@ export type JoinType = `inner` | `left` | `right` | `full` | `anti` /** * Helper to build delta index and mass map from messages */ -function buildDelta(messages: Array): [Index, Map] { +function buildDelta( + messages: Array +): [Index, Map] { const delta = new Index() const deltaMass = new Map() - + for (const message of messages) { - const multiSetMessage = message as unknown as MultiSet<[K, V]> + const multiSetMessage = message as MultiSet<[K, V]> for (const [item, multiplicity] of multiSetMessage.getInner()) { const [key, value] = item delta.addValue(key, [value, multiplicity]) deltaMass.set(key, (deltaMass.get(key) || 0) + multiplicity) } } - - return [delta, deltaMass] -} -/** - * Check if a key has presence (non-zero mass) - */ -function hasPresence(mass: Map, key: K): boolean { - return (mass.get(key) || 0) !== 0 + return [delta, deltaMass] } /** @@ -53,14 +48,13 @@ export class JoinOperator extends BinaryOperator< inputA: DifferenceStreamReader<[K, V1]>, inputB: DifferenceStreamReader<[K, V2]>, output: DifferenceStreamWriter, - mode: JoinType = 'inner' + mode: JoinType = `inner` ) { super(id, inputA, inputB, output) this.#mode = mode } run(): void { - const start = performance.now() // 1) Ingest messages and build deltas (no state mutation yet) const [deltaA, deltaMassA] = buildDelta(this.inputAMessages()) const [deltaB, deltaMassB] = buildDelta(this.inputBMessages()) @@ -68,53 +62,61 @@ export class JoinOperator extends BinaryOperator< const results = new MultiSet() // 2) INNER part (used by inner/left/right/full, but NOT anti) - if (this.#mode === 'inner' || this.#mode === 'left' || this.#mode === 'right' || this.#mode === 'full') { - // Emit deltaA ⋈ indexB + if ( + this.#mode === `inner` || + this.#mode === `left` || + this.#mode === `right` || + this.#mode === `full` + ) { + // Emit the three standard delta terms: ΔA⋈B_old, A_old⋈ΔB, ΔA⋈ΔB + // This avoids copying the entire left index each tick results.extend(deltaA.join(this.#indexB)) - - // Create logical indexA ⊎ deltaA and join with deltaB - const tempIndexA = new Index() - tempIndexA.append(this.#indexA) - tempIndexA.append(deltaA) - results.extend(tempIndexA.join(deltaB)) + results.extend(this.#indexA.join(deltaB)) + results.extend(deltaA.join(deltaB)) } // 3) OUTER/ANTI specifics // LEFT side nulls or anti-left (depend only on B's presence) - if (this.#mode === 'left' || this.#mode === 'full' || this.#mode === 'anti') { - // 3a) New/deleted left rows that are currently unmatched + if ( + this.#mode === `left` || + this.#mode === `full` || + this.#mode === `anti` + ) { + // 3a) New/deleted left rows that are currently unmatched // For initial state, check final presence after applying deltaB for (const [key, valueIterator] of deltaA.entriesIterators()) { - const finalMassB = (this.#massB.get(key) || 0) + (deltaMassB.get(key) || 0) + const finalMassB = + (this.#massB.get(key) || 0) + (deltaMassB.get(key) || 0) if (finalMassB === 0) { for (const [value, multiplicity] of valueIterator) { if (multiplicity !== 0) { - results.extend([[[key, [value, null]], multiplicity]]) + results.add([key, [value, null]], multiplicity) } } } } // 3b) Right-side presence transitions flip match status for *current* left rows - for (const key of deltaMassB.keys()) { - const wasEmpty = !hasPresence(this.#massB, key) - const currentMass = this.#massB.get(key) || 0 - const deltaMass = deltaMassB.get(key) || 0 - const willEmpty = (currentMass + deltaMass) === 0 + for (const [key, deltaMass] of deltaMassB) { + const before = this.#massB.get(key) || 0 + const after = before + deltaMass + + // Skip if presence doesn't flip (0->0, >0->different>0) + if ((before === 0) === (after === 0)) continue - if (wasEmpty && !willEmpty) { + if (before === 0 && after !== 0) { // B: 0 -> >0 — retract previously unmatched left-at-k for (const [value, multiplicity] of this.#indexA.getIterator(key)) { if (multiplicity !== 0) { - results.extend([[[key, [value, null]], -multiplicity]]) + results.add([key, [value, null]], -multiplicity) } } - } else if (!wasEmpty && willEmpty) { + } else if (before !== 0 && after === 0) { // B: >0 -> 0 — emit left-at-k as unmatched for (const [value, multiplicity] of this.#indexA.getIterator(key)) { if (multiplicity !== 0) { - results.extend([[[key, [value, null]], multiplicity]]) + results.add([key, [value, null]], multiplicity) } } } @@ -122,39 +124,41 @@ export class JoinOperator extends BinaryOperator< } // RIGHT side nulls (depend only on A's presence) - if (this.#mode === 'right' || this.#mode === 'full') { + if (this.#mode === `right` || this.#mode === `full`) { // 3a) New/deleted right rows that are currently unmatched // For initial state, check final presence after applying deltaA for (const [key, valueIterator] of deltaB.entriesIterators()) { - const finalMassA = (this.#massA.get(key) || 0) + (deltaMassA.get(key) || 0) + const finalMassA = + (this.#massA.get(key) || 0) + (deltaMassA.get(key) || 0) if (finalMassA === 0) { for (const [value, multiplicity] of valueIterator) { if (multiplicity !== 0) { - results.extend([[[key, [null, value]], multiplicity]]) + results.add([key, [null, value]], multiplicity) } } } } // 3b) Left-side presence transitions flip match status for *current* right rows - for (const key of deltaMassA.keys()) { - const wasEmpty = !hasPresence(this.#massA, key) - const currentMass = this.#massA.get(key) || 0 - const deltaMass = deltaMassA.get(key) || 0 - const willEmpty = (currentMass + deltaMass) === 0 + for (const [key, deltaMass] of deltaMassA) { + const before = this.#massA.get(key) || 0 + const after = before + deltaMass - if (wasEmpty && !willEmpty) { + // Skip if presence doesn't flip (0->0, >0->different>0) + if ((before === 0) === (after === 0)) continue + + if (before === 0 && after !== 0) { // A: 0 -> >0 — retract previously unmatched right-at-k for (const [value, multiplicity] of this.#indexB.getIterator(key)) { if (multiplicity !== 0) { - results.extend([[[key, [null, value]], -multiplicity]]) + results.add([key, [null, value]], -multiplicity) } } - } else if (!wasEmpty && willEmpty) { + } else if (before !== 0 && after === 0) { // A: >0 -> 0 — emit right-at-k as unmatched for (const [value, multiplicity] of this.#indexB.getIterator(key)) { if (multiplicity !== 0) { - results.extend([[[key, [null, value]], multiplicity]]) + results.add([key, [null, value]], multiplicity) } } } @@ -162,9 +166,12 @@ export class JoinOperator extends BinaryOperator< } // 4) Commit — update state + // IMPORTANT: All emissions use pre-append snapshots of indexA/indexB. + // For unmatched-on-delta (3a), use final presence (mass + deltaMass) to avoid churn. + // Append deltas and update masses only after all emissions. this.#indexA.append(deltaA) this.#indexB.append(deltaB) - + // Update masses for (const [key, deltaMass] of deltaMassA) { this.#massA.set(key, (this.#massA.get(key) || 0) + deltaMass) @@ -177,8 +184,6 @@ export class JoinOperator extends BinaryOperator< if (results.getInner().length > 0) { this.output.sendData(results) } - const end = performance.now() - console.log(`join took ${end - start}ms`) } } @@ -196,7 +201,9 @@ export function join< other: IStreamBuilder>, type: JoinType = `inner` ): PipedOperator> { - return (stream: IStreamBuilder): IStreamBuilder> => { + return ( + stream: IStreamBuilder + ): IStreamBuilder> => { if (stream.graph !== other.graph) { throw new Error(`Cannot join streams from different graphs`) } @@ -229,7 +236,10 @@ export function innerJoin< >( other: IStreamBuilder> ): PipedOperator> { - return join(other, 'inner') as unknown as PipedOperator> + return join(other, `inner`) as unknown as PipedOperator< + T, + KeyValue + > } /** @@ -244,7 +254,10 @@ export function antiJoin< >( other: IStreamBuilder> ): PipedOperator> { - return join(other, 'anti') as unknown as PipedOperator> + return join(other, `anti`) as unknown as PipedOperator< + T, + KeyValue + > } /** @@ -259,7 +272,10 @@ export function leftJoin< >( other: IStreamBuilder> ): PipedOperator> { - return join(other, 'left') as unknown as PipedOperator> + return join(other, `left`) as unknown as PipedOperator< + T, + KeyValue + > } /** @@ -274,7 +290,10 @@ export function rightJoin< >( other: IStreamBuilder> ): PipedOperator> { - return join(other, 'right') as unknown as PipedOperator> + return join(other, `right`) as unknown as PipedOperator< + T, + KeyValue + > } /** @@ -289,5 +308,8 @@ export function fullJoin< >( other: IStreamBuilder> ): PipedOperator> { - return join(other, 'full') as unknown as PipedOperator> + return join(other, `full`) as unknown as PipedOperator< + T, + KeyValue + > } From 8c97863178a8380f41c86ed57f5d4df92966918d Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Fri, 19 Sep 2025 12:50:20 +0100 Subject: [PATCH 08/10] futher optimisations --- packages/db-ivm/src/operators/join.ts | 219 +++++++++++++++++--------- 1 file changed, 147 insertions(+), 72 deletions(-) diff --git a/packages/db-ivm/src/operators/join.ts b/packages/db-ivm/src/operators/join.ts index 6982e1fec..3fe4ad041 100644 --- a/packages/db-ivm/src/operators/join.ts +++ b/packages/db-ivm/src/operators/join.ts @@ -1,3 +1,53 @@ +/** + * # Direct Join Algorithms for Incremental View Maintenance + * + * High-performance join operations implementing all join types (inner, left, right, full, anti) + * with minimal state and optimized performance. + * + * ## Algorithm + * + * For each tick, the algorithm processes incoming changes (deltas) and emits join results: + * + * 1. **Build deltas**: Extract new/changed/deleted rows from input messages + * 2. **Inner results**: Emit `ΔA⋈B_old + A_old⋈ΔB + ΔA⋈ΔB` (matched pairs) + * 3. **Outer results**: For unmatched rows, emit null-extended tuples: + * - New unmatched rows from deltas (when opposite side empty) + * - Presence transitions: when key goes `0→>0` (retract nulls) or `>0→0` (emit nulls) + * 4. **Update state**: Append deltas to indexes and update mass counters + * + * **Mass tracking** enables O(1) presence checks instead of scanning index buckets. + * + * ## State + * + * **Indexes** store the actual data: + * - `indexA: Index` - all left-side rows accumulated over time + * - `indexB: Index` - all right-side rows accumulated over time + * + * **Mass maps** track presence efficiently: + * - `massA/massB: Map` - sum of multiplicities per key + * - Used for O(1) presence checks: `mass.get(key) !== 0` means key exists + * - Avoids scanning entire index buckets just to check if key has any rows + * + * ## Join Types + * + * - **Inner**: Standard delta terms only + * - **Outer**: Inner results + null-extended unmatched rows with transition handling + * - **Anti**: Unmatched rows only (no inner results) + * + * ## Key Optimizations + * + * - **No temp copying**: Uses `(A⊎ΔA)⋈ΔB = A⋈ΔB ⊎ ΔA⋈ΔB` distributive property + * - **Early-out checks**: Skip phases when no deltas present + * - **Zero-entry pruning**: Keep maps compact, O(distinct keys) memory + * - **Final presence logic**: Avoid emit→retract churn within same tick + * + * ## Correctness + * + * - **Ordering**: Pre-append snapshots for emissions, post-emit state updates + * - **Presence**: Key matched iff mass ≠ 0, transitions trigger null handling + * - **Bag semantics**: Proper multiplicity handling including negatives + */ + import { BinaryOperator, DifferenceStreamWriter } from "../graph.js" import { StreamBuilder } from "../d2.js" import { MultiSet } from "../multiset.js" @@ -24,7 +74,14 @@ function buildDelta( for (const [item, multiplicity] of multiSetMessage.getInner()) { const [key, value] = item delta.addValue(key, [value, multiplicity]) - deltaMass.set(key, (deltaMass.get(key) || 0) + multiplicity) + + // Keep deltaMass small by deleting zero entries + const next = (deltaMass.get(key) || 0) + multiplicity + if (next === 0) { + deltaMass.delete(key) + } else { + deltaMass.set(key, next) + } } } @@ -59,64 +116,72 @@ export class JoinOperator extends BinaryOperator< const [deltaA, deltaMassA] = buildDelta(this.inputAMessages()) const [deltaB, deltaMassB] = buildDelta(this.inputBMessages()) + // Early-out checks + const hasDeltaA = deltaA.size > 0 + const hasDeltaB = deltaB.size > 0 + const hasDeltaMassA = deltaMassA.size > 0 + const hasDeltaMassB = deltaMassB.size > 0 + + // If nothing happened, bail early + if (!(hasDeltaA || hasDeltaB || hasDeltaMassA || hasDeltaMassB)) return + + // Precompute mode flags to avoid repeated string comparisons + const mode = this.#mode + const emitInner = + mode === `inner` || mode === `left` || mode === `right` || mode === `full` + const emitLeftNulls = mode === `left` || mode === `full` + const emitRightNulls = mode === `right` || mode === `full` + const emitAntiLeft = mode === `anti` + const results = new MultiSet() // 2) INNER part (used by inner/left/right/full, but NOT anti) - if ( - this.#mode === `inner` || - this.#mode === `left` || - this.#mode === `right` || - this.#mode === `full` - ) { - // Emit the three standard delta terms: ΔA⋈B_old, A_old⋈ΔB, ΔA⋈ΔB + if (emitInner && (hasDeltaA || hasDeltaB)) { + // Emit the three standard delta terms: DeltaA⋈B_old, A_old⋈DeltaB, DeltaA⋈DeltaB // This avoids copying the entire left index each tick - results.extend(deltaA.join(this.#indexB)) - results.extend(this.#indexA.join(deltaB)) - results.extend(deltaA.join(deltaB)) + if (hasDeltaA) results.extend(deltaA.join(this.#indexB)) + if (hasDeltaB) results.extend(this.#indexA.join(deltaB)) + if (hasDeltaA && hasDeltaB) results.extend(deltaA.join(deltaB)) } // 3) OUTER/ANTI specifics // LEFT side nulls or anti-left (depend only on B's presence) - if ( - this.#mode === `left` || - this.#mode === `full` || - this.#mode === `anti` - ) { - // 3a) New/deleted left rows that are currently unmatched - // For initial state, check final presence after applying deltaB - for (const [key, valueIterator] of deltaA.entriesIterators()) { - const finalMassB = - (this.#massB.get(key) || 0) + (deltaMassB.get(key) || 0) - if (finalMassB === 0) { - for (const [value, multiplicity] of valueIterator) { - if (multiplicity !== 0) { - results.add([key, [value, null]], multiplicity) + if ((emitLeftNulls || emitAntiLeft) && (hasDeltaA || hasDeltaMassB)) { + // 3a) New/deleted left rows that are currently unmatched (only if DeltaA changed) + if (hasDeltaA) { + // For initial state, check final presence after applying deltaB + for (const [key, valueIterator] of deltaA.entriesIterators()) { + const finalMassB = + (this.#massB.get(key) || 0) + (deltaMassB.get(key) || 0) + if (finalMassB === 0) { + for (const [value, multiplicity] of valueIterator) { + if (multiplicity !== 0) { + results.add([key, [value, null]], multiplicity) + } } } } } - // 3b) Right-side presence transitions flip match status for *current* left rows - for (const [key, deltaMass] of deltaMassB) { - const before = this.#massB.get(key) || 0 - const after = before + deltaMass + // 3b) Right-side presence transitions (only if some RHS masses changed) + if (hasDeltaMassB) { + for (const [key, deltaMass] of deltaMassB) { + const before = this.#massB.get(key) || 0 + if (deltaMass === 0) continue + const after = before + deltaMass - // Skip if presence doesn't flip (0->0, >0->different>0) - if ((before === 0) === (after === 0)) continue + // Skip if presence doesn't flip (0->0, >0->different>0) + if ((before === 0) === (after === 0)) continue - if (before === 0 && after !== 0) { - // B: 0 -> >0 — retract previously unmatched left-at-k - for (const [value, multiplicity] of this.#indexA.getIterator(key)) { + const it = this.#indexA.getIterator(key) + const retract = before === 0 // 0->!0 => retract, else (>0->0) emit + for (const [value, multiplicity] of it) { if (multiplicity !== 0) { - results.add([key, [value, null]], -multiplicity) - } - } - } else if (before !== 0 && after === 0) { - // B: >0 -> 0 — emit left-at-k as unmatched - for (const [value, multiplicity] of this.#indexA.getIterator(key)) { - if (multiplicity !== 0) { - results.add([key, [value, null]], multiplicity) + results.add( + [key, [value, null]], + retract ? -multiplicity : +multiplicity + ) } } } @@ -124,41 +189,41 @@ export class JoinOperator extends BinaryOperator< } // RIGHT side nulls (depend only on A's presence) - if (this.#mode === `right` || this.#mode === `full`) { - // 3a) New/deleted right rows that are currently unmatched - // For initial state, check final presence after applying deltaA - for (const [key, valueIterator] of deltaB.entriesIterators()) { - const finalMassA = - (this.#massA.get(key) || 0) + (deltaMassA.get(key) || 0) - if (finalMassA === 0) { - for (const [value, multiplicity] of valueIterator) { - if (multiplicity !== 0) { - results.add([key, [null, value]], multiplicity) + if (emitRightNulls && (hasDeltaB || hasDeltaMassA)) { + // 3a) New/deleted right rows that are currently unmatched (only if DeltaB changed) + if (hasDeltaB) { + // For initial state, check final presence after applying deltaA + for (const [key, valueIterator] of deltaB.entriesIterators()) { + const finalMassA = + (this.#massA.get(key) || 0) + (deltaMassA.get(key) || 0) + if (finalMassA === 0) { + for (const [value, multiplicity] of valueIterator) { + if (multiplicity !== 0) { + results.add([key, [null, value]], multiplicity) + } } } } } - // 3b) Left-side presence transitions flip match status for *current* right rows - for (const [key, deltaMass] of deltaMassA) { - const before = this.#massA.get(key) || 0 - const after = before + deltaMass + // 3b) Left-side presence transitions (only if some LHS masses changed) + if (hasDeltaMassA) { + for (const [key, deltaMass] of deltaMassA) { + const before = this.#massA.get(key) || 0 + if (deltaMass === 0) continue + const after = before + deltaMass - // Skip if presence doesn't flip (0->0, >0->different>0) - if ((before === 0) === (after === 0)) continue + // Skip if presence doesn't flip (0->0, >0->different>0) + if ((before === 0) === (after === 0)) continue - if (before === 0 && after !== 0) { - // A: 0 -> >0 — retract previously unmatched right-at-k - for (const [value, multiplicity] of this.#indexB.getIterator(key)) { + const it = this.#indexB.getIterator(key) + const retract = before === 0 // 0->!0 => retract, else (>0->0) emit + for (const [value, multiplicity] of it) { if (multiplicity !== 0) { - results.add([key, [null, value]], -multiplicity) - } - } - } else if (before !== 0 && after === 0) { - // A: >0 -> 0 — emit right-at-k as unmatched - for (const [value, multiplicity] of this.#indexB.getIterator(key)) { - if (multiplicity !== 0) { - results.add([key, [null, value]], multiplicity) + results.add( + [key, [null, value]], + retract ? -multiplicity : +multiplicity + ) } } } @@ -172,12 +237,22 @@ export class JoinOperator extends BinaryOperator< this.#indexA.append(deltaA) this.#indexB.append(deltaB) - // Update masses + // Update masses and keep maps small by deleting zero entries for (const [key, deltaMass] of deltaMassA) { - this.#massA.set(key, (this.#massA.get(key) || 0) + deltaMass) + const next = (this.#massA.get(key) || 0) + deltaMass + if (next === 0) { + this.#massA.delete(key) + } else { + this.#massA.set(key, next) + } } for (const [key, deltaMass] of deltaMassB) { - this.#massB.set(key, (this.#massB.get(key) || 0) + deltaMass) + const next = (this.#massB.get(key) || 0) + deltaMass + if (next === 0) { + this.#massB.delete(key) + } else { + this.#massB.set(key, next) + } } // Send results From 6c5879fe47fc15c3bb7531c05a09b1b3a0d9ec34 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Fri, 19 Sep 2025 12:53:06 +0100 Subject: [PATCH 09/10] changeset --- .changeset/fast-joins-redesign.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/fast-joins-redesign.md diff --git a/.changeset/fast-joins-redesign.md b/.changeset/fast-joins-redesign.md new file mode 100644 index 000000000..ef9e20e35 --- /dev/null +++ b/.changeset/fast-joins-redesign.md @@ -0,0 +1,5 @@ +--- +"@tanstack/db-ivm": patch +--- + +Redesign of the join operators with direct algorithms for major performance improvements by replacing composition-based joins (inner+anti) with implementation using mass tracking. Delivers significant performance gains while maintaining full correctness for all join types (inner, left, right, full, anti). From d7d8b4893f097f14e8817b2fe9491a6593174270 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Fri, 19 Sep 2025 13:31:42 +0100 Subject: [PATCH 10/10] format --- packages/db/src/query/compiler/joins.ts | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/packages/db/src/query/compiler/joins.ts b/packages/db/src/query/compiler/joins.ts index 38ae04993..8fc3ed2da 100644 --- a/packages/db/src/query/compiler/joins.ts +++ b/packages/db/src/query/compiler/joins.ts @@ -1,9 +1,4 @@ -import { - filter, - join as joinOperator, - map, - tap, -} from "@tanstack/db-ivm" +import { filter, join as joinOperator, map, tap } from "@tanstack/db-ivm" import { CollectionInputNotFoundError, InvalidJoinCondition,