diff --git a/.changeset/fix-left-join-where-filter.md b/.changeset/fix-left-join-where-filter.md new file mode 100644 index 000000000..1a03f0797 --- /dev/null +++ b/.changeset/fix-left-join-where-filter.md @@ -0,0 +1,7 @@ +--- +'@tanstack/db': patch +--- + +fix(db): don't push WHERE clauses to nullable side of outer joins + +The query optimizer incorrectly pushed single-source WHERE clauses into subqueries and collection index optimization for the nullable side of outer joins. This pre-filtered the data before the join, converting rows that should have been excluded by the WHERE into unmatched outer-join rows that incorrectly survived the residual filter. diff --git a/packages/db/src/query/optimizer.ts b/packages/db/src/query/optimizer.ts index e1020c284..e962f296f 100644 --- a/packages/db/src/query/optimizer.ts +++ b/packages/db/src/query/optimizer.ts @@ -219,8 +219,11 @@ export function optimizeQuery(query: QueryIR): OptimizationResult { /** * Extracts collection-specific WHERE clauses from a query for index optimization. - * This analyzes the original query to identify WHERE clauses that can be pushed down - * to specific collections, but only for simple queries without joins. + * This analyzes the original query to identify single-source WHERE clauses that + * reference collection sources (not subqueries), including joined collections. + * + * For outer joins, clauses referencing the nullable side are excluded because + * using them to pre-filter collection data would change join semantics. * * @param query - The original QueryIR to analyze * @returns Map of source aliases to their WHERE clauses @@ -246,10 +249,19 @@ function extractSourceWhereClauses( // Group clauses by single-source vs multi-source const groupedClauses = groupWhereClauses(analyzedClauses) + // Determine which source aliases are on the nullable side of outer joins. + // WHERE clauses for these sources must not be used for index optimization + // because they should filter the final joined result, not the input data. + const nullableSources = getNullableJoinSources(query) + // Only include single-source clauses that reference collections directly + // and are not on the nullable side of an outer join for (const [sourceAlias, whereClause] of groupedClauses.singleSource) { // Check if this source alias corresponds to a collection reference - if (isCollectionReference(query, sourceAlias)) { + if ( + isCollectionReference(query, sourceAlias) && + !nullableSources.has(sourceAlias) + ) { sourceWhereClauses.set(sourceAlias, whereClause) } } @@ -283,6 +295,36 @@ function isCollectionReference(query: QueryIR, sourceAlias: string): boolean { return false } +/** + * Returns the set of source aliases that are on the nullable side of outer joins. + * + * For a LEFT join the joined (right) side is nullable. + * For a RIGHT join the main (left/from) side is nullable. + * For a FULL join both sides are nullable. + * + * WHERE clauses that reference only a nullable source must not be pushed down + * into that source's subquery or used for index optimization, because doing so + * changes the join semantics: rows that should be excluded by the WHERE become + * unmatched outer-join rows (with the nullable side set to undefined) and + * incorrectly survive residual filtering. + */ +function getNullableJoinSources(query: QueryIR): Set { + const nullable = new Set() + if (query.join) { + const mainAlias = query.from.alias + for (const join of query.join) { + const joinedAlias = join.from.alias + if (join.type === `left` || join.type === `full`) { + nullable.add(joinedAlias) + } + if (join.type === `right` || join.type === `full`) { + nullable.add(mainAlias) + } + } + } + return nullable +} + /** * Applies recursive predicate pushdown optimization. * @@ -635,10 +677,25 @@ function applyOptimizations( // Track which single-source clauses were actually optimized const actuallyOptimized = new Set() + // Determine which source aliases are on the nullable side of outer joins. + const nullableSources = getNullableJoinSources(query) + + // Build a filtered copy of singleSource that excludes nullable-side clauses. + // Pushing a WHERE clause into the nullable side's subquery pre-filters the + // data before the join, converting "matched but WHERE-excluded" rows into + // "unmatched" outer-join rows. These are indistinguishable from genuinely + // unmatched rows, so the residual WHERE cannot correct the result. + const pushableSingleSource = new Map>() + for (const [source, clause] of groupedClauses.singleSource) { + if (!nullableSources.has(source)) { + pushableSingleSource.set(source, clause) + } + } + // Optimize the main FROM clause and track what was optimized const optimizedFrom = optimizeFromWithTracking( query.from, - groupedClauses.singleSource, + pushableSingleSource, actuallyOptimized, ) @@ -648,7 +705,7 @@ function applyOptimizations( ...joinClause, from: optimizeFromWithTracking( joinClause.from, - groupedClauses.singleSource, + pushableSingleSource, actuallyOptimized, ), })) @@ -663,12 +720,7 @@ function applyOptimizations( } // Determine if we need residual clauses (when query has outer JOINs) - const hasOuterJoins = - query.join && - query.join.some( - (join) => - join.type === `left` || join.type === `right` || join.type === `full`, - ) + const hasOuterJoins = nullableSources.size > 0 // Add single-source clauses for (const [source, clause] of groupedClauses.singleSource) { diff --git a/packages/db/tests/query/indexes.test.ts b/packages/db/tests/query/indexes.test.ts index 9b221443c..a826861aa 100644 --- a/packages/db/tests/query/indexes.test.ts +++ b/packages/db/tests/query/indexes.test.ts @@ -679,13 +679,14 @@ describe(`Query Index Optimization`, () => { ], } - // Should use index optimization for both WHERE clauses - // since they each touch only a single source and both sources are indexed + // The WHERE clause on the non-nullable (left) side uses its index. + // The WHERE clause on the nullable (right) side of the LEFT JOIN is NOT + // pushed down to avoid changing join semantics, so the right side does a full scan. expectIndexUsage(combinedStats, { shouldUseIndex: true, - shouldUseFullScan: false, - indexCallCount: 2, // Both item.status='active' and other.status='active' can use indexes - fullScanCallCount: 0, + shouldUseFullScan: true, + indexCallCount: 1, // Only item.status='active' uses index (non-nullable side) + fullScanCallCount: 1, // other collection does full scan (nullable side) }) } finally { tracker1.restore() @@ -1177,21 +1178,18 @@ describe(`Query Index Optimization`, () => { { id: `1`, name: `Alice`, otherName: `Other Active Item` }, ]) - // We should have done a full scan of the right collection + // The right collection does a full scan (no index on status) expect(tracker2.stats.queriesExecuted).toEqual([ { type: `fullScan`, }, ]) - // We should have done an index lookup on the 1st collection to find active items + // In a RIGHT join, the left (from) side is nullable. The WHERE clause + // eq(item.status, 'active') is NOT pushed down to avoid changing join + // semantics, so the left collection does NOT do an index lookup for status. + // It only does the index lookup for the join key (id) used by lazy loading. expect(tracker1.stats.queriesExecuted).toEqual([ - { - field: `status`, - operation: `eq`, - type: `index`, - value: `active`, - }, { type: `index`, operation: `in`, @@ -1281,14 +1279,12 @@ describe(`Query Index Optimization`, () => { }, ]) - // We should have done an index lookup on the left collection to find active items - // because it has an index on the join key + // In a RIGHT join, the left (from) side is nullable. The WHERE clause + // eq(item.status, 'active') is NOT pushed down to avoid changing join + // semantics, so the left collection does a full scan. expect(tracker1.stats.queriesExecuted).toEqual([ { - type: `index`, - operation: `eq`, - field: `status`, - value: `active`, + type: `fullScan`, }, ]) } finally { diff --git a/packages/db/tests/query/join.test.ts b/packages/db/tests/query/join.test.ts index d0918564e..441be59bc 100644 --- a/packages/db/tests/query/join.test.ts +++ b/packages/db/tests/query/join.test.ts @@ -1840,6 +1840,66 @@ function createJoinTests(autoIndex: `off` | `eager`): void { expect(result.join2!.value).toBe(1) expect(result.join2!.other).toBe(30) }) + + test(`where with isUndefined on right-side field should filter entire joined rows`, () => { + type Left = { + id: string + rightId: string | null + } + + type Right = { + id: string + payload: string | null | undefined + } + + const leftCollection = createCollection( + mockSyncCollectionOptions({ + id: `test-left-isundefined-field`, + getKey: (item) => item.id, + initialData: [ + { id: `l1`, rightId: `r1` }, + { id: `l2`, rightId: `r2` }, + { id: `l3`, rightId: `r3` }, + { id: `l4`, rightId: null }, + ], + autoIndex, + }), + ) + + const rightCollection = createCollection( + mockSyncCollectionOptions({ + id: `test-right-isundefined-field`, + getKey: (item) => item.id, + initialData: [ + { id: `r1`, payload: `ok` }, + { id: `r2`, payload: null }, + { id: `r3`, payload: undefined }, + ], + autoIndex, + }), + ) + + const lq = createLiveQueryCollection({ + startSync: true, + query: (q) => + q + .from({ l: leftCollection }) + .leftJoin({ r: rightCollection }, ({ l, r }) => eq(l.rightId, r.id)) + .where(({ r }) => isUndefined(r?.payload)) + .select(({ l, r }) => ({ leftId: l.id, right: r })), + }) + + const data = lq.toArray + + // l1 joins r1 (payload='ok') → payload is defined → exclude + // l2 joins r2 (payload=null) → payload is null, not undefined → exclude + // l3 joins r3 (payload=undefined) → payload is undefined → include + // l4 has no match (rightId=null) → right is undefined, so r?.payload is undefined → include + expect(data.sort((a, b) => a.leftId.localeCompare(b.leftId))).toEqual([ + { leftId: `l3`, right: { id: `r3`, payload: undefined } }, + { leftId: `l4`, right: undefined }, + ]) + }) } describe(`Query JOIN Operations`, () => { diff --git a/packages/db/tests/query/optimizer.test.ts b/packages/db/tests/query/optimizer.test.ts index 77ff8cbed..0952ac2ba 100644 --- a/packages/db/tests/query/optimizer.test.ts +++ b/packages/db/tests/query/optimizer.test.ts @@ -1649,35 +1649,15 @@ describe(`Query Optimizer`, () => { const { optimizedQuery } = optimizeQuery(query) // The WHERE clause should remain in the main query to preserve LEFT JOIN semantics - // It should NOT be completely moved to the subquery + // It must NOT be pushed down to the nullable (right) side because that would + // pre-filter the right data, converting excluded matches into unmatched rows expect(optimizedQuery.where).toHaveLength(1) - expect(optimizedQuery.where![0]).toEqual({ - expression: createEq( - createPropRef(`teamMember`, `user_id`), - createValue(100), - ), - residual: true, - }) + expect(optimizedQuery.where![0]).toEqual( + createEq(createPropRef(`teamMember`, `user_id`), createValue(100)), + ) - // If the optimizer creates a subquery for teamMember, the WHERE clause should also be copied there - // but a residual copy must remain in the main query - if ( - optimizedQuery.join && - optimizedQuery.join[0]?.from.type === `queryRef` - ) { - const teamMemberSubquery = optimizedQuery.join[0].from.query - // The subquery may have the WHERE clause for optimization - if (teamMemberSubquery.where && teamMemberSubquery.where.length > 0) { - // But the main query MUST still have it to preserve semantics - expect(optimizedQuery.where).toContainEqual({ - expression: createEq( - createPropRef(`teamMember`, `user_id`), - createValue(100), - ), - residual: true, - }) - } - } + // The optimizer should NOT create a subquery for the nullable side + expect(optimizedQuery.join![0]!.from.type).toBe(`collectionRef`) }) test(`should preserve WHERE clause semantics when pushing down to RIGHT JOIN`, () => { @@ -1715,32 +1695,14 @@ describe(`Query Optimizer`, () => { const { optimizedQuery } = optimizeQuery(query) // The WHERE clause should remain in the main query to preserve RIGHT JOIN semantics - // It should NOT be completely moved to the subquery + // It must NOT be pushed down to the nullable (left/from) side expect(optimizedQuery.where).toHaveLength(1) - expect(optimizedQuery.where![0]).toEqual({ - expression: createEq( - createPropRef(`user`, `department_id`), - createValue(1), - ), - residual: true, - }) + expect(optimizedQuery.where![0]).toEqual( + createEq(createPropRef(`user`, `department_id`), createValue(1)), + ) - // If the optimizer creates a subquery for users, the WHERE clause should also be copied there - // but a residual copy must remain in the main query - if (optimizedQuery.from.type === `queryRef`) { - const userSubquery = optimizedQuery.from.query - // The subquery may have the WHERE clause for optimization - if (userSubquery.where && userSubquery.where.length > 0) { - // But the main query MUST still have it to preserve semantics - expect(optimizedQuery.where).toContainEqual({ - expression: createEq( - createPropRef(`user`, `department_id`), - createValue(1), - ), - residual: true, - }) - } - } + // The optimizer should NOT create a subquery for the nullable side (from) + expect(optimizedQuery.from.type).toBe(`collectionRef`) }) test(`should preserve WHERE clause semantics when pushing down to FULL JOIN`, () => { @@ -1778,35 +1740,15 @@ describe(`Query Optimizer`, () => { const { optimizedQuery } = optimizeQuery(query) // The WHERE clause should remain in the main query to preserve FULL JOIN semantics - // It should NOT be completely moved to the subquery + // It must NOT be pushed down to any nullable side expect(optimizedQuery.where).toHaveLength(1) - expect(optimizedQuery.where![0]).toEqual({ - expression: createGt( - createPropRef(`payment`, `amount`), - createValue(100), - ), - residual: true, - }) + expect(optimizedQuery.where![0]).toEqual( + createGt(createPropRef(`payment`, `amount`), createValue(100)), + ) - // If the optimizer creates a subquery for payments, the WHERE clause should also be copied there - // but a residual copy must remain in the main query - if ( - optimizedQuery.join && - optimizedQuery.join[0]?.from.type === `queryRef` - ) { - const paymentSubquery = optimizedQuery.join[0].from.query - // The subquery may have the WHERE clause for optimization - if (paymentSubquery.where && paymentSubquery.where.length > 0) { - // But the main query MUST still have it to preserve semantics - expect(optimizedQuery.where).toContainEqual({ - expression: createGt( - createPropRef(`payment`, `amount`), - createValue(100), - ), - residual: true, - }) - } - } + // The optimizer should NOT create subqueries for nullable sides + expect(optimizedQuery.from.type).toBe(`collectionRef`) + expect(optimizedQuery.join![0]!.from.type).toBe(`collectionRef`) }) test(`should allow WHERE clause pushdown for INNER JOIN (semantics preserved)`, () => {