Skip to content

Commit 9d311ee

Browse files
viirya and claude committed
fix: Only use null-aware anti join for NOT IN, not NOT EXISTS
The previous implementation incorrectly applied null-aware semantics to ALL LeftAnti joins, including NOT EXISTS subqueries. This was wrong because:

- **NOT IN**: Uses three-valued logic (TRUE/FALSE/UNKNOWN), requires null-aware
- **NOT EXISTS**: Uses two-valued logic (TRUE/FALSE), should NOT be null-aware

```sql
-- Setup: customers has (1, 2, 3, NULL), banned has (2, NULL)

-- NOT IN - Correctly returns empty (null-aware)
SELECT * FROM customers WHERE id NOT IN (SELECT id FROM banned);
-- Result: Empty (correct - with a NULL in the subquery no row can evaluate to TRUE:
--         id = 2 is FALSE and every other row is UNKNOWN)

-- NOT EXISTS - Was incorrectly returning empty (bug)
SELECT * FROM customers c WHERE NOT EXISTS (SELECT 1 FROM banned b WHERE c.id = b.id);
-- Expected: (1, 3, NULL) - c.id = b.id is never TRUE for these rows (a comparison with
--           NULL, including NULL = NULL, yields UNKNOWN, which WHERE rejects), so no matches
-- Actual (buggy): Empty - incorrectly using null-aware semantics
```

In `decorrelate_predicate_subquery.rs`, line 424:

```rust
let null_aware = matches!(join_type, JoinType::LeftAnti);
```

This set `null_aware=true` for ALL LeftAnti joins, but it should only be true for NOT IN (InSubquery), not NOT EXISTS (Exists).
The `SubqueryInfo` struct already distinguishes between them:

- **NOT IN**: Created with `new_with_in_expr()` → `in_predicate_opt` is `Some(...)`
- **NOT EXISTS**: Created with `new()` → `in_predicate_opt` is `None`

Fixed by checking both conditions:

```rust
let null_aware = matches!(join_type, JoinType::LeftAnti)
    && in_predicate_opt.is_some(); // Only NOT IN, not NOT EXISTS
```

**File**: `datafusion/optimizer/src/decorrelate_predicate_subquery.rs`

- Updated null_aware detection to only apply to NOT IN (lines 420-426)
- Added comprehensive comments explaining the distinction
- Check `in_predicate_opt.is_some()` to distinguish NOT IN from NOT EXISTS

**File**: `datafusion/sqllogictest/test_files/null_aware_anti_join.slt`

Added 5 new test scenarios (Tests 14-18):

**Test 14**: Direct comparison of NOT IN vs NOT EXISTS with NULLs

- NOT IN with NULL → empty result (null-aware)
- NOT EXISTS with NULL → returns non-matching rows (NOT null-aware)
- EXPLAIN verification

**Test 15**: NOT EXISTS with no NULLs

**Test 16**: NOT EXISTS with correlated subquery

**Test 17**: NOT EXISTS with all-NULL subquery

- Shows that NOT EXISTS returns all rows (NULL = NULL evaluates to UNKNOWN, never TRUE, so nothing matches)
- Compares with NOT IN which correctly returns empty

**Test 18**: Nested NOT EXISTS and NOT IN

- Verifies correct interaction between the two

```bash
cargo test -p datafusion-sqllogictest --test sqllogictests -- null_aware_anti_join
cargo test -p datafusion-sqllogictest --test sqllogictests subquery.slt
cargo test -p datafusion-optimizer --lib
cargo test -p datafusion-physical-plan --lib hash_join
```

This fix ensures DataFusion correctly implements SQL semantics:

- NOT IN subqueries now correctly use null-aware anti join (three-valued logic)
- NOT EXISTS subqueries now correctly use regular anti join (two-valued logic)

Users can now reliably use both NOT IN and NOT EXISTS with confidence that NULL handling follows SQL standards.
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 8d2f08f commit 9d311ee

File tree

2 files changed

+159
-4
lines changed

2 files changed

+159
-4
lines changed

datafusion/optimizer/src/decorrelate_predicate_subquery.rs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -418,10 +418,12 @@ fn build_join(
418418
}
419419

420420
// Determine if this should be a null-aware anti join
421-
// For LeftAnti joins (NOT IN), we need null-aware semantics if:
422-
// 1. The join type is LeftAnti
423-
// 2. The join predicate involves nullable columns (conservative: assume nullable)
424-
let null_aware = matches!(join_type, JoinType::LeftAnti);
421+
// Null-aware semantics are only needed for NOT IN subqueries, not NOT EXISTS:
422+
// - NOT IN: Uses three-valued logic, requires null-aware handling
423+
// - NOT EXISTS: Uses two-valued logic, regular anti join is correct
424+
// We can distinguish them: NOT IN has in_predicate_opt, NOT EXISTS does not
425+
let null_aware = matches!(join_type, JoinType::LeftAnti)
426+
&& in_predicate_opt.is_some();
425427

426428
// join our sub query into the main plan
427429
let new_plan = if null_aware {

datafusion/sqllogictest/test_files/null_aware_anti_join.slt

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,141 @@ SELECT * FROM outer_table
229229
WHERE id NOT IN (SELECT DISTINCT id FROM duplicates_with_null);
230230
----
231231

232+
#############
233+
## Test 14: NOT EXISTS vs NOT IN - Demonstrating the difference
234+
#############
235+
236+
# NOT EXISTS should NOT use null-aware semantics
237+
# It uses two-valued logic (TRUE/FALSE), not three-valued logic (TRUE/FALSE/UNKNOWN)
238+
239+
# Setup tables for comparison
240+
statement ok
241+
CREATE TABLE customers(id INT, name TEXT) AS VALUES
242+
(1, 'Alice'),
243+
(2, 'Bob'),
244+
(3, 'Charlie'),
245+
(NULL, 'Dave');
246+
247+
statement ok
248+
CREATE TABLE banned(id INT) AS VALUES
249+
(2),
250+
(NULL);
251+
252+
# Test 14a: NOT IN with NULL in subquery - Returns EMPTY (null-aware)
253+
query IT rowsort
254+
SELECT * FROM customers WHERE id NOT IN (SELECT id FROM banned);
255+
----
256+
257+
# Test 14b: NOT EXISTS with NULL in subquery - Returns rows (NOT null-aware)
258+
# This should return (1, 'Alice'), (3, 'Charlie'), (NULL, 'Dave')
259+
# Because EXISTS only yields TRUE or FALSE: NULL = NULL evaluates to NULL (UNKNOWN), which the subquery's WHERE rejects, so no match is found
260+
query IT rowsort
261+
SELECT * FROM customers c
262+
WHERE NOT EXISTS (SELECT 1 FROM banned b WHERE c.id = b.id);
263+
----
264+
1 Alice
265+
3 Charlie
266+
NULL Dave
267+
268+
# Test 14c: Verify with EXPLAIN that NOT EXISTS does not use a null-aware anti join
269+
query TT
270+
EXPLAIN SELECT * FROM customers c
271+
WHERE NOT EXISTS (SELECT 1 FROM banned b WHERE c.id = b.id);
272+
----
273+
logical_plan
274+
01)LeftAnti Join: c.id = __correlated_sq_1.id
275+
02)--SubqueryAlias: c
276+
03)----TableScan: customers projection=[id, name]
277+
04)--SubqueryAlias: __correlated_sq_1
278+
05)----SubqueryAlias: b
279+
06)------TableScan: banned projection=[id]
280+
physical_plan
281+
01)HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(id@0, id@0)]
282+
02)--DataSourceExec: partitions=1, partition_sizes=[1]
283+
03)--DataSourceExec: partitions=1, partition_sizes=[1]
284+
285+
#############
286+
## Test 15: NOT EXISTS - No NULLs
287+
#############
288+
289+
statement ok
290+
CREATE TABLE active_customers(id INT) AS VALUES (1), (3);
291+
292+
# Should return only Bob (id=2) and Dave (id=NULL)
293+
query IT rowsort
294+
SELECT * FROM customers c
295+
WHERE NOT EXISTS (SELECT 1 FROM active_customers a WHERE c.id = a.id);
296+
----
297+
2 Bob
298+
NULL Dave
299+
300+
#############
301+
## Test 16: NOT EXISTS - Correlated subquery
302+
#############
303+
304+
statement ok
305+
CREATE TABLE orders_test(order_id INT, customer_id INT) AS VALUES
306+
(1, 100),
307+
(2, 200),
308+
(3, NULL);
309+
310+
statement ok
311+
CREATE TABLE customers_test(customer_id INT, name TEXT) AS VALUES
312+
(100, 'Alice'),
313+
(200, 'Bob'),
314+
(300, 'Charlie'),
315+
(NULL, 'Unknown');
316+
317+
# Find customers with no orders
318+
# Should return Charlie (300) and Unknown (NULL)
319+
query IT rowsort
320+
SELECT * FROM customers_test c
321+
WHERE NOT EXISTS (
322+
SELECT 1 FROM orders_test o WHERE o.customer_id = c.customer_id
323+
);
324+
----
325+
300 Charlie
326+
NULL Unknown
327+
328+
#############
329+
## Test 17: NOT EXISTS with all NULL subquery
330+
#############
331+
332+
statement ok
333+
CREATE TABLE all_null_banned(id INT) AS VALUES (NULL), (NULL);
334+
335+
# NOT EXISTS should return all rows because c.id = NULL evaluates to NULL (UNKNOWN), never TRUE, so there are no matches
336+
query IT rowsort
337+
SELECT * FROM customers c
338+
WHERE NOT EXISTS (SELECT 1 FROM all_null_banned b WHERE c.id = b.id);
339+
----
340+
1 Alice
341+
2 Bob
342+
3 Charlie
343+
NULL Dave
344+
345+
# Compare with NOT IN which returns empty
346+
query IT rowsort
347+
SELECT * FROM customers WHERE id NOT IN (SELECT id FROM all_null_banned);
348+
----
349+
350+
#############
351+
## Test 18: Nested NOT EXISTS and NOT IN
352+
#############
353+
354+
# NOT EXISTS outside, NOT IN inside: only Bob (id=2) is excluded, because banned row 2 matches and 2 NOT IN (1, 3) is TRUE
355+
query IT rowsort
356+
SELECT * FROM customers c
357+
WHERE NOT EXISTS (
358+
SELECT 1 FROM banned b
359+
WHERE c.id = b.id
360+
AND b.id NOT IN (SELECT id FROM active_customers)
361+
);
362+
----
363+
1 Alice
364+
3 Charlie
365+
NULL Dave
366+
232367
#############
233368
## Cleanup
234369
#############
@@ -259,3 +394,21 @@ DROP TABLE payments;
259394

260395
statement ok
261396
DROP TABLE duplicates_with_null;
397+
398+
statement ok
399+
DROP TABLE customers;
400+
401+
statement ok
402+
DROP TABLE banned;
403+
404+
statement ok
405+
DROP TABLE active_customers;
406+
407+
statement ok
408+
DROP TABLE orders_test;
409+
410+
statement ok
411+
DROP TABLE customers_test;
412+
413+
statement ok
414+
DROP TABLE all_null_banned;

0 commit comments

Comments
 (0)