Skip to content

Commit 3a24505

Browse files
committed
Add more tests
1 parent 39d14e9 commit 3a24505

File tree

1 file changed

+180
-0
lines changed

1 file changed

+180
-0
lines changed

datafusion/core/tests/parquet/row_group_pruning.rs

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1761,3 +1761,183 @@ async fn test_limit_pruning() -> datafusion_common::error::Result<()> {
17611761

17621762
Ok(())
17631763
}
1764+
1765+
// Helper function to create a batch with two Int32 columns
1766+
fn make_two_col_i32_batch(
1767+
name_a: &str,
1768+
name_b: &str,
1769+
values_a: Vec<i32>,
1770+
values_b: Vec<i32>,
1771+
) -> datafusion_common::error::Result<RecordBatch> {
1772+
let schema = Arc::new(Schema::new(vec![
1773+
Field::new(name_a, DataType::Int32, false),
1774+
Field::new(name_b, DataType::Int32, false),
1775+
]));
1776+
let array_a: ArrayRef = Arc::new(Int32Array::from(values_a));
1777+
let array_b: ArrayRef = Arc::new(Int32Array::from(values_b));
1778+
RecordBatch::try_new(schema, vec![array_a, array_b]).map_err(DataFusionError::from)
1779+
}
1780+
1781+
#[tokio::test]
1782+
async fn test_limit_pruning_complex_filter() -> datafusion_common::error::Result<()> {
1783+
// Test Case 1: Complex filter with two columns (a = 1 AND b > 1 AND b < 4)
1784+
// Row Group 0: a=[1,1,1], b=[0,2,3] -> Partially matched, 2 rows match (b=2,3)
1785+
// Row Group 1: a=[1,1,1], b=[2,2,2] -> Fully matched, 3 rows
1786+
// Row Group 2: a=[1,1,1], b=[2,3,3] -> Fully matched, 3 rows
1787+
// Row Group 3: a=[1,1,1], b=[2,2,3] -> Fully matched, 3 rows
1788+
// Row Group 4: a=[2,2,2], b=[2,2,2] -> Not matched (a != 1)
1789+
// Row Group 5: a=[1,1,1], b=[5,6,7] -> Not matched (b >= 4)
1790+
1791+
// With LIMIT 5, we need RG1 (3 rows) + RG2 (2 rows from 3) = 5 rows
1792+
// RG4 and RG5 should be pruned by statistics
1793+
// RG3 should be pruned by limit
1794+
// RG0 is partially matched, so it depends on the order
1795+
1796+
let schema = Arc::new(Schema::new(vec![
1797+
Field::new("a", DataType::Int32, false),
1798+
Field::new("b", DataType::Int32, false),
1799+
]));
1800+
let query = "SELECT a, b FROM t WHERE a = 1 AND b > 1 AND b < 4 LIMIT 5";
1801+
1802+
let batches = vec![
1803+
make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![0, 2, 3])?,
1804+
make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 2, 2])?,
1805+
make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 3, 3])?,
1806+
make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 2, 3])?,
1807+
make_two_col_i32_batch("a", "b", vec![2, 2, 2], vec![2, 2, 2])?,
1808+
make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![5, 6, 7])?,
1809+
];
1810+
1811+
RowGroupPruningTest::new()
1812+
.with_scenario(Scenario::Int)
1813+
.with_query(query)
1814+
.with_expected_errors(Some(0))
1815+
.with_expected_rows(5)
1816+
.with_pruned_files(Some(0))
1817+
.with_matched_by_stats(Some(4)) // RG0,1,2,3 are matched
1818+
.with_pruned_by_stats(Some(2)) // RG4,5 are pruned
1819+
.with_limit_pruned_row_groups(Some(2)) // RG0, RG3 is pruned by limit
1820+
.test_row_group_prune_with_custom_data(schema, batches, 3)
1821+
.await;
1822+
1823+
Ok(())
1824+
}
1825+
1826+
#[tokio::test]
1827+
async fn test_limit_pruning_multiple_fully_matched(
1828+
) -> datafusion_common::error::Result<()> {
1829+
// Test Case 2: Limit requires multiple fully matched row groups
1830+
// Row Group 0: a=[5,5,5,5] -> Fully matched, 4 rows
1831+
// Row Group 1: a=[5,5,5,5] -> Fully matched, 4 rows
1832+
// Row Group 2: a=[5,5,5,5] -> Fully matched, 4 rows
1833+
// Row Group 3: a=[5,5,5,5] -> Fully matched, 4 rows
1834+
// Row Group 4: a=[1,2,3,4] -> Not matched
1835+
1836+
// With LIMIT 8, we need RG0 (4 rows) + RG1 (4 rows) 8 rows
1837+
// RG2,3 should be pruned by limit
1838+
// RG4 should be pruned by statistics
1839+
1840+
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
1841+
let query = "SELECT a FROM t WHERE a = 5 LIMIT 8";
1842+
1843+
let batches = vec![
1844+
make_i32_batch("a", vec![5, 5, 5, 5])?,
1845+
make_i32_batch("a", vec![5, 5, 5, 5])?,
1846+
make_i32_batch("a", vec![5, 5, 5, 5])?,
1847+
make_i32_batch("a", vec![5, 5, 5, 5])?,
1848+
make_i32_batch("a", vec![1, 2, 3, 4])?,
1849+
];
1850+
1851+
RowGroupPruningTest::new()
1852+
.with_scenario(Scenario::Int)
1853+
.with_query(query)
1854+
.with_expected_errors(Some(0))
1855+
.with_expected_rows(8)
1856+
.with_pruned_files(Some(0))
1857+
.with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
1858+
.with_pruned_by_stats(Some(1)) // RG4 pruned
1859+
.with_limit_pruned_row_groups(Some(2)) // RG2,3 pruned by limit
1860+
.test_row_group_prune_with_custom_data(schema, batches, 4)
1861+
.await;
1862+
1863+
Ok(())
1864+
}
1865+
1866+
#[tokio::test]
1867+
async fn test_limit_pruning_no_fully_matched() -> datafusion_common::error::Result<()> {
1868+
// Test Case 3: No fully matched row groups - all are partially matched
1869+
// Row Group 0: a=[1,2,3] -> Partially matched, 1 row (a=2)
1870+
// Row Group 1: a=[2,3,4] -> Partially matched, 1 row (a=2)
1871+
// Row Group 2: a=[2,5,6] -> Partially matched, 1 row (a=2)
1872+
// Row Group 3: a=[2,7,8] -> Partially matched, 1 row (a=2)
1873+
// Row Group 4: a=[9,10,11] -> Not matched
1874+
1875+
// With LIMIT 3, we need to scan RG0,1,2 to get 3 matching rows
1876+
// Cannot prune much by limit since all matching RGs are partial
1877+
// RG4 should be pruned by statistics
1878+
1879+
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
1880+
let query = "SELECT a FROM t WHERE a = 2 LIMIT 3";
1881+
1882+
let batches = vec![
1883+
make_i32_batch("a", vec![1, 2, 3])?,
1884+
make_i32_batch("a", vec![2, 3, 4])?,
1885+
make_i32_batch("a", vec![2, 5, 6])?,
1886+
make_i32_batch("a", vec![2, 7, 8])?,
1887+
make_i32_batch("a", vec![9, 10, 11])?,
1888+
];
1889+
1890+
RowGroupPruningTest::new()
1891+
.with_scenario(Scenario::Int)
1892+
.with_query(query)
1893+
.with_expected_errors(Some(0))
1894+
.with_expected_rows(3)
1895+
.with_pruned_files(Some(0))
1896+
.with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
1897+
.with_pruned_by_stats(Some(1)) // RG4 pruned
1898+
.with_limit_pruned_row_groups(Some(0)) // RG3 pruned by limit
1899+
.test_row_group_prune_with_custom_data(schema, batches, 3)
1900+
.await;
1901+
1902+
Ok(())
1903+
}
1904+
1905+
#[tokio::test]
1906+
async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error::Result<()>
1907+
{
1908+
// Test Case 4: Limit exceeds all fully matched rows, need partially matched
1909+
// Row Group 0: a=[10,11,12,12] -> Partially matched, 1 row (a=10)
1910+
// Row Group 1: a=[10,10,10,10] -> Fully matched, 4 rows
1911+
// Row Group 2: a=[10,10,10,10] -> Fully matched, 4 rows
1912+
// Row Group 3: a=[10,13,14,11] -> Partially matched, 1 row (a=10)
1913+
// Row Group 4: a=[20,21,22,22] -> Not matched
1914+
1915+
// With LIMIT 10, we need RG1 (4) + RG2 (4) = 8 from fully matched
1916+
// Still need 2 more, so we need to scan partially matched RG0 and RG3
1917+
// All matching row groups should be scanned, only RG4 pruned by statistics
1918+
1919+
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
1920+
let query = "SELECT a FROM t WHERE a = 10 LIMIT 10";
1921+
1922+
let batches = vec![
1923+
make_i32_batch("a", vec![10, 11, 12, 12])?,
1924+
make_i32_batch("a", vec![10, 10, 10, 10])?,
1925+
make_i32_batch("a", vec![10, 10, 10, 10])?,
1926+
make_i32_batch("a", vec![10, 13, 14, 11])?,
1927+
make_i32_batch("a", vec![20, 21, 22, 22])?,
1928+
];
1929+
1930+
RowGroupPruningTest::new()
1931+
.with_scenario(Scenario::Int)
1932+
.with_query(query)
1933+
.with_expected_errors(Some(0))
1934+
.with_expected_rows(10) // Total: 1 + 3 + 4 + 1 = 9 (less than limit)
1935+
.with_pruned_files(Some(0))
1936+
.with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
1937+
.with_pruned_by_stats(Some(1)) // RG4 pruned
1938+
.with_limit_pruned_row_groups(Some(0)) // No limit pruning since we need all RGs
1939+
.test_row_group_prune_with_custom_data(schema, batches, 4)
1940+
.await;
1941+
1942+
Ok(())
1943+
}

0 commit comments

Comments
 (0)