Skip to content

Commit 24e815a

Browse files
committed
use batched iteration for intersect and union
1 parent a979901 commit 24e815a

File tree

2 files changed

+61
-14
lines changed

2 files changed

+61
-14
lines changed

cpp/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ build/
2727
Testing/
2828
build-support/boost_*
2929
vcpkg_installed/
30+
_deps/
3031

3132
# Build directories created by Clion
3233
cmake-build-*/

cpp/src/parquet/row_selection.cc

Lines changed: 60 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,20 @@ int64_t RowSelection::row_count() const {
7676

7777
RowSelection RowSelection::Intersect(const RowSelection& lhs, const RowSelection& rhs) {
7878
RowSelection result;
79+
80+
// Walk both selections batch by batch via their iterators; an empty batch means that side is exhausted
81+
auto lhs_iter = lhs.NewIterator();
82+
auto rhs_iter = rhs.NewIterator();
83+
84+
auto lhs_batch = lhs_iter->NextRanges();
85+
auto rhs_batch = rhs_iter->NextRanges();
7986
size_t lhs_idx = 0;
8087
size_t rhs_idx = 0;
8188

82-
while (lhs_idx < lhs.ranges_.size() && rhs_idx < rhs.ranges_.size()) {
83-
const auto& left = lhs.ranges_[lhs_idx];
84-
const auto& right = rhs.ranges_[rhs_idx];
89+
while (!lhs_batch.empty() && !rhs_batch.empty()) {
90+
// Get current ranges from batches
91+
const auto& left = lhs_batch[lhs_idx];
92+
const auto& right = rhs_batch[rhs_idx];
8593

8694
int64_t left_end = left.start + left.length - 1;
8795
int64_t right_end = right.start + right.length - 1;
@@ -95,11 +103,19 @@ RowSelection RowSelection::Intersect(const RowSelection& lhs, const RowSelection
95103
result.ranges_.push_back(IntervalRange{start, end - start + 1});
96104
}
97105

98-
// Advance the iterator with smaller end
106+
// Advance the side whose current range ends first, fetching its next batch once the current one is used up
99107
if (left_end < right_end) {
100108
lhs_idx++;
109+
if (lhs_idx >= lhs_batch.size()) {
110+
lhs_batch = lhs_iter->NextRanges();
111+
lhs_idx = 0;
112+
}
101113
} else {
102114
rhs_idx++;
115+
if (rhs_idx >= rhs_batch.size()) {
116+
rhs_batch = rhs_iter->NextRanges();
117+
rhs_idx = 0;
118+
}
103119
}
104120
}
105121

@@ -116,37 +132,67 @@ RowSelection RowSelection::Union(const RowSelection& lhs, const RowSelection& rh
116132
return lhs;
117133
}
118134

135+
// Walk both selections batch by batch via their iterators; an empty batch means that side is exhausted
136+
auto lhs_iter = lhs.NewIterator();
137+
auto rhs_iter = rhs.NewIterator();
138+
139+
auto lhs_batch = lhs_iter->NextRanges();
140+
auto rhs_batch = rhs_iter->NextRanges();
119141
size_t lhs_idx = 0;
120142
size_t rhs_idx = 0;
121143

122144
// Start with whichever range has the smaller start
123145
IntervalRange current;
124-
if (lhs.ranges_[0].start <= rhs.ranges_[0].start) {
125-
current = lhs.ranges_[lhs_idx++];
146+
if (lhs_batch[0].start <= rhs_batch[0].start) {
147+
current = lhs_batch[lhs_idx++];
148+
if (lhs_idx >= lhs_batch.size()) {
149+
lhs_batch = lhs_iter->NextRanges();
150+
lhs_idx = 0;
151+
}
126152
} else {
127-
current = rhs.ranges_[rhs_idx++];
153+
current = rhs_batch[rhs_idx++];
154+
if (rhs_idx >= rhs_batch.size()) {
155+
rhs_batch = rhs_iter->NextRanges();
156+
rhs_idx = 0;
157+
}
128158
}
129159

130-
while (lhs_idx < lhs.ranges_.size() || rhs_idx < rhs.ranges_.size()) {
160+
while (!lhs_batch.empty() || !rhs_batch.empty()) {
131161
IntervalRange next;
132162

133-
if (rhs_idx >= rhs.ranges_.size()) {
163+
if (rhs_batch.empty()) {
134164
// Only lhs ranges remain
135-
next = lhs.ranges_[lhs_idx++];
136-
} else if (lhs_idx >= lhs.ranges_.size()) {
165+
next = lhs_batch[lhs_idx++];
166+
if (lhs_idx >= lhs_batch.size()) {
167+
lhs_batch = lhs_iter->NextRanges();
168+
lhs_idx = 0;
169+
}
170+
} else if (lhs_batch.empty()) {
137171
// Only rhs ranges remain
138-
next = rhs.ranges_[rhs_idx++];
172+
next = rhs_batch[rhs_idx++];
173+
if (rhs_idx >= rhs_batch.size()) {
174+
rhs_batch = rhs_iter->NextRanges();
175+
rhs_idx = 0;
176+
}
139177
} else {
140178
// Both have ranges - pick the one with smaller start
141-
const auto& left = lhs.ranges_[lhs_idx];
142-
const auto& right = rhs.ranges_[rhs_idx];
179+
const auto& left = lhs_batch[lhs_idx];
180+
const auto& right = rhs_batch[rhs_idx];
143181

144182
if (left.start <= right.start) {
145183
next = left;
146184
lhs_idx++;
185+
if (lhs_idx >= lhs_batch.size()) {
186+
lhs_batch = lhs_iter->NextRanges();
187+
lhs_idx = 0;
188+
}
147189
} else {
148190
next = right;
149191
rhs_idx++;
192+
if (rhs_idx >= rhs_batch.size()) {
193+
rhs_batch = rhs_iter->NextRanges();
194+
rhs_idx = 0;
195+
}
150196
}
151197
}
152198

0 commit comments

Comments
 (0)