Skip to content

Commit d83584c

Browse files
authored
Create new block when filter OrdinalBytesRefBlock (elastic#136444)
Currently, when filtering OrdinalBytesRefBlock and OrdinalBytesRefVector, we reuse the existing dictionary and filter only the ordinals. Although this is semantically correct, it can break HashAggregationOperator when ordinal optimizations are enabled in BlockHash. Example: given an incoming OrdinalBytesRefBlock after filtering: dict: [apple, banana, orange] ordinals: [0, 0, 0, 0, 1, 1] Here, orange is not referenced. During grouping and hashing, all dictionary entries [apple, banana, orange] are hashed with group IDs [1, 2, 3] (0 reserved for null), and the block produces groupings [1, 1, 1, 1, 2, 2]. The output is correct, but when evaluating partial/final results, we cannot exclude orange (groupId=3) unless we enter slow mode and track every seen group. This can cause an IndexOutOfBoundsException if orange is the largest group and the over-allocation is not enough to cover it, or orange may be included in the result even though it was excluded previously. This change enforces the creation of a new Block/Vector when filtering OrdinalBytesRefBlock and OrdinalBytesRefVector. Closes elastic#136423
1 parent 6ccbbce commit d83584c

File tree

6 files changed

+132
-51
lines changed

6 files changed

+132
-51
lines changed

docs/changelog/136444.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 136444
2+
summary: Create new block when filter `OrdinalBytesRefBlock`
3+
area: ES|QL
4+
type: bug
5+
issues:
6+
- 136423

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/data/OrdinalBytesRefBlock.java

Lines changed: 23 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -91,41 +91,33 @@ public OrdinalBytesRefBlock asOrdinals() {
9191

9292
@Override
9393
public BytesRefBlock filter(int... positions) {
94-
if (positions.length * ordinals.getTotalValueCount() >= bytes.getPositionCount() * ordinals.getPositionCount()) {
95-
OrdinalBytesRefBlock result = null;
96-
IntBlock filteredOrdinals = ordinals.filter(positions);
97-
try {
98-
result = new OrdinalBytesRefBlock(filteredOrdinals, bytes);
99-
bytes.incRef();
100-
} finally {
101-
if (result == null) {
102-
filteredOrdinals.close();
94+
// Do not build a filtered block using the same dictionary, because dictionary entries that are not referenced
95+
// may reappear when hashing the dictionary in BlockHash.
96+
// TODO: merge this BytesRefArrayBlock#filter
97+
final OrdinalBytesRefVector vector = asVector();
98+
if (vector != null) {
99+
return vector.filter(positions).asBlock();
100+
}
101+
BytesRef scratch = new BytesRef();
102+
try (BytesRefBlock.Builder builder = blockFactory().newBytesRefBlockBuilder(positions.length)) {
103+
for (int pos : positions) {
104+
if (isNull(pos)) {
105+
builder.appendNull();
106+
continue;
103107
}
104-
}
105-
return result;
106-
} else {
107-
// TODO: merge this BytesRefArrayBlock#filter
108-
BytesRef scratch = new BytesRef();
109-
try (BytesRefBlock.Builder builder = blockFactory().newBytesRefBlockBuilder(positions.length)) {
110-
for (int pos : positions) {
111-
if (isNull(pos)) {
112-
builder.appendNull();
113-
continue;
114-
}
115-
int valueCount = getValueCount(pos);
116-
int first = getFirstValueIndex(pos);
117-
if (valueCount == 1) {
118-
builder.appendBytesRef(getBytesRef(getFirstValueIndex(pos), scratch));
119-
} else {
120-
builder.beginPositionEntry();
121-
for (int c = 0; c < valueCount; c++) {
122-
builder.appendBytesRef(getBytesRef(first + c, scratch));
123-
}
124-
builder.endPositionEntry();
108+
int valueCount = getValueCount(pos);
109+
int first = getFirstValueIndex(pos);
110+
if (valueCount == 1) {
111+
builder.appendBytesRef(getBytesRef(getFirstValueIndex(pos), scratch));
112+
} else {
113+
builder.beginPositionEntry();
114+
for (int c = 0; c < valueCount; c++) {
115+
builder.appendBytesRef(getBytesRef(first + c, scratch));
125116
}
117+
builder.endPositionEntry();
126118
}
127-
return builder.mvOrdering(mvOrdering()).build();
128119
}
120+
return builder.mvOrdering(mvOrdering()).build();
129121
}
130122
}
131123

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/data/OrdinalBytesRefVector.java

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -99,26 +99,14 @@ public BytesRefVector getDictionaryVector() {
9999

100100
@Override
101101
public BytesRefVector filter(int... positions) {
102-
if (positions.length >= ordinals.getPositionCount()) {
103-
OrdinalBytesRefVector result = null;
104-
IntVector filteredOrdinals = ordinals.filter(positions);
105-
try {
106-
result = new OrdinalBytesRefVector(filteredOrdinals, bytes);
107-
bytes.incRef();
108-
} finally {
109-
if (result == null) {
110-
filteredOrdinals.close();
111-
}
112-
}
113-
return result;
114-
} else {
115-
final BytesRef scratch = new BytesRef();
116-
try (BytesRefVector.Builder builder = blockFactory().newBytesRefVectorBuilder(positions.length)) {
117-
for (int p : positions) {
118-
builder.appendBytesRef(getBytesRef(p, scratch));
119-
}
120-
return builder.build();
102+
// Do not build a filtered block using the same dictionary, because dictionary entries that are not referenced
103+
// may reappear when hashing the dictionary in BlockHash.
104+
final BytesRef scratch = new BytesRef();
105+
try (BytesRefVector.Builder builder = blockFactory().newBytesRefVectorBuilder(positions.length)) {
106+
for (int p : positions) {
107+
builder.appendBytesRef(getBytesRef(p, scratch));
121108
}
109+
return builder.build();
122110
}
123111
}
124112

x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/data/BasicBlockTests.java

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
import static org.hamcrest.Matchers.greaterThan;
5252
import static org.hamcrest.Matchers.instanceOf;
5353
import static org.hamcrest.Matchers.is;
54+
import static org.hamcrest.Matchers.not;
5455
import static org.hamcrest.Matchers.notNullValue;
5556
import static org.hamcrest.Matchers.nullValue;
5657
import static org.hamcrest.Matchers.sameInstance;
@@ -1518,6 +1519,67 @@ public void testRefCountingDocVector() {
15181519
assertThat(breaker.getUsed(), is(0L));
15191520
}
15201521

1522+
public void testFilterOrdinalBytesRefBlock() {
1523+
int dictSize = between(1, 10);
1524+
int positionCount = between(1, 100);
1525+
try (
1526+
var dictBuilder = blockFactory.newBytesRefVectorBuilder(dictSize);
1527+
var ordinalBuilder = blockFactory.newIntBlockBuilder(positionCount)
1528+
) {
1529+
for (int i = 0; i < dictSize; i++) {
1530+
dictBuilder.appendBytesRef(new BytesRef("value" + i));
1531+
}
1532+
for (int i = 0; i < positionCount; i++) {
1533+
int valueCount = randomIntBetween(0, 2);
1534+
switch (valueCount) {
1535+
case 0 -> ordinalBuilder.appendNull();
1536+
case 1 -> ordinalBuilder.appendInt(randomIntBetween(0, dictSize - 1));
1537+
default -> {
1538+
ordinalBuilder.beginPositionEntry();
1539+
for (int v = 0; v < valueCount; v++) {
1540+
ordinalBuilder.appendInt(randomIntBetween(0, dictSize - 1));
1541+
}
1542+
ordinalBuilder.endPositionEntry();
1543+
}
1544+
}
1545+
}
1546+
try (var block = new OrdinalBytesRefBlock(ordinalBuilder.build(), dictBuilder.build())) {
1547+
int[] masks = new int[between(1, 100)];
1548+
for (int i = 0; i < masks.length; i++) {
1549+
masks[i] = randomIntBetween(0, positionCount - 1);
1550+
}
1551+
try (var filtered = block.filter(masks)) {
1552+
assertThat(filtered, not(instanceOf(OrdinalBytesRefBlock.class)));
1553+
}
1554+
}
1555+
}
1556+
}
1557+
1558+
public void testFilterOrdinalBytesRefVector() {
1559+
int dictSize = between(1, 10);
1560+
int positionCount = between(1, 100);
1561+
try (
1562+
var dictBuilder = blockFactory.newBytesRefVectorBuilder(dictSize);
1563+
var ordinalBuilder = blockFactory.newIntVectorBuilder(positionCount)
1564+
) {
1565+
for (int i = 0; i < dictSize; i++) {
1566+
dictBuilder.appendBytesRef(new BytesRef("value" + i));
1567+
}
1568+
for (int i = 0; i < positionCount; i++) {
1569+
ordinalBuilder.appendInt(randomIntBetween(0, dictSize - 1));
1570+
}
1571+
try (var vector = new OrdinalBytesRefVector(ordinalBuilder.build(), dictBuilder.build())) {
1572+
int[] masks = new int[between(1, 100)];
1573+
for (int i = 0; i < masks.length; i++) {
1574+
masks[i] = randomIntBetween(0, positionCount - 1);
1575+
}
1576+
try (var filtered = vector.filter(masks)) {
1577+
assertThat(filtered, not(instanceOf(OrdinalBytesRefVector.class)));
1578+
}
1579+
}
1580+
}
1581+
}
1582+
15211583
/**
15221584
* Take an object with exactly 1 reference and assert that ref counting works fine.
15231585
* Assumes that {@link Releasable#close()} and {@link RefCounted#decRef()} are equivalent.

x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3381,3 +3381,31 @@ VALUES(color):keyword | color:keyword
33813381
yellow | yellow
33823382
// end::mv-group-values-expand-result[]
33833383
;
3384+
3385+
filterOrdinalValues
3386+
required_capability: fix_filter_ordinals
3387+
from employees
3388+
| MV_EXPAND job_positions
3389+
| WHERE job_positions != "Purchase Manager"
3390+
| STATS employees=count(*) BY job_positions
3391+
| SORT job_positions
3392+
| LIMIT 1000
3393+
;
3394+
3395+
employees:long | job_positions:keyword
3396+
18 | Accountant
3397+
13 | Architect
3398+
11 | Business Analyst
3399+
13 | Data Scientist
3400+
10 | Head Human Resources
3401+
15 | Internship
3402+
14 | Junior Developer
3403+
20 | Principal Support Engineer
3404+
13 | Python Developer
3405+
15 | Reporting Analyst
3406+
20 | Senior Python Developer
3407+
15 | Senior Team Lead
3408+
11 | Support Engineer
3409+
15 | Tech Lead
3410+
;
3411+

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1509,7 +1509,12 @@ public enum Cap {
15091509
/**
15101510
* Pack dimension values in TS command
15111511
*/
1512-
PACK_DIMENSIONS_IN_TS
1512+
PACK_DIMENSIONS_IN_TS,
1513+
1514+
/**
1515+
* Create new block when filtering OrdinalBytesRefBlock
1516+
*/
1517+
FIX_FILTER_ORDINALS,
15131518

15141519
;
15151520

0 commit comments

Comments
 (0)