Skip to content

Commit 9b6d18f

Browse files
authored
compaction now only identifies multi-value dimensions for dimension schemas which can possibly produce them (#18760)
changes: * added `DimensionSchema.canBeMultiValued`, defaulting to false but true for `StringDimensionSchema` and `NewSpatialDimensionSchema` * `CompactionTask.identifyMultiValuedDimensions` now checks `DimensionSchema.canBeMultiValued()` instead of checking for `ColumnType.STRING` from `DimensionSchema.getColumnType()` * added test to confirm behavior using `AutoTypeColumnSchema` with a specified `castToType` of `ColumnType.STRING`, which prior to changes would identify as multi-valued despite auto never producing them
1 parent 5626213 commit 9b6d18f

File tree

5 files changed

+70
-17
lines changed

5 files changed

+70
-17
lines changed

indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@
8484
import org.apache.druid.segment.Segment;
8585
import org.apache.druid.segment.column.ColumnCapabilities;
8686
import org.apache.druid.segment.column.ColumnHolder;
87-
import org.apache.druid.segment.column.ColumnType;
8887
import org.apache.druid.segment.incremental.AppendableIndexSpec;
8988
import org.apache.druid.segment.indexing.CombinedDataSchema;
9089
import org.apache.druid.segment.indexing.DataSchema;
@@ -466,13 +465,13 @@ public boolean isPerfectRollup()
466465
* Checks if multi-valued string dimensions need to be analyzed by downloading the segments.
467466
* This method returns true only for MSQ engine when either of the following holds true:
468467
* <ul>
469-
* <li> Range partitioning is done on a string dimension or an unknown dimension
468+
* <li> Range partitioning is done on a possibly multi-valued string dimension or an unknown dimension
470469
* (since MSQ does not support partitioning on a multi-valued string dimension) </li>
471-
* <li> Rollup is done on a string dimension or an unknown dimension
470+
* <li> Rollup is done on a multi-valued string dimension or an unknown dimension
472471
* (since MSQ requires multi-valued string dimensions to be converted to arrays for rollup) </li>
473472
* </ul>
474-
* @return false for native engine, true for MSQ engine only when partitioning or rollup is done on a string
475-
* or unknown dimension.
473+
* @return false for native engine, true for MSQ engine only when partitioning or rollup is done on a multi-valued
474+
* string or unknown dimension.
476475
*/
477476
boolean identifyMultiValuedDimensions()
478477
{
@@ -481,28 +480,31 @@ boolean identifyMultiValuedDimensions()
481480
}
482481
// Rollup can be true even when granularitySpec is not known since rollup is then decided based on segment analysis
483482
final boolean isPossiblyRollup = granularitySpec == null || !Boolean.FALSE.equals(granularitySpec.isRollup());
484-
boolean isRangePartitioned = tuningConfig != null
485-
&& tuningConfig.getPartitionsSpec() instanceof DimensionRangePartitionsSpec;
483+
final DimensionRangePartitionsSpec rangeSpec;
484+
if (tuningConfig != null && tuningConfig.getPartitionsSpec() instanceof DimensionRangePartitionsSpec) {
485+
rangeSpec = (DimensionRangePartitionsSpec) tuningConfig.getPartitionsSpec();
486+
} else {
487+
rangeSpec = null;
488+
}
489+
final boolean isRangePartitioned = rangeSpec != null;
486490

487491
if (dimensionsSpec == null || dimensionsSpec.getDimensions().isEmpty()) {
488492
return isPossiblyRollup || isRangePartitioned;
489493
} else {
490-
boolean isRollupOnStringDimension = isPossiblyRollup &&
491-
dimensionsSpec.getDimensions()
492-
.stream()
493-
.anyMatch(dim -> ColumnType.STRING.equals(dim.getColumnType()));
494+
boolean isRollupOnMultiValueStringDimension = isPossiblyRollup &&
495+
dimensionsSpec.getDimensions()
496+
.stream()
497+
.anyMatch(DimensionSchema::canBeMultiValued);
494498

495-
boolean isPartitionedOnStringDimension =
499+
boolean isPartitionedOnMultiValueStringDimension =
496500
isRangePartitioned &&
497501
dimensionsSpec.getDimensions()
498502
.stream()
499503
.anyMatch(
500-
dim -> ColumnType.STRING.equals(dim.getColumnType())
501-
&& ((DimensionRangePartitionsSpec) tuningConfig.getPartitionsSpec())
502-
.getPartitionDimensions()
503-
.contains(dim.getName())
504+
dim -> dim.canBeMultiValued()
505+
&& rangeSpec.getPartitionDimensions().contains(dim.getName())
504506
);
505-
return isRollupOnStringDimension || isPartitionedOnStringDimension;
507+
return isRollupOnMultiValueStringDimension || isPartitionedOnMultiValueStringDimension;
506508
}
507509
}
508510

indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskTest.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@
9797
import org.apache.druid.query.aggregation.firstlast.last.DoubleLastAggregatorFactory;
9898
import org.apache.druid.query.expression.TestExprMacroTable;
9999
import org.apache.druid.query.filter.SelectorDimFilter;
100+
import org.apache.druid.segment.AutoTypeColumnSchema;
100101
import org.apache.druid.segment.IndexIO;
101102
import org.apache.druid.segment.IndexMergerV9;
102103
import org.apache.druid.segment.IndexSpec;
@@ -1674,6 +1675,33 @@ public void testMSQRangePartitionOnStringNeedsMVDInfo()
16741675
Assert.assertTrue(compactionTask.identifyMultiValuedDimensions());
16751676
}
16761677

1678+
@Test
1679+
public void testMSQRangePartitionOnAutoStringDoesNotNeedMVDInfo()
1680+
{
1681+
final Builder builder = new Builder(
1682+
DATA_SOURCE,
1683+
segmentCacheManagerFactory
1684+
);
1685+
builder.inputSpec(new CompactionIntervalSpec(COMPACTION_INTERVAL, SegmentUtils.hashIds(SEGMENTS)));
1686+
builder.compactionRunner(new TestMSQCompactionRunner());
1687+
1688+
DimensionSchema stringDim = new AutoTypeColumnSchema("string_dim_1", ColumnType.STRING, null);
1689+
builder.tuningConfig(TuningConfigBuilder.forCompactionTask()
1690+
.withForceGuaranteedRollup(true)
1691+
.withPartitionsSpec(
1692+
new DimensionRangePartitionsSpec(
1693+
3,
1694+
null,
1695+
List.of(stringDim.getName()),
1696+
false
1697+
)
1698+
)
1699+
.build());
1700+
builder.dimensionsSpec(new DimensionsSpec(ImmutableList.of(stringDim)));
1701+
CompactionTask compactionTask = builder.build();
1702+
Assert.assertFalse(compactionTask.identifyMultiValuedDimensions());
1703+
}
1704+
16771705
@Test
16781706
public void testChooseFinestGranularityWithNulls()
16791707
{

processing/src/main/java/org/apache/druid/data/input/impl/DimensionSchema.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,17 @@ public boolean hasBitmapIndex()
176176
@JsonIgnore
177177
public abstract ColumnType getColumnType();
178178

179+
/**
180+
* Returns true if the {@link DimensionHandler#makeIndexer()} of this schema can produce multi-valued
181+
* {@link ColumnType#STRING} columns. This method is used by MSQ compaction to determine if it needs to download the
182+
* segments to check if any are actually multi-valued.
183+
*/
184+
@JsonIgnore
185+
public boolean canBeMultiValued()
186+
{
187+
return false;
188+
}
189+
179190
@JsonIgnore
180191
public DimensionHandler getDimensionHandler()
181192
{

processing/src/main/java/org/apache/druid/data/input/impl/NewSpatialDimensionSchema.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,12 @@ public ColumnType getColumnType()
6969
return ColumnType.STRING;
7070
}
7171

72+
@Override
73+
public boolean canBeMultiValued()
74+
{
75+
return true;
76+
}
77+
7278
@Override
7379
public DimensionHandler getDimensionHandler()
7480
{

processing/src/main/java/org/apache/druid/data/input/impl/StringDimensionSchema.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,12 @@ public ColumnType getColumnType()
6464
return ColumnType.STRING;
6565
}
6666

67+
@Override
68+
public boolean canBeMultiValued()
69+
{
70+
return true;
71+
}
72+
6773
@Override
6874
public DimensionHandler getDimensionHandler()
6975
{

0 commit comments

Comments
 (0)