Skip to content

Commit 8a2d2ed

Browse files
cecemeikfaraz
andauthored
Add embedded test for object-storage-encoding format for nested field (#18818)
* nested-it * checkstyle * license * fix-test * emitter-config * setup * prefix * small-update * Update embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/NestedDataFormatsTest.java Co-authored-by: Kashif Faraz <[email protected]> * datasourceWithDefaultFormat * prefix * Update embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/NestedDataFormatsTest.java Co-authored-by: Kashif Faraz <[email protected]> * style * sql * nested * client-sql * format --------- Co-authored-by: Kashif Faraz <[email protected]>
1 parent 69505a3 commit 8a2d2ed

File tree

5 files changed

+169
-8
lines changed

5 files changed

+169
-8
lines changed
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.druid.testing.embedded.indexing;
21+
22+
import org.apache.druid.indexing.common.task.TaskBuilder;
23+
import org.apache.druid.java.util.common.StringUtils;
24+
import org.apache.druid.segment.IndexSpec;
25+
import org.apache.druid.segment.nested.NestedCommonFormatColumnFormatSpec;
26+
import org.apache.druid.segment.nested.ObjectStorageEncoding;
27+
import org.apache.druid.testing.embedded.EmbeddedBroker;
28+
import org.apache.druid.testing.embedded.EmbeddedClusterApis;
29+
import org.apache.druid.testing.embedded.EmbeddedCoordinator;
30+
import org.apache.druid.testing.embedded.EmbeddedDruidCluster;
31+
import org.apache.druid.testing.embedded.EmbeddedHistorical;
32+
import org.apache.druid.testing.embedded.EmbeddedIndexer;
33+
import org.apache.druid.testing.embedded.EmbeddedOverlord;
34+
import org.apache.druid.testing.embedded.EmbeddedRouter;
35+
import org.apache.druid.testing.embedded.junit5.EmbeddedClusterTestBase;
36+
import org.junit.jupiter.api.Assertions;
37+
import org.junit.jupiter.api.BeforeAll;
38+
import org.junit.jupiter.api.Test;
39+
40+
/**
41+
* Embedded tests for nested data, ingested in different {@link NestedCommonFormatColumnFormatSpec}.
42+
*/
43+
public class NestedDataFormatsTest extends EmbeddedClusterTestBase
44+
{
45+
private final EmbeddedBroker broker = new EmbeddedBroker();
46+
private final EmbeddedOverlord overlord = new EmbeddedOverlord();
47+
private final EmbeddedCoordinator coordinator = new EmbeddedCoordinator();
48+
49+
private final String datasourceWithDefaultFormat = EmbeddedClusterApis.createTestDatasourceName();
50+
51+
@Override
52+
protected EmbeddedDruidCluster createCluster()
53+
{
54+
return EmbeddedDruidCluster.withEmbeddedDerbyAndZookeeper()
55+
.useLatchableEmitter()
56+
.useDefaultTimeoutForLatchableEmitter(120)
57+
.addServer(overlord)
58+
.addServer(coordinator)
59+
.addServer(new EmbeddedIndexer())
60+
.addServer(new EmbeddedHistorical())
61+
.addServer(broker)
62+
.addServer(new EmbeddedRouter());
63+
}
64+
65+
@BeforeAll
66+
protected void ingestWithDefaultFormat()
67+
{
68+
final TaskBuilder.IndexParallel indexTask =
69+
TaskBuilder.ofTypeIndexParallel()
70+
.dataSource(datasourceWithDefaultFormat)
71+
.timestampColumn("timestamp")
72+
.jsonInputFormat()
73+
.inputSource(Resources.HttpData.kttmNested1Day())
74+
.schemaDiscovery();
75+
76+
final String taskId = EmbeddedClusterApis.newTaskId(datasourceWithDefaultFormat);
77+
cluster.callApi().runTask(indexTask.withId(taskId), overlord);
78+
cluster.callApi().waitForAllSegmentsToBeAvailable(datasourceWithDefaultFormat, coordinator, broker);
79+
}
80+
81+
@Test
82+
public void test_objectStorageEncoding()
83+
{
84+
// Ingest kttm data with skipping smile raw json format, comparing diff with defaultFormat
85+
NestedCommonFormatColumnFormatSpec spec =
86+
NestedCommonFormatColumnFormatSpec.builder().setObjectStorageEncoding(ObjectStorageEncoding.NONE).build();
87+
final TaskBuilder.IndexParallel indexTask =
88+
TaskBuilder.ofTypeIndexParallel()
89+
.dataSource(dataSource)
90+
.timestampColumn("timestamp")
91+
.jsonInputFormat()
92+
.inputSource(Resources.HttpData.kttmNested1Day())
93+
.schemaDiscovery()
94+
.tuningConfig(t -> t.withIndexSpec(IndexSpec.builder().withAutoColumnFormatSpec(spec).build()));
95+
final String taskId = EmbeddedClusterApis.newTaskId(dataSource);
96+
cluster.callApi().runTask(indexTask.withId(taskId), overlord);
97+
cluster.callApi().waitForAllSegmentsToBeAvailable(dataSource, coordinator, broker);
98+
99+
// Test ingesting with skipping raw json smile format works, same row count, with ~20% storage saving
100+
final String metadataSql = "select sum(num_rows), sum(size) from sys.segments where datasource = '%s'";
101+
final String defaultFormatResult = cluster.runSql(metadataSql, datasourceWithDefaultFormat);
102+
final String noneObjectStorageFormatResult = cluster.runSql(metadataSql, dataSource);
103+
Assertions.assertEquals(StringUtils.format("%d,%d", 465_346, 53_000_804), defaultFormatResult);
104+
Assertions.assertEquals(StringUtils.format("%d,%d", 465_346, 41_938_750), noneObjectStorageFormatResult);
105+
106+
// Test querying on a nested field works
107+
final String groupByQuery =
108+
"""
109+
select json_value(event, '$.type') as event_type, count(*) as total from %s
110+
group by 1 order by 2 desc, 1 asc limit 10
111+
""";
112+
final String queryResultDefaultFormat = cluster.runSql(groupByQuery, datasourceWithDefaultFormat);
113+
final String queryResultNoneObjectStorage = cluster.runSql(groupByQuery, dataSource);
114+
Assertions.assertEquals(queryResultDefaultFormat, queryResultNoneObjectStorage);
115+
116+
// Test reconstruct json column works, the ordering of the fields has changed, but all values are perserved.
117+
final String scanQuery =
118+
"""
119+
select event, to_json_string(agent) as agent from %s
120+
where json_value(event, '$.type') = 'PercentClear' and json_value(agent, '$.os') = 'Android'
121+
order by __time asc limit 1
122+
""";
123+
final String scanQueryResultDefaultFormat = cluster.runSql(scanQuery, datasourceWithDefaultFormat);
124+
final String scanQueryResultNoneObjectStorage = cluster.runSql(scanQuery, dataSource);
125+
// CHECKSTYLE: text blocks not supported in current Checkstyle version
126+
Assertions.assertEquals(
127+
"""
128+
"{""type"":""PercentClear"",""percentage"":85}","{""type"":""Mobile Browser"",""category"":""Smartphone"",""browser"":""Chrome Mobile"",""browser_version"":""50.0.2661.89"",""os"":""Android"",""platform"":""Android""}"
129+
""".trim(),
130+
scanQueryResultDefaultFormat
131+
);
132+
Assertions.assertEquals(
133+
"""
134+
"{""percentage"":85,""type"":""PercentClear""}","{""browser"":""Chrome Mobile"",""browser_version"":""50.0.2661.89"",""category"":""Smartphone"",""os"":""Android"",""platform"":""Android"",""type"":""Mobile Browser""}"
135+
""".trim(),
136+
scanQueryResultNoneObjectStorage
137+
);
138+
}
139+
}

indexing-service/src/test/java/org/apache/druid/indexing/common/task/TaskBuilder.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,9 @@
6262
* explicitly.
6363
*
6464
* @param <Self> Type of this builder itself
65-
* @param <C> Type of tuning config used by this builder.
66-
* @param <T> Type of task created by this builder.
67-
* @param <CB> Type of tuning config builder
65+
* @param <C> Type of tuning config used by this builder.
66+
* @param <T> Type of task created by this builder.
67+
* @param <CB> Type of tuning config builder
6868
* @see #ofTypeIndex()
6969
* @see #tuningConfig(Consumer) to specify the {@code tuningConfig}.
7070
*/
@@ -286,6 +286,12 @@ public Self dimensions(String... dimensions)
286286
return (Self) this;
287287
}
288288

289+
public Self schemaDiscovery()
290+
{
291+
dataSchema.withDimensions(DimensionsSpec.builder().useSchemaDiscovery(true).build());
292+
return (Self) this;
293+
}
294+
289295
public Self metricAggregates(AggregatorFactory... aggregators)
290296
{
291297
dataSchema.withAggregators(aggregators);

pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
</scm>
7171

7272
<properties>
73-
<java.version>11</java.version>
73+
<java.version>17</java.version>
7474
<maven.compiler.release>${java.version}</maven.compiler.release>
7575
<project.build.resourceEncoding>UTF-8</project.build.resourceEncoding>
7676
<aether.version>0.9.0.M2</aether.version>
@@ -1572,7 +1572,7 @@
15721572
<consoleOutput>true</consoleOutput>
15731573
<failsOnError>true</failsOnError>
15741574
<excludes>
1575-
*com/fasterxml/jackson/databind/*
1575+
*com/fasterxml/jackson/databind/*,**/NestedDataFormatsTest.java
15761576
</excludes>
15771577
</configuration>
15781578
<dependencies>

services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
import org.apache.druid.query.DruidMetrics;
4242
import org.apache.druid.query.http.ClientSqlQuery;
4343
import org.apache.druid.rpc.indexing.OverlordClient;
44-
import org.apache.druid.segment.TestDataSource;
4544
import org.apache.druid.segment.TestHelper;
4645
import org.apache.druid.server.metrics.LatchableEmitter;
4746
import org.apache.druid.sql.http.ResultFormat;
@@ -421,11 +420,11 @@ public SupervisorStatus getSupervisorStatus(String supervisorId)
421420
// STATIC UTILITY METHODS
422421

423422
/**
424-
* Creates a random datasource name prefixed with {@link TestDataSource#WIKI}.
423+
* Creates a random datasource name prefixed with {@code datasource_}.
425424
*/
426425
public static String createTestDatasourceName()
427426
{
428-
return TestDataSource.WIKI + "_" + IdUtils.getRandomId();
427+
return "datasource_" + IdUtils.getRandomId();
429428
}
430429

431430
/**

services/src/test/java/org/apache/druid/testing/embedded/indexing/Resources.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,23 @@ public static HttpInputSource wikipedia1Day()
131131
throw new RuntimeException(e);
132132
}
133133
}
134+
135+
public static HttpInputSource kttmNested1Day()
136+
{
137+
try {
138+
return new HttpInputSource(
139+
List.of(new URIBuilder("https://static.imply.io/example-data/kttm-nested-v2/kttm-nested-v2-2019-08-25.json.gz").build()),
140+
null,
141+
null,
142+
null,
143+
null,
144+
new HttpInputSourceConfig(null, null)
145+
);
146+
}
147+
catch (URISyntaxException e) {
148+
throw new RuntimeException(e);
149+
}
150+
}
134151
}
135152

136153
/**

0 commit comments

Comments
 (0)