Skip to content

Commit 779bc05

Browse files
[Improvement](Iceberg) Optimize LocationPath.of performance for Iceberg table queries (#59217)
### What problem does this PR solve? ### Problem When querying Iceberg tables with large number of data files, `LocationPath.of` method consumes significant CPU time. The main bottlenecks are: 1. Repeated regex parsing in `S3URI.create()` for each file path 2. Multiple `String.split()` calls for scheme extraction 3. Repeated `StorageProperties` lookup from map for each file ### Solution This PR introduces several optimizations to reduce CPU overhead: #### 1. Optimize scheme parsing in `LocationPath.parseScheme` - Replace `String.split("://")` with `indexOf` + `substring` to avoid array allocation #### 2. Add fast path for S3-compatible schemes in `S3PropertyUtils.validateAndNormalizeUri` - For simple S3-compatible URIs like `oss://bucket/key`, `s3a://bucket/key`, use direct string replacement instead of full S3URI regex parsing - Only fall back to full S3URI parsing for complex HTTP URLs #### 3. Add path prefix caching in `IcebergScanNode` - Cache `StorageProperties`, schema, and path prefix mapping on first file - For subsequent files with the same prefix, directly transform paths using string replacement - New `LocationPath.ofDirect()` method to create LocationPath without any parsing
1 parent 2458657 commit 779bc05

File tree

5 files changed

+260
-19
lines changed

5 files changed

+260
-19
lines changed

fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java

Lines changed: 80 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -96,27 +96,25 @@ private LocationPath(String schema,
9696
}
9797

9898
private static String parseScheme(String finalLocation) {
99-
String scheme = "";
100-
String[] schemeSplit = finalLocation.split(SCHEME_DELIM);
101-
if (schemeSplit.length > 1) {
102-
scheme = schemeSplit[0];
103-
} else {
104-
schemeSplit = finalLocation.split(NONSTANDARD_SCHEME_DELIM);
105-
if (schemeSplit.length > 1) {
106-
scheme = schemeSplit[0];
107-
}
99+
// Use indexOf instead of split for better performance
100+
int schemeDelimIndex = finalLocation.indexOf(SCHEME_DELIM);
101+
if (schemeDelimIndex > 0) {
102+
return finalLocation.substring(0, schemeDelimIndex);
103+
}
104+
105+
int nonstandardDelimIndex = finalLocation.indexOf(NONSTANDARD_SCHEME_DELIM);
106+
if (nonstandardDelimIndex > 0) {
107+
return finalLocation.substring(0, nonstandardDelimIndex);
108108
}
109109

110110
// if not get scheme, need consider /path/to/local to no scheme
111-
if (scheme.isEmpty()) {
112-
try {
113-
Paths.get(finalLocation);
114-
} catch (InvalidPathException exception) {
115-
throw new IllegalArgumentException("Fail to parse scheme, invalid location: " + finalLocation);
116-
}
111+
try {
112+
Paths.get(finalLocation);
113+
} catch (InvalidPathException exception) {
114+
throw new IllegalArgumentException("Fail to parse scheme, invalid location: " + finalLocation);
117115
}
118116

119-
return scheme;
117+
return "";
120118
}
121119

122120
/**
@@ -201,6 +199,72 @@ public static LocationPath of(String location,
201199
}
202200
}
203201

202+
/**
203+
* Ultra-fast factory method that directly constructs LocationPath without any parsing.
204+
* This is used when the normalized location is already known (e.g., from prefix transformation).
205+
*
206+
* @param normalizedLocation the already-normalized location string
207+
* @param schema pre-computed schema
208+
* @param fsIdentifier pre-computed filesystem identifier
209+
* @param storageProperties the storage properties (can be null)
210+
* @return a new LocationPath instance
211+
*/
212+
public static LocationPath ofDirect(String normalizedLocation,
213+
String schema,
214+
String fsIdentifier,
215+
StorageProperties storageProperties) {
216+
return new LocationPath(schema, normalizedLocation, fsIdentifier, storageProperties);
217+
}
218+
219+
/**
220+
* Fast factory method that reuses pre-computed schema and fsIdentifier.
221+
* This is optimized for batch processing where many files share the same bucket/prefix.
222+
*
223+
* @param location the input URI location string
224+
* @param storageProperties pre-computed storage properties for normalization
225+
* @param cachedSchema pre-computed schema (can be null to compute)
226+
* @param cachedFsIdPrefix pre-computed fsIdentifier prefix like "s3://" (can be null to compute)
227+
* @return a new LocationPath instance
228+
*/
229+
public static LocationPath ofWithCache(String location,
230+
StorageProperties storageProperties,
231+
String cachedSchema,
232+
String cachedFsIdPrefix) {
233+
try {
234+
String normalizedLocation = storageProperties.validateAndNormalizeUri(location);
235+
236+
String fsIdentifier;
237+
if (cachedFsIdPrefix != null && normalizedLocation.startsWith(cachedFsIdPrefix)) {
238+
// Fast path: extract authority from normalized location without full URI parsing
239+
int authorityStart = cachedFsIdPrefix.length();
240+
int authorityEnd = normalizedLocation.indexOf('/', authorityStart);
241+
if (authorityEnd == -1) {
242+
authorityEnd = normalizedLocation.length();
243+
}
244+
String authority = normalizedLocation.substring(authorityStart, authorityEnd);
245+
if (authority.isEmpty()) {
246+
throw new StoragePropertiesException("Invalid location, missing authority: " + normalizedLocation);
247+
}
248+
fsIdentifier = cachedFsIdPrefix + authority;
249+
} else {
250+
// Fallback to full URI parsing
251+
String encodedLocation = encodedLocation(normalizedLocation);
252+
URI uri = URI.create(encodedLocation);
253+
String authority = uri.getAuthority();
254+
if (Strings.isNullOrEmpty(authority)) {
255+
throw new StoragePropertiesException("Invalid location, missing authority: " + normalizedLocation);
256+
}
257+
fsIdentifier = Strings.nullToEmpty(uri.getScheme()) + "://"
258+
+ authority;
259+
}
260+
261+
String schema = cachedSchema != null ? cachedSchema : extractScheme(location);
262+
return new LocationPath(schema, normalizedLocation, fsIdentifier, storageProperties);
263+
} catch (UserException e) {
264+
throw new StoragePropertiesException("Failed to create LocationPath for location: " + location, e);
265+
}
266+
}
267+
204268
/**
205269
* Extracts the URI scheme (e.g., "s3", "hdfs") from the location string.
206270
*

fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,18 @@ public class IcebergScanNode extends FileQueryScanNode {
135135
private long manifestCacheMisses;
136136
private long manifestCacheFailures;
137137

138+
// Cached values for LocationPath creation optimization
139+
// These are lazily initialized on first use to avoid parsing overhead for each file
140+
private boolean locationPathCacheInitialized = false;
141+
private StorageProperties cachedStorageProperties;
142+
private String cachedSchema;
143+
private String cachedFsIdPrefix;
144+
// Cache for path prefix transformation to avoid repeated S3URI parsing
145+
// Maps original path prefix (e.g., "https://bucket.s3.amazonaws.com/") to normalized prefix (e.g., "s3://bucket/")
146+
private String cachedOriginalPathPrefix;
147+
private String cachedNormalizedPathPrefix;
148+
private String cachedFsIdentifier;
149+
138150
// for test
139151
@VisibleForTesting
140152
public IcebergScanNode(PlanNodeId id, TupleDescriptor desc, SessionVariable sv) {
@@ -511,9 +523,74 @@ private CloseableIterable<FileScanTask> planFileScanTaskWithManifestCache(TableS
511523
return TableScanUtil.splitFiles(CloseableIterable.withNoopClose(tasks), targetSplitSize);
512524
}
513525

526+
/**
527+
* Initialize cached values for LocationPath creation on first use.
528+
* This avoids repeated StorageProperties lookup, scheme parsing, and S3URI regex parsing for each file.
529+
*/
530+
private void initLocationPathCache(String samplePath) {
531+
try {
532+
// Create a LocationPath using the full method to get all cached values
533+
LocationPath sampleLocationPath = LocationPath.of(samplePath, storagePropertiesMap);
534+
cachedStorageProperties = sampleLocationPath.getStorageProperties();
535+
cachedSchema = sampleLocationPath.getSchema();
536+
cachedFsIdentifier = sampleLocationPath.getFsIdentifier();
537+
538+
// Extract fsIdPrefix like "s3://" from fsIdentifier like "s3://bucket"
539+
int schemeEnd = cachedFsIdentifier.indexOf("://");
540+
if (schemeEnd > 0) {
541+
cachedFsIdPrefix = cachedFsIdentifier.substring(0, schemeEnd + 3);
542+
}
543+
544+
// Cache path prefix mapping for fast transformation
545+
// This allows subsequent files to skip S3URI regex parsing entirely
546+
String normalizedPath = sampleLocationPath.getNormalizedLocation();
547+
548+
// Find the common prefix by looking for the last '/' before the filename
549+
int lastSlashInOriginal = samplePath.lastIndexOf('/');
550+
int lastSlashInNormalized = normalizedPath.lastIndexOf('/');
551+
552+
if (lastSlashInOriginal > 0 && lastSlashInNormalized > 0) {
553+
cachedOriginalPathPrefix = samplePath.substring(0, lastSlashInOriginal + 1);
554+
cachedNormalizedPathPrefix = normalizedPath.substring(0, lastSlashInNormalized + 1);
555+
}
556+
557+
locationPathCacheInitialized = true;
558+
} catch (Exception e) {
559+
// If caching fails, try to initialize again on next use
560+
LOG.warn("Failed to initialize LocationPath cache, will use full parsing", e);
561+
}
562+
}
563+
564+
/**
565+
* Create a LocationPath with cached values for better performance.
566+
* Uses cached path prefix mapping to completely bypass S3URI regex parsing for most files.
567+
* Falls back to full parsing if cache is not available or path doesn't match cached prefix.
568+
*/
569+
private LocationPath createLocationPathWithCache(String path) {
570+
// Initialize cache on first call
571+
if (!locationPathCacheInitialized) {
572+
initLocationPathCache(path);
573+
}
574+
575+
// Fast path: if path starts with cached original prefix, directly transform without any parsing
576+
if (cachedOriginalPathPrefix != null && path.startsWith(cachedOriginalPathPrefix)) {
577+
// Transform: replace original prefix with normalized prefix
578+
String normalizedPath = cachedNormalizedPathPrefix + path.substring(cachedOriginalPathPrefix.length());
579+
return LocationPath.ofDirect(normalizedPath, cachedSchema, cachedFsIdentifier, cachedStorageProperties);
580+
}
581+
582+
// Medium path: use cached StorageProperties but still need validateAndNormalizeUri
583+
if (cachedStorageProperties != null) {
584+
return LocationPath.ofWithCache(path, cachedStorageProperties, cachedSchema, cachedFsIdPrefix);
585+
}
586+
587+
// Fallback to full parsing
588+
return LocationPath.of(path, storagePropertiesMap);
589+
}
590+
514591
private Split createIcebergSplit(FileScanTask fileScanTask) {
515592
String originalPath = fileScanTask.file().path().toString();
516-
LocationPath locationPath = LocationPath.of(originalPath, storagePropertiesMap);
593+
LocationPath locationPath = createLocationPathWithCache(originalPath);
517594
IcebergSplit split = new IcebergSplit(
518595
locationPath,
519596
fileScanTask.start(),

fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,15 @@
3333
public class S3PropertyUtils {
3434
private static final Logger LOG = LogManager.getLogger(S3PropertyUtils.class);
3535

36+
private static final String SCHEME_DELIM = "://";
37+
private static final String S3_SCHEME_PREFIX = "s3://";
38+
39+
// S3-compatible schemes that can be converted to s3:// with simple string replacement
40+
// Format: scheme://bucket/key -> s3://bucket/key
41+
private static final String[] SIMPLE_S3_COMPATIBLE_SCHEMES = {
42+
"s3a", "s3n", "oss", "cos", "cosn", "obs", "bos", "gs"
43+
};
44+
3645
/**
3746
* Constructs the S3 endpoint from a given URI in the props map.
3847
*
@@ -113,7 +122,8 @@ public static String constructRegionFromUrl(Map<String, String> props,
113122

114123
/**
115124
* Validates and normalizes the given path into a standard S3 URI.
116-
* If the input already starts with "s3://", it is returned as-is.
125+
* If the input already starts with a known S3-compatible scheme (s3://, s3a://, oss://, etc.),
126+
* it is returned as-is to avoid expensive regex parsing.
117127
* Otherwise, it is parsed and converted into an S3-compatible URI format.
118128
*
119129
* @param path the raw S3-style path or full URI
@@ -132,16 +142,54 @@ public static String validateAndNormalizeUri(String path,
132142
if (StringUtils.isBlank(path)) {
133143
throw new StoragePropertiesException("path is null");
134144
}
135-
if (path.startsWith("s3://")) {
145+
146+
// Fast path 1: s3:// paths are already in the normalized format expected by BE
147+
if (path.startsWith(S3_SCHEME_PREFIX)) {
136148
return path;
137149
}
138150

151+
// Fast path 2: simple S3-compatible schemes (oss://, cos://, s3a://, etc.)
152+
// can be converted with simple string replacement: scheme://bucket/key -> s3://bucket/key
153+
String normalized = trySimpleSchemeConversion(path);
154+
if (normalized != null) {
155+
return normalized;
156+
}
157+
158+
// Full parsing path: for HTTP URLs and other complex formats
139159
boolean usePathStyle = Boolean.parseBoolean(stringUsePathStyle);
140160
boolean forceParsingByStandardUri = Boolean.parseBoolean(stringForceParsingByStandardUri);
141161
S3URI s3uri = S3URI.create(path, usePathStyle, forceParsingByStandardUri);
142162
return "s3" + S3URI.SCHEME_DELIM + s3uri.getBucket() + S3URI.PATH_DELIM + s3uri.getKey();
143163
}
144164

165+
/**
166+
* Try to convert simple S3-compatible scheme URIs to s3:// format using string replacement.
167+
* This avoids expensive regex parsing for common cases like oss://bucket/key, s3a://bucket/key, etc.
168+
*
169+
* @param path the input path
170+
* @return converted s3:// path if successful, null if the path doesn't match simple pattern
171+
*/
172+
private static String trySimpleSchemeConversion(String path) {
173+
int delimIndex = path.indexOf(SCHEME_DELIM);
174+
if (delimIndex <= 0) {
175+
return null;
176+
}
177+
178+
String scheme = path.substring(0, delimIndex).toLowerCase();
179+
for (String compatibleScheme : SIMPLE_S3_COMPATIBLE_SCHEMES) {
180+
if (compatibleScheme.equals(scheme)) {
181+
String rest = path.substring(delimIndex + SCHEME_DELIM.length());
182+
if (rest.isEmpty() || rest.startsWith(S3URI.PATH_DELIM) || rest.contains(SCHEME_DELIM)) {
183+
return null;
184+
}
185+
// Simple conversion: replace scheme with "s3"
186+
// e.g., "oss://bucket/key" -> "s3://bucket/key"
187+
return S3_SCHEME_PREFIX + rest;
188+
}
189+
}
190+
return null;
191+
}
192+
145193
/**
146194
* Extracts and returns the raw URI string from the given props map.
147195
*

fe/fe-core/src/test/java/org/apache/doris/common/util/LocationPathTest.java

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import org.apache.doris.common.UserException;
2121
import org.apache.doris.datasource.property.storage.StorageProperties;
22+
import org.apache.doris.datasource.property.storage.exception.StoragePropertiesException;
2223
import org.apache.doris.fs.FileSystemType;
2324
import org.apache.doris.thrift.TFileType;
2425

@@ -300,4 +301,43 @@ public void testOnelakeStorageLocationConvert() {
300301

301302
}
302303

304+
@Test
305+
public void testLocationPathDirect() {
306+
StorageProperties storageProperties = STORAGE_PROPERTIES_MAP.get(StorageProperties.Type.S3);
307+
LocationPath locationPath = LocationPath.ofDirect("s3://bucket/key", "s3", "s3://bucket", storageProperties);
308+
Assertions.assertEquals("s3://bucket/key", locationPath.getNormalizedLocation());
309+
Assertions.assertEquals("s3", locationPath.getSchema());
310+
Assertions.assertEquals("s3://bucket", locationPath.getFsIdentifier());
311+
Assertions.assertEquals(storageProperties, locationPath.getStorageProperties());
312+
}
313+
314+
@Test
315+
public void testLocationPathWithCacheFastPath() {
316+
StorageProperties storageProperties = STORAGE_PROPERTIES_MAP.get(StorageProperties.Type.S3);
317+
String location = "s3://bucket/path/to/file";
318+
LocationPath cached = LocationPath.ofWithCache(location, storageProperties, "s3", "s3://");
319+
LocationPath full = LocationPath.of(location, STORAGE_PROPERTIES_MAP);
320+
Assertions.assertEquals(full.getNormalizedLocation(), cached.getNormalizedLocation());
321+
Assertions.assertEquals(full.getFsIdentifier(), cached.getFsIdentifier());
322+
Assertions.assertEquals(full.getSchema(), cached.getSchema());
323+
}
324+
325+
@Test
326+
public void testLocationPathWithCacheFallback() {
327+
StorageProperties storageProperties = STORAGE_PROPERTIES_MAP.get(StorageProperties.Type.S3);
328+
String location = "s3://bucket/path/to/file";
329+
LocationPath cached = LocationPath.ofWithCache(location, storageProperties, "s3", null);
330+
LocationPath full = LocationPath.of(location, STORAGE_PROPERTIES_MAP);
331+
Assertions.assertEquals(full.getNormalizedLocation(), cached.getNormalizedLocation());
332+
Assertions.assertEquals(full.getFsIdentifier(), cached.getFsIdentifier());
333+
Assertions.assertEquals(full.getSchema(), cached.getSchema());
334+
}
335+
336+
@Test
337+
public void testLocationPathWithCacheMissingAuthority() {
338+
StorageProperties storageProperties = STORAGE_PROPERTIES_MAP.get(StorageProperties.Type.S3);
339+
Assertions.assertThrows(StoragePropertiesException.class,
340+
() -> LocationPath.ofWithCache("s3:///path", storageProperties, "s3", "s3://"));
341+
}
342+
303343
}

fe/fe-core/src/test/java/org/apache/doris/datasource/property/storage/S3ConnectorPropertiesUtilsTest.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,4 +108,16 @@ void testConvertToS3Address_invalid() {
108108
Assertions.assertThrows(StoragePropertiesException.class, () -> S3PropertyUtils.validateAndNormalizeUri("", "false", "false"));
109109
Assertions.assertThrows(UserException.class, () -> S3PropertyUtils.validateAndNormalizeUri("not a uri", "true", "true"));
110110
}
111+
112+
@Test
113+
void testSimpleSchemeConversion() throws UserException {
114+
String[] schemes = {"s3a", "s3n", "oss", "cos", "cosn", "obs", "bos", "gs"};
115+
for (String scheme : schemes) {
116+
String input = scheme + "://bucket/key";
117+
Assertions.assertEquals("s3://bucket/key",
118+
S3PropertyUtils.validateAndNormalizeUri(input, "false", "false"));
119+
}
120+
Assertions.assertEquals("s3://bucket/key",
121+
S3PropertyUtils.validateAndNormalizeUri("OSS://bucket/key", "false", "false"));
122+
}
111123
}

0 commit comments

Comments
 (0)