Skip to content

Commit e3aa550

Browse files
suxiaogang223morningman
authored andcommitted
Optimize location for tpch1000 (#59218)
1 parent 9b897fc commit e3aa550

File tree

3 files changed

+205
-19
lines changed

3 files changed

+205
-19
lines changed

fe/fe-core/src/main/java/org/apache/doris/common/util/LocationPath.java

Lines changed: 73 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -96,27 +96,25 @@ private LocationPath(String schema,
9696
}
9797

9898
private static String parseScheme(String finalLocation) {
99-
String scheme = "";
100-
String[] schemeSplit = finalLocation.split(SCHEME_DELIM);
101-
if (schemeSplit.length > 1) {
102-
scheme = schemeSplit[0];
103-
} else {
104-
schemeSplit = finalLocation.split(NONSTANDARD_SCHEME_DELIM);
105-
if (schemeSplit.length > 1) {
106-
scheme = schemeSplit[0];
107-
}
99+
// Use indexOf instead of split for better performance
100+
int schemeDelimIndex = finalLocation.indexOf(SCHEME_DELIM);
101+
if (schemeDelimIndex > 0) {
102+
return finalLocation.substring(0, schemeDelimIndex);
103+
}
104+
105+
int nonstandardDelimIndex = finalLocation.indexOf(NONSTANDARD_SCHEME_DELIM);
106+
if (nonstandardDelimIndex > 0) {
107+
return finalLocation.substring(0, nonstandardDelimIndex);
108108
}
109109

110110
// No scheme delimiter found: a valid path such as /path/to/local is treated as having no scheme
111-
if (scheme.isEmpty()) {
112-
try {
113-
Paths.get(finalLocation);
114-
} catch (InvalidPathException exception) {
115-
throw new IllegalArgumentException("Fail to parse scheme, invalid location: " + finalLocation);
116-
}
111+
try {
112+
Paths.get(finalLocation);
113+
} catch (InvalidPathException exception) {
114+
throw new IllegalArgumentException("Fail to parse scheme, invalid location: " + finalLocation);
117115
}
118116

119-
return scheme;
117+
return "";
120118
}
121119

122120
/**
@@ -201,6 +199,65 @@ public static LocationPath of(String location,
201199
}
202200
}
203201

202+
/**
203+
* Ultra-fast factory method that directly constructs LocationPath without any parsing.
204+
* This is used when the normalized location is already known (e.g., from prefix transformation).
205+
*
206+
* @param normalizedLocation the already-normalized location string
207+
* @param schema pre-computed schema
208+
* @param fsIdentifier pre-computed filesystem identifier
209+
* @param storageProperties the storage properties (can be null)
210+
* @return a new LocationPath instance
211+
*/
212+
public static LocationPath ofDirect(String normalizedLocation,
213+
String schema,
214+
String fsIdentifier,
215+
StorageProperties storageProperties) {
216+
return new LocationPath(schema, normalizedLocation, fsIdentifier, storageProperties);
217+
}
218+
219+
/**
220+
* Fast factory method that reuses pre-computed schema and fsIdentifier.
221+
* This is optimized for batch processing where many files share the same bucket/prefix.
222+
*
223+
* @param location the input URI location string
224+
* @param storageProperties pre-computed storage properties for normalization
225+
* @param cachedSchema pre-computed schema (can be null to compute)
226+
* @param cachedFsIdPrefix pre-computed fsIdentifier prefix like "s3://" (can be null to compute)
227+
* @return a new LocationPath instance
228+
*/
229+
public static LocationPath ofWithCache(String location,
230+
StorageProperties storageProperties,
231+
String cachedSchema,
232+
String cachedFsIdPrefix) {
233+
try {
234+
String normalizedLocation = storageProperties.validateAndNormalizeUri(location);
235+
236+
String fsIdentifier;
237+
if (cachedFsIdPrefix != null && normalizedLocation.startsWith(cachedFsIdPrefix)) {
238+
// Fast path: extract authority from normalized location without full URI parsing
239+
int authorityStart = cachedFsIdPrefix.length();
240+
int authorityEnd = normalizedLocation.indexOf('/', authorityStart);
241+
if (authorityEnd == -1) {
242+
authorityEnd = normalizedLocation.length();
243+
}
244+
String authority = normalizedLocation.substring(authorityStart, authorityEnd);
245+
fsIdentifier = cachedFsIdPrefix + authority;
246+
} else {
247+
// Fallback to full URI parsing
248+
String encodedLocation = encodedLocation(normalizedLocation);
249+
URI uri = URI.create(encodedLocation);
250+
fsIdentifier = Strings.nullToEmpty(uri.getScheme()) + "://"
251+
+ Strings.nullToEmpty(uri.getAuthority());
252+
}
253+
254+
String schema = cachedSchema != null ? cachedSchema : extractScheme(location);
255+
return new LocationPath(schema, normalizedLocation, fsIdentifier, storageProperties);
256+
} catch (UserException e) {
257+
throw new StoragePropertiesException("Failed to create LocationPath for location: " + location, e);
258+
}
259+
}
260+
204261
/**
205262
* Extracts the URI scheme (e.g., "s3", "hdfs") from the location string.
206263
*

fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java

Lines changed: 86 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,17 @@ public class IcebergScanNode extends FileQueryScanNode {
133133
private Map<String, String> backendStorageProperties;
134134

135135
private Boolean isBatchMode = null;
136+
// Cached values for LocationPath creation optimization
137+
// These are lazily initialized from the first file path seen (see initLocationPathCache) to avoid parsing overhead for each file
138+
private volatile StorageProperties cachedStorageProperties;
139+
private volatile String cachedSchema;
140+
private volatile String cachedFsIdPrefix; // cachedFsIdentifier up to and including "://", e.g. "s3://"
141+
private volatile boolean locationPathCacheInitialized = false; // true once initialization was attempted, even if it failed
142+
// Cache for path prefix transformation to avoid repeated S3URI parsing
143+
// Maps original path prefix (e.g., "https://bucket.s3.amazonaws.com/") to normalized prefix (e.g., "s3://bucket/"); each prefix ends at the last '/' of the sample path
144+
private volatile String cachedOriginalPathPrefix;
145+
private volatile String cachedNormalizedPathPrefix;
146+
private volatile String cachedFsIdentifier; // fsIdentifier of the sample path, e.g. "s3://bucket"
136147

137148
// for test
138149
@VisibleForTesting
@@ -547,9 +558,83 @@ private CloseableIterable<FileScanTask> planFileScanTaskWithManifestCache(TableS
547558
return TableScanUtil.splitFiles(CloseableIterable.withNoopClose(tasks), targetSplitSize);
548559
}
549560

561+
/**
562+
* Initialize cached values for LocationPath creation on first use.
563+
* This avoids repeated StorageProperties lookup, scheme parsing, and S3URI regex parsing for each file.
564+
*/
565+
private void initLocationPathCache(String samplePath) {
566+
if (locationPathCacheInitialized) {
567+
return;
568+
}
569+
synchronized (this) {
570+
if (locationPathCacheInitialized) {
571+
return;
572+
}
573+
try {
574+
// Create a LocationPath using the full method to get all cached values
575+
LocationPath sampleLocationPath = LocationPath.of(samplePath, storagePropertiesMap);
576+
cachedStorageProperties = sampleLocationPath.getStorageProperties();
577+
cachedSchema = sampleLocationPath.getSchema();
578+
cachedFsIdentifier = sampleLocationPath.getFsIdentifier();
579+
580+
// Extract fsIdPrefix like "s3://" from fsIdentifier like "s3://bucket"
581+
int schemeEnd = cachedFsIdentifier.indexOf("://");
582+
if (schemeEnd > 0) {
583+
cachedFsIdPrefix = cachedFsIdentifier.substring(0, schemeEnd + 3);
584+
}
585+
586+
// Cache path prefix mapping for fast transformation
587+
// This allows subsequent files to skip S3URI regex parsing entirely
588+
String normalizedPath = sampleLocationPath.getNormalizedLocation();
589+
590+
// Find the common prefix by looking for the last '/' before the filename
591+
int lastSlashInOriginal = samplePath.lastIndexOf('/');
592+
int lastSlashInNormalized = normalizedPath.lastIndexOf('/');
593+
594+
if (lastSlashInOriginal > 0 && lastSlashInNormalized > 0) {
595+
cachedOriginalPathPrefix = samplePath.substring(0, lastSlashInOriginal + 1);
596+
cachedNormalizedPathPrefix = normalizedPath.substring(0, lastSlashInNormalized + 1);
597+
}
598+
599+
locationPathCacheInitialized = true;
600+
} catch (Exception e) {
601+
// If caching fails, we'll fall back to the full method each time
602+
LOG.warn("Failed to initialize LocationPath cache, will use full parsing", e);
603+
locationPathCacheInitialized = true;
604+
}
605+
}
606+
}
607+
608+
/**
609+
* Create a LocationPath with cached values for better performance.
610+
* Uses cached path prefix mapping to completely bypass S3URI regex parsing for most files.
611+
* Falls back to full parsing if cache is not available or path doesn't match cached prefix.
612+
*/
613+
private LocationPath createLocationPathWithCache(String path) {
614+
// Initialize cache on first call
615+
if (!locationPathCacheInitialized) {
616+
initLocationPathCache(path);
617+
}
618+
619+
// Fast path: if path starts with cached original prefix, directly transform without any parsing
620+
if (cachedOriginalPathPrefix != null && path.startsWith(cachedOriginalPathPrefix)) {
621+
// Transform: replace original prefix with normalized prefix
622+
String normalizedPath = cachedNormalizedPathPrefix + path.substring(cachedOriginalPathPrefix.length());
623+
return LocationPath.ofDirect(normalizedPath, cachedSchema, cachedFsIdentifier, cachedStorageProperties);
624+
}
625+
626+
// Medium path: use cached StorageProperties but still need validateAndNormalizeUri
627+
if (cachedStorageProperties != null) {
628+
return LocationPath.ofWithCache(path, cachedStorageProperties, cachedSchema, cachedFsIdPrefix);
629+
}
630+
631+
// Fallback to full parsing
632+
return LocationPath.of(path, storagePropertiesMap);
633+
}
634+
550635
private Split createIcebergSplit(FileScanTask fileScanTask) {
551636
String originalPath = fileScanTask.file().path().toString();
552-
LocationPath locationPath = LocationPath.of(originalPath, storagePropertiesMap);
637+
LocationPath locationPath = createLocationPathWithCache(originalPath);
553638
IcebergSplit split = new IcebergSplit(
554639
locationPath,
555640
fileScanTask.start(),

fe/fe-core/src/main/java/org/apache/doris/datasource/property/storage/S3PropertyUtils.java

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,15 @@
3333
public class S3PropertyUtils {
3434
private static final Logger LOG = LogManager.getLogger(S3PropertyUtils.class);
3535

36+
private static final String SCHEME_DELIM = "://"; // separates the scheme from the rest of a URI
37+
private static final String S3_SCHEME_PREFIX = "s3://"; // start of the normalized form, e.g. s3://bucket/key
38+
39+
// S3-compatible schemes that can be converted to s3:// with simple string replacement
40+
// Format: scheme://bucket/key -> s3://bucket/key (entries are lower case; the input scheme is lower-cased before comparison)
41+
private static final String[] SIMPLE_S3_COMPATIBLE_SCHEMES = {
42+
"s3a", "s3n", "oss", "cos", "cosn", "obs", "bos", "gs"
43+
};
44+
3645
/**
3746
* Constructs the S3 endpoint from a given URI in the props map.
3847
*
@@ -113,7 +122,8 @@ public static String constructRegionFromUrl(Map<String, String> props,
113122

114123
/**
115124
* Validates and normalizes the given path into a standard S3 URI.
116-
* If the input already starts with "s3://", it is returned as-is.
125+
* If the input already starts with "s3://", it is returned as-is. Other simple S3-compatible schemes
126+
* (s3a://, oss://, etc.) are rewritten to "s3://" by plain string replacement, avoiding expensive regex parsing.
117127
* Otherwise, it is parsed and converted into an S3-compatible URI format.
118128
*
119129
* @param path the raw S3-style path or full URI
@@ -132,16 +142,50 @@ public static String validateAndNormalizeUri(String path,
132142
if (StringUtils.isBlank(path)) {
133143
throw new StoragePropertiesException("path is null");
134144
}
135-
if (path.startsWith("s3://")) {
145+
146+
// Fast path 1: s3:// paths are already in the normalized format expected by BE
147+
if (path.startsWith(S3_SCHEME_PREFIX)) {
136148
return path;
137149
}
138150

151+
// Fast path 2: simple S3-compatible schemes (oss://, cos://, s3a://, etc.)
152+
// can be converted with simple string replacement: scheme://bucket/key -> s3://bucket/key
153+
String normalized = trySimpleSchemeConversion(path);
154+
if (normalized != null) {
155+
return normalized;
156+
}
157+
158+
// Full parsing path: for HTTP URLs and other complex formats
139159
boolean usePathStyle = Boolean.parseBoolean(stringUsePathStyle);
140160
boolean forceParsingByStandardUri = Boolean.parseBoolean(stringForceParsingByStandardUri);
141161
S3URI s3uri = S3URI.create(path, usePathStyle, forceParsingByStandardUri);
142162
return "s3" + S3URI.SCHEME_DELIM + s3uri.getBucket() + S3URI.PATH_DELIM + s3uri.getKey();
143163
}
144164

165+
/**
166+
* Try to convert simple S3-compatible scheme URIs to s3:// format using string replacement.
167+
* This avoids expensive regex parsing for common cases like oss://bucket/key, s3a://bucket/key, etc.
168+
*
169+
* @param path the input path
170+
* @return converted s3:// path if successful, null if the path doesn't match simple pattern
171+
*/
172+
private static String trySimpleSchemeConversion(String path) {
173+
int delimIndex = path.indexOf(SCHEME_DELIM);
174+
if (delimIndex <= 0) {
175+
return null;
176+
}
177+
178+
String scheme = path.substring(0, delimIndex).toLowerCase();
179+
for (String compatibleScheme : SIMPLE_S3_COMPATIBLE_SCHEMES) {
180+
if (compatibleScheme.equals(scheme)) {
181+
// Simple conversion: replace scheme with "s3"
182+
// e.g., "oss://bucket/key" -> "s3://bucket/key"
183+
return S3_SCHEME_PREFIX + path.substring(delimIndex + SCHEME_DELIM.length());
184+
}
185+
}
186+
return null;
187+
}
188+
145189
/**
146190
* Extracts and returns the raw URI string from the given props map.
147191
*

0 commit comments

Comments
 (0)