Skip to content

Commit e4f8546

Browse files
committed
Handle external file groups in ExternalFilePathUtil
1 parent cf8e6e4 commit e4f8546

File tree

3 files changed

+117
-7
lines changed

3 files changed

+117
-7
lines changed

hudi-common/src/main/java/org/apache/hudi/common/model/HoodieBaseFile.java

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,28 @@ private static String[] handleHudiGeneratedFile(String fileName) {
119119

120120
private static String[] handleExternallyGeneratedFile(String fileName) {
121121
String[] values = new String[2];
122-
// file name has format <originalFileName>_<commitTime>_hudiext and originalFileName is used as fileId
123-
int lastUnderscore = fileName.lastIndexOf(UNDERSCORE);
124-
int secondToLastUnderscore = fileName.lastIndexOf(UNDERSCORE, lastUnderscore - 1);
125-
values[0] = fileName.substring(0, secondToLastUnderscore);
126-
values[1] = fileName.substring(secondToLastUnderscore + 1, lastUnderscore);
122+
123+
if (ExternalFilePathUtil.hasExternalFileGroupPrefix(fileName)) {
124+
// New format: <originalName>_<commitTime>_fg%3D<prefix>_hudiext
125+
Option<String> prefixOpt = ExternalFilePathUtil.getExternalFileGroupPrefix(fileName);
126+
String prefix = prefixOpt.get();
127+
128+
int prefixMarkerStart = fileName.indexOf("_fg%3D");
129+
String beforePrefix = fileName.substring(0, prefixMarkerStart);
130+
int commitTimeStart = beforePrefix.lastIndexOf(UNDERSCORE);
131+
132+
String originalName = beforePrefix.substring(0, commitTimeStart);
133+
String commitTime = beforePrefix.substring(commitTimeStart + 1);
134+
135+
values[0] = prefix + "/" + originalName; // fileId includes prefix
136+
values[1] = commitTime;
137+
} else {
138+
// Legacy format: <originalFileName>_<commitTime>_hudiext and originalFileName is used as fileId
139+
int lastUnderscore = fileName.lastIndexOf(UNDERSCORE);
140+
int secondToLastUnderscore = fileName.lastIndexOf(UNDERSCORE, lastUnderscore - 1);
141+
values[0] = fileName.substring(0, secondToLastUnderscore);
142+
values[1] = fileName.substring(secondToLastUnderscore + 1, lastUnderscore);
143+
}
127144
return values;
128145
}
129146

@@ -141,8 +158,14 @@ private static StoragePathInfo maybeHandleExternallyGeneratedFileName(StoragePat
141158
return null;
142159
}
143160
if (ExternalFilePathUtil.isExternallyCreatedFile(pathInfo.getPath().getName())) {
144-
// fileId is the same as the original file name for externally created files
145161
StoragePath parent = pathInfo.getPath().getParent();
162+
163+
// For files with file group prefix in fileId, go up one more level
164+
// because the prefix represents a subdirectory within the partition
165+
if (ExternalFilePathUtil.hasExternalFileGroupPrefix(pathInfo.getPath().getName())) {
166+
parent = parent.getParent();
167+
}
168+
146169
return new StoragePathInfo(
147170
new StoragePath(parent, fileId), pathInfo.getLength(), pathInfo.isDirectory(),
148171
pathInfo.getBlockReplication(), pathInfo.getBlockSize(), pathInfo.getModificationTime(), pathInfo.getLocations());

hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.apache.hudi.common.table.timeline.HoodieTimeline;
3939
import org.apache.hudi.common.util.ClusteringUtils;
4040
import org.apache.hudi.common.util.CompactionUtils;
41+
import org.apache.hudi.common.util.ExternalFilePathUtil;
4142
import org.apache.hudi.common.util.HoodieTimer;
4243
import org.apache.hudi.common.util.Option;
4344
import org.apache.hudi.common.util.SamplingLogger;
@@ -174,7 +175,16 @@ public Option<String> getCompletionTime(String instantTime) {
174175
*/
175176
public List<HoodieFileGroup> addFilesToView(List<StoragePathInfo> statuses) {
176177
Map<String, List<StoragePathInfo>> statusesByPartitionPath = statuses.stream()
177-
.collect(Collectors.groupingBy(fileStatus -> FSUtils.getRelativePartitionPath(metaClient.getBasePath(), fileStatus.getPath().getParent())));
178+
.collect(Collectors.groupingBy(fileStatus -> {
179+
String fileName = fileStatus.getPath().getName();
180+
StoragePath parent = fileStatus.getPath().getParent();
181+
// For external files with file group prefix, adjust parent to skip the prefix path
182+
// because the prefix is part of the fileId, not the partition path
183+
if (ExternalFilePathUtil.hasExternalFileGroupPrefix(fileName)) {
184+
parent = parent.getParent();
185+
}
186+
return FSUtils.getRelativePartitionPath(metaClient.getBasePath(), parent);
187+
}));
178188
return statusesByPartitionPath.entrySet().stream().map(entry -> addFilesToView(entry.getKey(), entry.getValue()))
179189
.flatMap(List::stream).collect(Collectors.toList());
180190
}

hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
public class ExternalFilePathUtil {
2525
// Suffix acts as a marker when appended to a file path that the path was created by an external system and not a Hudi writer.
2626
private static final String EXTERNAL_FILE_SUFFIX = "_hudiext";
27+
// Marker for file group prefix in external file names. URL-encoded "_fg=" to avoid conflicts with file names.
28+
private static final String FILE_GROUP_PREFIX_MARKER = "_fg%3D";
2729

2830
/**
2931
* Appends the commit time and external file marker to the file path. Hudi relies on the commit time in the file name for properly generating views of the files in a table.
@@ -35,6 +37,23 @@ public static String appendCommitTimeAndExternalFileMarker(String filePath, Stri
3537
return filePath + "_" + commitTime + EXTERNAL_FILE_SUFFIX;
3638
}
3739

40+
/**
41+
* Appends the commit time, file group prefix, and external file marker to the file name.
42+
* Use this when the external file is located in a subdirectory within the partition (e.g., bucket-0/file.parquet).
43+
*
44+
* @param fileName The original file name (without any path prefix)
45+
* @param commitTime The time of the commit that added this file to the table
46+
* @param externalFileGroupPrefix The prefix path where the file is located (e.g., "bucket-0"). Can be null or empty.
47+
* @return The file name with commit time and markers appended
48+
*/
49+
public static String appendCommitTimeAndExternalFileMarker(String fileName, String commitTime, String externalFileGroupPrefix) {
50+
if (externalFileGroupPrefix == null || externalFileGroupPrefix.isEmpty()) {
51+
return appendCommitTimeAndExternalFileMarker(fileName, commitTime);
52+
}
53+
String encodedPrefix = PartitionPathEncodeUtils.escapePathName(externalFileGroupPrefix);
54+
return fileName + "_" + commitTime + FILE_GROUP_PREFIX_MARKER + encodedPrefix + EXTERNAL_FILE_SUFFIX;
55+
}
56+
3857
/**
3958
* Checks if the file name was created by an external system by checking for the external file marker at the end of the file name.
4059
* @param fileName The file name
@@ -43,4 +62,62 @@ public static String appendCommitTimeAndExternalFileMarker(String filePath, Stri
4362
public static boolean isExternallyCreatedFile(String fileName) {
4463
return fileName.endsWith(EXTERNAL_FILE_SUFFIX);
4564
}
65+
66+
/**
67+
* Checks if the external file name contains a file group prefix.
68+
* @param fileName The file name
69+
* @return True if the file has a file group prefix encoded in its name
70+
*/
71+
public static boolean hasExternalFileGroupPrefix(String fileName) {
72+
return isExternallyCreatedFile(fileName) && fileName.contains(FILE_GROUP_PREFIX_MARKER);
73+
}
74+
75+
/**
76+
* Extracts the file group prefix from an external file name.
77+
* @param fileName The external file name
78+
* @return Option containing the decoded file group prefix, or empty if not present
79+
*/
80+
public static Option<String> getExternalFileGroupPrefix(String fileName) {
81+
if (!hasExternalFileGroupPrefix(fileName)) {
82+
return Option.empty();
83+
}
84+
int start = fileName.indexOf(FILE_GROUP_PREFIX_MARKER) + FILE_GROUP_PREFIX_MARKER.length();
85+
int end = fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
86+
return Option.of(PartitionPathEncodeUtils.unescapePathName(fileName.substring(start, end)));
87+
}
88+
89+
/**
90+
* Gets the original file path including the file group prefix if present.
91+
* For example, "data.parquet_123_fg%3Dbucket-0_hudiext" returns "bucket-0/data.parquet"
92+
*
93+
* @param fileName The external file name
94+
* @return The original file path with prefix
95+
*/
96+
public static String getOriginalFilePath(String fileName) {
97+
if (!isExternallyCreatedFile(fileName)) {
98+
return fileName;
99+
}
100+
String originalName = getOriginalFileName(fileName);
101+
Option<String> prefix = getExternalFileGroupPrefix(fileName);
102+
return prefix.map(p -> p + "/" + originalName).orElse(originalName);
103+
}
104+
105+
/**
106+
* Extracts the original file name from an external file name (without commit time and markers).
107+
* For example, "data.parquet_123_hudiext" returns "data.parquet"
108+
* And "data.parquet_123_fg%3Dbucket-0_hudiext" also returns "data.parquet"
109+
*
110+
* @param fileName The external file name
111+
* @return The original file name
112+
*/
113+
public static String getOriginalFileName(String fileName) {
114+
if (!isExternallyCreatedFile(fileName)) {
115+
return fileName;
116+
}
117+
int markerEnd = hasExternalFileGroupPrefix(fileName)
118+
? fileName.indexOf(FILE_GROUP_PREFIX_MARKER)
119+
: fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
120+
int commitTimeStart = fileName.lastIndexOf('_', markerEnd - 1);
121+
return fileName.substring(0, commitTimeStart);
122+
}
46123
}

0 commit comments

Comments
 (0)