Skip to content

Commit 0eac555

Browse files
[HUDI-9267] Fix the file group reader log file read sequence (apache#13115)
* [HUDI-9267] Fix the file group reader log file read sequence Fix the file group reader log file sequence to be in ascending order, so as to keep the "processing_time" merging semantics for streaming scenarios: always choose the latest incoming record if the ordering values are equal. These semantics now work for both `COMMIT_TIME` and `EVENT_TIME` merging modes after the fix. Also fix some other issues: * the unnecessary copy of rows for position-based merging; * the event time merging sequence for the CUSTOM merger; * the HoodieEmptyRecord default ordering value; * the fallback strategy read for position-based merging --------- Co-authored-by: sivabalan <n.siva.b@gmail.com>
1 parent 9347544 commit 0eac555

File tree

7 files changed

+102
-46
lines changed

7 files changed

+102
-46
lines changed

hudi-common/src/main/java/org/apache/hudi/common/model/HoodieEmptyRecord.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,10 @@ public class HoodieEmptyRecord<T> extends HoodieRecord<T> {
3939
public HoodieEmptyRecord(HoodieKey key, HoodieRecordType type) {
4040
super(key, null);
4141
this.type = type;
42-
this.orderingVal = null;
42+
// IMPORTANT:
43+
// This should be kept in line with EmptyHoodieRecordPayload
44+
// default natural order
45+
this.orderingVal = 0;
4346
}
4447

4548
public HoodieEmptyRecord(HoodieKey key, HoodieOperation operation, Comparable<?> orderingVal, HoodieRecordType type) {

hudi-common/src/main/java/org/apache/hudi/common/table/log/BaseHoodieLogRecordReader.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,13 +223,13 @@ private void scanInternalV1(Option<KeySpec> keySpecOpt) {
223223
totalCorruptBlocks = new AtomicLong(0);
224224
totalLogBlocks = new AtomicLong(0);
225225
totalLogRecords = new AtomicLong(0);
226-
HoodieLogFormatReverseReader logFormatReaderWrapper = null;
226+
HoodieLogFormatReader logFormatReaderWrapper = null;
227227
HoodieTimeline commitsTimeline = this.hoodieTableMetaClient.getCommitsTimeline();
228228
HoodieTimeline completedInstantsTimeline = commitsTimeline.filterCompletedInstants();
229229
HoodieTimeline inflightInstantsTimeline = commitsTimeline.filterInflights();
230230
try {
231231
// Iterate over the paths
232-
logFormatReaderWrapper = new HoodieLogFormatReverseReader(storage,
232+
logFormatReaderWrapper = new HoodieLogFormatReader(storage,
233233
logFilePaths.stream().map(logFile -> new HoodieLogFile(new StoragePath(logFile))).collect(Collectors.toList()),
234234
readerSchema, reverseReader, bufferSize, shouldLookupRecords(), recordKeyField, internalSchema);
235235

hudi-common/src/main/java/org/apache/hudi/common/table/read/FileGroupRecordBuffer.java

Lines changed: 72 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -233,13 +233,14 @@ public void close() {
233233
/**
234234
* Merge two log data records if needed.
235235
*
236-
* @param record
237-
* @param metadata
238-
* @param existingRecordMetadataPair
239-
* @return
240-
* @throws IOException
236+
* @param newRecord The new incoming record
237+
* @param metadata The metadata
238+
* @param existingRecordMetadataPair The existing record metadata pair
239+
*
240+
* @return The pair of the record that needs to be updated with and its metadata,
241+
* returns empty to skip the update.
241242
*/
242-
protected Option<Pair<Option<T>, Map<String, Object>>> doProcessNextDataRecord(T record,
243+
protected Option<Pair<Option<T>, Map<String, Object>>> doProcessNextDataRecord(T newRecord,
243244
Map<String, Object> metadata,
244245
Pair<Option<T>, Map<String, Object>> existingRecordMetadataPair)
245246
throws IOException {
@@ -249,14 +250,12 @@ protected Option<Pair<Option<T>, Map<String, Object>>> doProcessNextDataRecord(T
249250
// TODO(HUDI-7843): decouple the merging logic from the merger
250251
// and use the record merge mode to control how to merge partial updates
251252
// Merge and store the combined record
252-
// Note that the incoming `record` is from an older commit, so it should be put as
253-
// the `older` in the merge API
254253
Option<Pair<HoodieRecord, Schema>> combinedRecordAndSchemaOpt = recordMerger.get().partialMerge(
255-
readerContext.constructHoodieRecord(Option.of(record), metadata),
256-
readerContext.getSchemaFromMetadata(metadata),
257254
readerContext.constructHoodieRecord(
258255
existingRecordMetadataPair.getLeft(), existingRecordMetadataPair.getRight()),
259256
readerContext.getSchemaFromMetadata(existingRecordMetadataPair.getRight()),
257+
readerContext.constructHoodieRecord(Option.of(newRecord), metadata),
258+
readerContext.getSchemaFromMetadata(metadata),
260259
readerSchema,
261260
props);
262261
if (!combinedRecordAndSchemaOpt.isPresent()) {
@@ -266,7 +265,7 @@ protected Option<Pair<Option<T>, Map<String, Object>>> doProcessNextDataRecord(T
266265
HoodieRecord<T> combinedRecord = combinedRecordAndSchema.getLeft();
267266

268267
// If pre-combine returns existing record, no need to update it
269-
if (combinedRecord.getData() != existingRecordMetadataPair.getLeft().get()) {
268+
if (combinedRecord.getData() != existingRecordMetadataPair.getLeft().orElse(null)) {
270269
return Option.of(Pair.of(
271270
Option.ofNullable(combinedRecord.getData()),
272271
readerContext.updateSchemaAndResetOrderingValInMetadata(metadata, combinedRecordAndSchema.getRight())));
@@ -275,43 +274,47 @@ protected Option<Pair<Option<T>, Map<String, Object>>> doProcessNextDataRecord(T
275274
} else {
276275
switch (recordMergeMode) {
277276
case COMMIT_TIME_ORDERING:
278-
return Option.empty();
277+
return Option.of(Pair.of(Option.ofNullable(newRecord), metadata));
279278
case EVENT_TIME_ORDERING:
280-
Comparable existingOrderingValue = readerContext.getOrderingValue(
281-
existingRecordMetadataPair.getLeft(), existingRecordMetadataPair.getRight(),
282-
readerSchema, orderingFieldName);
283-
if (isDeleteRecordWithNaturalOrder(existingRecordMetadataPair.getLeft(), existingOrderingValue)) {
284-
return Option.empty();
285-
}
286-
Comparable incomingOrderingValue = readerContext.getOrderingValue(
287-
Option.of(record), metadata, readerSchema, orderingFieldName);
288-
if (incomingOrderingValue.compareTo(existingOrderingValue) > 0) {
289-
return Option.of(Pair.of(Option.of(record), metadata));
279+
if (shouldKeepNewerRecord(existingRecordMetadataPair.getLeft(), existingRecordMetadataPair.getRight(), Option.ofNullable(newRecord), metadata)) {
280+
return Option.of(Pair.of(Option.of(newRecord), metadata));
290281
}
291282
return Option.empty();
292283
case CUSTOM:
293284
default:
294285
// Merge and store the combined record
295-
// Note that the incoming `record` is from an older commit, so it should be put as
296-
// the `older` in the merge API
297286
if (payloadClass.isPresent()) {
287+
if (existingRecordMetadataPair.getLeft().isEmpty()
288+
&& shouldKeepNewerRecord(existingRecordMetadataPair.getLeft(), existingRecordMetadataPair.getRight(), Option.ofNullable(newRecord), metadata)) {
289+
// IMPORTANT:
290+
// this is needed when the fallback HoodieAvroRecordMerger got used, the merger would
291+
// return Option.empty when the old payload data is empty(a delete) and ignores its ordering value directly.
292+
return Option.of(Pair.of(Option.of(newRecord), metadata));
293+
}
298294
Option<Pair<HoodieRecord, Schema>> combinedRecordAndSchemaOpt =
299-
getMergedRecord(Option.of(record), metadata, existingRecordMetadataPair.getLeft(), existingRecordMetadataPair.getRight());
295+
getMergedRecord(existingRecordMetadataPair.getLeft(), existingRecordMetadataPair.getRight(), Option.of(newRecord), metadata);
300296
if (combinedRecordAndSchemaOpt.isPresent()) {
301297
T combinedRecordData = readerContext.convertAvroRecord((IndexedRecord) combinedRecordAndSchemaOpt.get().getLeft().getData());
302298
// If pre-combine does not return existing record, update it
303-
if (combinedRecordData != existingRecordMetadataPair.getLeft().get()) {
299+
if (combinedRecordData != existingRecordMetadataPair.getLeft().orElse(null)) {
304300
return Option.of(Pair.of(Option.ofNullable(combinedRecordData), metadata));
305301
}
306302
}
307303
return Option.empty();
308304
} else {
305+
if (existingRecordMetadataPair.getLeft().isEmpty()
306+
&& shouldKeepNewerRecord(existingRecordMetadataPair.getLeft(), existingRecordMetadataPair.getRight(), Option.ofNullable(newRecord), metadata)) {
307+
// IMPORTANT:
308+
// this is needed when the fallback HoodieAvroRecordMerger got used, the merger would
309+
// return Option.empty when the old payload data is empty(a delete) and ignores its ordering value directly.
310+
return Option.of(Pair.of(Option.of(newRecord), metadata));
311+
}
309312
Option<Pair<HoodieRecord, Schema>> combinedRecordAndSchemaOpt = recordMerger.get().merge(
310-
readerContext.constructHoodieRecord(Option.of(record), metadata),
311-
readerContext.getSchemaFromMetadata(metadata),
312313
readerContext.constructHoodieRecord(
313314
existingRecordMetadataPair.getLeft(), existingRecordMetadataPair.getRight()),
314315
readerContext.getSchemaFromMetadata(existingRecordMetadataPair.getRight()),
316+
readerContext.constructHoodieRecord(Option.of(newRecord), metadata),
317+
readerContext.getSchemaFromMetadata(metadata),
315318
props);
316319

317320
if (!combinedRecordAndSchemaOpt.isPresent()) {
@@ -322,7 +325,7 @@ protected Option<Pair<Option<T>, Map<String, Object>>> doProcessNextDataRecord(T
322325
HoodieRecord<T> combinedRecord = combinedRecordAndSchema.getLeft();
323326

324327
// If pre-combine returns existing record, no need to update it
325-
if (combinedRecord.getData() != existingRecordMetadataPair.getLeft().get()) {
328+
if (combinedRecord.getData() != existingRecordMetadataPair.getLeft().orElse(null)) {
326329
return Option.of(Pair.of(Option.ofNullable(combinedRecord.getData()), metadata));
327330
}
328331
return Option.empty();
@@ -334,24 +337,25 @@ protected Option<Pair<Option<T>, Map<String, Object>>> doProcessNextDataRecord(T
334337
// NOTE: Record have to be cloned here to make sure if it holds low-level engine-specific
335338
// payload pointing into a shared, mutable (underlying) buffer we get a clean copy of
336339
// it since these records will be put into records(Map).
337-
return Option.of(Pair.of(Option.ofNullable(record), metadata));
340+
return Option.of(Pair.of(Option.ofNullable(newRecord), metadata));
338341
}
339342
}
340343

341344
/**
342345
* Merge a delete record with another record (data, or delete).
343346
*
344-
* @param deleteRecord
345-
* @param existingRecordMetadataPair
346-
* @return
347+
* @param deleteRecord The delete record
348+
* @param existingRecordMetadataPair The existing record metadata pair
349+
*
350+
* @return The option of new delete record that needs to be updated with.
347351
*/
348352
protected Option<DeleteRecord> doProcessNextDeletedRecord(DeleteRecord deleteRecord,
349353
Pair<Option<T>, Map<String, Object>> existingRecordMetadataPair) {
350354
totalLogRecords++;
351355
if (existingRecordMetadataPair != null) {
352356
switch (recordMergeMode) {
353357
case COMMIT_TIME_ORDERING:
354-
return Option.empty();
358+
return Option.of(deleteRecord);
355359
case EVENT_TIME_ORDERING:
356360
case CUSTOM:
357361
default:
@@ -473,6 +477,17 @@ protected Option<T> merge(Option<T> older, Map<String, Object> olderInfoMap,
473477
case CUSTOM:
474478
default:
475479
if (payloadClass.isPresent()) {
480+
if (older.isEmpty() || newer.isEmpty()) {
481+
if (shouldKeepNewerRecord(older, olderInfoMap, newer, newerInfoMap)) {
482+
// IMPORTANT:
483+
// this is needed when the fallback HoodieAvroRecordMerger got used, the merger would
484+
// return Option.empty when the new payload data is empty(a delete) and ignores its ordering value directly.
485+
return newer;
486+
} else {
487+
return older;
488+
}
489+
}
490+
476491
Option<Pair<HoodieRecord, Schema>> mergedRecord =
477492
getMergedRecord(older, olderInfoMap, newer, newerInfoMap);
478493
if (mergedRecord.isPresent()
@@ -487,6 +502,16 @@ protected Option<T> merge(Option<T> older, Map<String, Object> olderInfoMap,
487502
}
488503
return Option.empty();
489504
} else {
505+
if (older.isEmpty() || newer.isEmpty()) {
506+
if (shouldKeepNewerRecord(older, olderInfoMap, newer, newerInfoMap)) {
507+
// IMPORTANT:
508+
// this is needed when the fallback HoodieAvroRecordMerger got used, the merger would
509+
// return Option.empty when the new payload data is empty(a delete) and ignores its ordering value directly.
510+
return newer;
511+
} else {
512+
return older;
513+
}
514+
}
490515
Option<Pair<HoodieRecord, Schema>> mergedRecord = recordMerger.get().merge(
491516
readerContext.constructHoodieRecord(older, olderInfoMap), readerContext.getSchemaFromMetadata(olderInfoMap),
492517
readerContext.constructHoodieRecord(newer, newerInfoMap), readerContext.getSchemaFromMetadata(newerInfoMap), props);
@@ -504,6 +529,19 @@ protected Option<T> merge(Option<T> older, Map<String, Object> olderInfoMap,
504529
}
505530
}
506531

532+
/**
533+
* Decides whether to keep the incoming record with ordering value comparison.
534+
*/
535+
private boolean shouldKeepNewerRecord(Option<T> oldVal, Map<String, Object> oldMetadata, Option<T> newVal, Map<String, Object> newMetadata) {
536+
Comparable newOrderingVal = readerContext.getOrderingValue(newVal, newMetadata, readerSchema, orderingFieldName);
537+
if (isDeleteRecordWithNaturalOrder(newVal, newOrderingVal)) {
538+
// handle records coming from DELETE statements(the orderingVal is constant 0)
539+
return true;
540+
}
541+
Comparable oldOrderingVal = readerContext.getOrderingValue(oldVal, oldMetadata, readerSchema, orderingFieldName);
542+
return newOrderingVal.compareTo(oldOrderingVal) >= 0;
543+
}
544+
507545
private Option<Pair<HoodieRecord, Schema>> getMergedRecord(Option<T> older, Map<String, Object> olderInfoMap, Option<T> newer, Map<String, Object> newerInfoMap) throws IOException {
508546
ValidationUtils.checkArgument(!Objects.equals(payloadClass, OverwriteWithLatestAvroPayload.class.getCanonicalName())
509547
&& !Objects.equals(payloadClass, DefaultHoodieRecordPayload.class.getCanonicalName()));

hudi-common/src/main/java/org/apache/hudi/common/table/read/PositionBasedFileGroupRecordBuffer.java

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@
5454
import java.util.function.Function;
5555

5656
import static org.apache.hudi.common.engine.HoodieReaderContext.INTERNAL_META_RECORD_KEY;
57-
import static org.apache.hudi.common.model.HoodieRecord.DEFAULT_ORDERING_VALUE;
5857

5958
/**
6059
* A buffer that is used to store log records by {@link org.apache.hudi.common.table.log.HoodieMergedLogRecordReader}
@@ -184,10 +183,22 @@ public void processDeleteBlock(HoodieDeleteBlock deleteBlock) throws IOException
184183

185184
switch (recordMergeMode) {
186185
case COMMIT_TIME_ORDERING:
186+
int commitTimeBasedRecordIndex = 0;
187+
DeleteRecord[] deleteRecords = deleteBlock.getRecordsToDelete();
187188
for (Long recordPosition : recordPositions) {
188-
records.putIfAbsent(recordPosition,
189+
// IMPORTANT:
190+
// use #put for log files with regular order(see HoodieLogFile.LOG_FILE_COMPARATOR);
191+
// use #putIfAbsent for log files with reverse order(see HoodieLogFile.LOG_FILE_COMPARATOR_REVERSED),
192+
// the delete block would be parsed ahead of a data block if they are in different log files.
193+
194+
// set up the record key for key-based fallback handling, this is needed
195+
// because under hybrid strategy in #doHasNextFallbackBaseRecord, if the record keys are not set up,
196+
// this delete-vector could be kept in the records cache(see the check in #fallbackToKeyBasedBuffer),
197+
// and these keys would be deleted no matter whether there are following-up inserts/updates.
198+
DeleteRecord deleteRecord = deleteRecords[commitTimeBasedRecordIndex++];
199+
records.put(recordPosition,
189200
Pair.of(Option.empty(), readerContext.generateMetadataForRecord(
190-
null, "", DEFAULT_ORDERING_VALUE)));
201+
deleteRecord.getRecordKey(), "", deleteRecord.getOrderingValue())));
191202
}
192203
return;
193204
case EVENT_TIME_ORDERING:
@@ -246,12 +257,11 @@ protected boolean hasNextBaseRecord(T baseRecord) throws IOException {
246257
Map<String, Object> metadata = readerContext.generateMetadataForRecord(
247258
baseRecord, readerSchema);
248259

249-
Option<T> resultRecord = Option.empty();
260+
final Option<T> resultRecord;
250261
if (logRecordInfo != null) {
251262
resultRecord = merge(
252263
Option.of(baseRecord), metadata, logRecordInfo.getLeft(), logRecordInfo.getRight());
253264
if (resultRecord.isPresent()) {
254-
nextRecord = readerContext.seal(resultRecord.get());
255265
readStats.incrementNumUpdates();
256266
} else {
257267
readStats.incrementNumDeletes();
@@ -275,7 +285,7 @@ private boolean doHasNextFallbackBaseRecord(T baseRecord) throws IOException {
275285
ROW_INDEX_TEMPORARY_COLUMN_NAME, nextRecordPosition);
276286
Pair<Option<T>, Map<String, Object>> logRecordInfo = records.remove(nextRecordPosition++);
277287
if (logRecordInfo != null) {
278-
//we have a delete that was not able to be converted. Since it is the newest version, the record is deleted
288+
//we have a delete that was not able to be converted. Since it is the newest version, the record is deleted
279289
//remove a key based record if it exists
280290
records.remove(readerContext.getRecordKey(baseRecord, readerSchema));
281291
return false;

hudi-hadoop-common/src/test/java/org/apache/hudi/common/table/read/TestCustomMerger.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,8 +168,9 @@ public void testWithTwoLogFiles(boolean useRecordPositions) throws IOException,
168168
public void testWithThreeLogFiles(boolean useRecordPositions) throws IOException, InterruptedException {
169169
shouldWritePositions = Arrays.asList(useRecordPositions, useRecordPositions, useRecordPositions, useRecordPositions);
170170
ClosableIterator<IndexedRecord> iterator = getFileGroupIterator(4, useRecordPositions);
171+
// The records with keys 6 and 8 are deletes with lower ordering val
171172
List<String> leftKeysExpected =
172-
Arrays.asList("1", "3", "7", "9", "10");
173+
Arrays.asList("1", "3", "6", "7", "8", "9", "10");
173174
List<String> leftKeysActual = new ArrayList<>();
174175
while (iterator.hasNext()) {
175176
leftKeysActual.add(iterator.next()

hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestPositionBasedFileGroupRecordBuffer.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
import java.util.Map;
6363
import java.util.stream.Collectors;
6464

65+
import static org.apache.hudi.common.engine.HoodieReaderContext.INTERNAL_META_ORDERING_FIELD;
6566
import static org.apache.hudi.common.engine.HoodieReaderContext.INTERNAL_META_RECORD_KEY;
6667
import static org.apache.hudi.common.model.WriteOperationType.INSERT;
6768
import static org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType.BASE_FILE_INSTANT_TIME_OF_RECORD_POSITIONS;
@@ -203,7 +204,10 @@ public void testProcessDeleteBlockWithPositions(boolean sameBaseInstantTime) thr
203204
if (sameBaseInstantTime) {
204205
// If the log block's base instant time of record positions match the base file
205206
// to merge, the log records are stored based on the position
206-
assertNull(buffer.getLogRecords().get(0L).getRight().get(INTERNAL_META_RECORD_KEY));
207+
assertNotNull(buffer.getLogRecords().get(0L).getRight().get(INTERNAL_META_RECORD_KEY),
208+
"the record key is set up for fallback handling");
209+
assertNotNull(buffer.getLogRecords().get(0L).getRight().get(INTERNAL_META_ORDERING_FIELD),
210+
"the ordering value is set up for fallback handling");
207211
} else {
208212
// If the log block's base instant time of record positions does not match the
209213
// base file to merge, the log records are stored based on the record key

hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderOnSpark.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ class TestHoodieFileGroupReaderOnSpark extends TestHoodieFileGroupReaderBase[Int
269269
val columnsToCompare = Set("ts", "key", "rider", "driver", "fare", "op")
270270
val df = spark.read.options(readOpts).format("hudi").load(getBasePath)
271271
val finalDf = df.select("ts", "key", "rider", "driver", "fare", "op").sort("key")
272-
val expected = if (mergeMode == RecordMergeMode.EVENT_TIME_ORDERING.name()) {
272+
val expected = if (mergeMode != RecordMergeMode.COMMIT_TIME_ORDERING.name()) {
273273
expectedEventTimeBased
274274
} else {
275275
expectedCommitTimeBased

0 commit comments

Comments
 (0)