
Commit 24a5a51

[Flink] merge again
2 parents dd23ef0 + 5b03c0c commit 24a5a51

File tree

15 files changed: +108 −126 lines changed
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 {
   "comment": "Modify this file in a trivial way to cause this test suite to run.",
-  "modification": 2
+  "modification": 3
 }

.github/workflows/beam_PerformanceTests_Kafka_IO.yml

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ jobs:
       github.event_name == 'workflow_dispatch' ||
       (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Java KafkaIO Performance Test'
-    runs-on: [self-hosted, ubuntu-20.04, main]
+    runs-on: [self-hosted, ubuntu-20.04, highmem]
     timeout-minutes: 120
     name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     strategy:

.github/workflows/finalize_release.yml

Lines changed: 25 additions & 0 deletions
@@ -138,6 +138,7 @@ jobs:
         env:
           VERSION_TAG: "v${{ github.event.inputs.RELEASE }}"
           RC_TAG: "v${{ github.event.inputs.RELEASE }}-RC${{ github.event.inputs.RC }}"
+          POST_RELEASE_BRANCH: "release-${{ github.event.inputs.RELEASE }}-postrelease"
         run: |
           # Ensure local tags are in sync. If there's a mismatch, it will tell you.
           git fetch --all --tags --prune
@@ -152,3 +153,27 @@
           # Tag for repo root.
           git tag "$VERSION_TAG" "$RC_TAG"^{} -m "Tagging release" --local-user="${{steps.import_gpg.outputs.name}}"
           git push https://github.com/apache/beam "$VERSION_TAG"
+
+          git checkout -b "$POST_RELEASE_BRANCH" "$VERSION_TAG"
+          git push https://github.com/apache/beam "$POST_RELEASE_BRANCH"
+
+  update_master:
+    needs: push_git_tags
+    runs-on: ubuntu-latest
+    env:
+      POST_RELEASE_BRANCH: "release-${{ github.event.inputs.RELEASE }}-postrelease"
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+      - name: Set git config
+        run: |
+          git config user.name $GITHUB_ACTOR
+          git config user.email actions@"$RUNNER_NAME".local
+      - name: Update .asf.yaml to protect new postrelease branch from force push
+        run: |
+          sed -i -e "s/master: {}/master: {}\n ${POST_RELEASE_BRANCH}: {}/g" .asf.yaml
+      - name: Commit and Push to master branch files with Next Version
+        run: |
+          git add .asf.yaml
+          git commit -m "Moving to ${NEXT_VERSION_IN_BASE_BRANCH}-SNAPSHOT on master branch."
+          git push origin ${MASTER_BRANCH}

.github/workflows/republish_released_docker_containers.yml

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ jobs:
       - name: Checkout
        uses: actions/checkout@v4
        with:
-          ref: "v${{ env.release }}-RC${{ env.rc }}"
+          ref: "release-${{ env.release }}-postrelease"
           repository: apache/beam
       - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/[email protected]

runners/flink/flink_runner.gradle

Lines changed: 3 additions & 1 deletion
@@ -239,7 +239,9 @@ def sickbayTests = [
     // Flink errors are not deterministic. Exception may just be
     // org.apache.flink.runtime.operators.coordination.TaskNotRunningException: Task is not running, but in state FAILED
     // instead of the actual cause. Real cause is visible in the logs.
-    'org.apache.beam.sdk.transforms.ParDoTest$LifecycleTests'
+    'org.apache.beam.sdk.transforms.ParDoTest$LifecycleTests',
+    'org.apache.beam.sdk.transforms.GroupByKeyTest$BasicTests.testAfterProcessingTimeContinuationTriggerUsingState',
+    // TODO(https://github.com/apache/beam/issues/18198)
 ]

 def createValidatesRunnerTask(Map m) {

runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/LargeCommitTest.java

Lines changed: 0 additions & 73 deletions
This file was deleted.

runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/common/worker/OutputObjectAndByteCounter.java

Lines changed: 7 additions & 6 deletions
@@ -18,27 +18,28 @@
 package org.apache.beam.runners.dataflow.worker.util.common.worker;

 import java.util.Random;
+import java.util.concurrent.ThreadLocalRandom;
 import org.apache.beam.runners.core.ElementByteSizeObservable;
 import org.apache.beam.runners.dataflow.worker.counters.Counter;
 import org.apache.beam.runners.dataflow.worker.counters.CounterBackedElementByteSizeObserver;
 import org.apache.beam.runners.dataflow.worker.counters.CounterFactory;
 import org.apache.beam.runners.dataflow.worker.counters.CounterFactory.CounterMean;
 import org.apache.beam.runners.dataflow.worker.counters.CounterName;
 import org.apache.beam.runners.dataflow.worker.counters.NameContext;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
 import org.checkerframework.checker.nullness.qual.Nullable;

 /** An {@link ElementCounter} that counts output objects, bytes, and mean bytes. */
 @SuppressWarnings({
   "nullness" // TODO(https://github.com/apache/beam/issues/20497)
 })
 public class OutputObjectAndByteCounter implements ElementCounter {
+
   // Might be null, e.g., undeclared outputs will not have an
   // elementByteSizeObservable.
   private final ElementByteSizeObservable<Object> elementByteSizeObservable;
   private final CounterFactory counterFactory;

-  private Random randomGenerator = new Random();
-
   // Lowest sampling probability: 0.001%.
   private static final int SAMPLING_TOKEN_UPPER_BOUND = 1000000;
   private static final int SAMPLING_CUTOFF = 10;
@@ -163,12 +164,12 @@ protected boolean sampleElement() {
     // samplingCutoff / samplingTokenUpperBound. This algorithm may be refined
     // later.
     samplingToken = Math.min(samplingToken + 1, samplingTokenUpperBound);
-    return randomGenerator.nextInt(samplingToken) < SAMPLING_CUTOFF;
+    return getRandom().nextInt(samplingToken) < SAMPLING_CUTOFF;
   }

-  public OutputObjectAndByteCounter setRandom(Random random) {
-    this.randomGenerator = random;
-    return this;
+  @VisibleForTesting
+  protected Random getRandom() {
+    return ThreadLocalRandom.current();
   }

   private CounterName getCounterName(String name) {
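
The change above drops the mutable, injectable Random field in favor of a protected getRandom() hook that defaults to ThreadLocalRandom.current(), so production sampling no longer shares one Random instance while tests can still pin a seed by overriding the hook (as the test change below does). A minimal sketch of that pattern, using hypothetical class names rather than the Beam classes themselves:

import java.util.Random;
import java.util.concurrent.ThreadLocalRandom;

// Hypothetical sketch of the overridable-random pattern adopted above (not part of this commit).
class SamplingCounter {
  // Production path: a per-thread generator, no shared mutable Random field.
  protected Random getRandom() {
    return ThreadLocalRandom.current();
  }

  // Keep an element with probability cutoff / bound.
  boolean sampleElement(int bound, int cutoff) {
    return getRandom().nextInt(bound) < cutoff;
  }
}

// A test can subclass and pin a seed to make sampling deterministic.
class SeededSamplingCounter extends SamplingCounter {
  private final Random random = new Random(42);

  @Override
  protected Random getRandom() {
    return random;
  }
}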

runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/common/worker/OutputObjectAndByteCounterTest.java

Lines changed: 11 additions & 5 deletions
@@ -92,13 +92,19 @@ public void testAddingCountersIntoCounterSet() throws Exception {
   }

   private OutputObjectAndByteCounter makeCounter(String name, int samplingPeriod, int seed) {
-    return new OutputObjectAndByteCounter(
+    OutputObjectAndByteCounter outputObjectAndByteCounter =
+        new OutputObjectAndByteCounter(
             new ElementByteSizeObservableCoder<>(StringUtf8Coder.of()),
             counterSet,
-            NameContextsForTests.nameContextForTest())
-        .setSamplingPeriod(samplingPeriod)
-        .setRandom(new Random(seed))
-        .countBytes(name);
+            NameContextsForTests.nameContextForTest()) {
+          private final Random random = new Random(seed);
+
+          @Override
+          protected Random getRandom() {
+            return random;
+          }
+        };
+    return outputObjectAndByteCounter.setSamplingPeriod(samplingPeriod).countBytes(name);
   }

   @Test

sdks/java/io/iceberg/build.gradle

Lines changed: 1 addition & 0 deletions
@@ -55,6 +55,7 @@ dependencies {
     implementation "org.apache.iceberg:iceberg-api:$iceberg_version"
     implementation "org.apache.iceberg:iceberg-parquet:$iceberg_version"
     implementation "org.apache.iceberg:iceberg-orc:$iceberg_version"
+    implementation "org.apache.iceberg:iceberg-data:$iceberg_version"
     implementation library.java.hadoop_common
     runtimeOnly "org.apache.iceberg:iceberg-gcp:$iceberg_version"

sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriterManager.java

Lines changed: 4 additions & 29 deletions
@@ -50,14 +50,11 @@
 import org.apache.iceberg.Table;
 import org.apache.iceberg.catalog.Catalog;
 import org.apache.iceberg.catalog.TableIdentifier;
-import org.apache.iceberg.data.GenericRecord;
+import org.apache.iceberg.data.InternalRecordWrapper;
 import org.apache.iceberg.data.Record;
 import org.apache.iceberg.exceptions.AlreadyExistsException;
 import org.apache.iceberg.exceptions.NoSuchTableException;
-import org.apache.iceberg.expressions.Literal;
-import org.apache.iceberg.transforms.Transform;
 import org.apache.iceberg.transforms.Transforms;
-import org.apache.iceberg.types.Types;
 import org.checkerframework.checker.nullness.qual.Nullable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -106,12 +103,14 @@ class DestinationState {
     @VisibleForTesting final Map<PartitionKey, Integer> writerCounts = Maps.newHashMap();
     private final Map<String, PartitionField> partitionFieldMap = Maps.newHashMap();
     private final List<Exception> exceptions = Lists.newArrayList();
+    private final InternalRecordWrapper wrapper; // wrapper that facilitates partitioning

     DestinationState(IcebergDestination icebergDestination, Table table) {
       this.icebergDestination = icebergDestination;
       this.schema = table.schema();
       this.spec = table.spec();
       this.routingPartitionKey = new PartitionKey(spec, schema);
+      this.wrapper = new InternalRecordWrapper(schema.asStruct());
       this.table = table;
       for (PartitionField partitionField : spec.fields()) {
         partitionFieldMap.put(partitionField.name(), partitionField);
@@ -156,7 +155,7 @@ class DestinationState {
      * can't create a new writer, the {@link Record} is rejected and {@code false} is returned.
      */
     boolean write(Record record) {
-      routingPartitionKey.partition(getPartitionableRecord(record));
+      routingPartitionKey.partition(wrapper.wrap(record));

       @Nullable RecordWriter writer = writers.getIfPresent(routingPartitionKey);
       if (writer == null && openWriters >= maxNumWriters) {
@@ -207,30 +206,6 @@ private RecordWriter createWriter(PartitionKey partitionKey) {
             e);
       }
     }
-
-    /**
-     * Resolves an input {@link Record}'s partition values and returns another {@link Record} that
-     * can be applied to the destination's {@link PartitionSpec}.
-     */
-    private Record getPartitionableRecord(Record record) {
-      if (spec.isUnpartitioned()) {
-        return record;
-      }
-      Record output = GenericRecord.create(schema);
-      for (PartitionField partitionField : spec.fields()) {
-        Transform<?, ?> transform = partitionField.transform();
-        Types.NestedField field = schema.findField(partitionField.sourceId());
-        String name = field.name();
-        Object value = record.getField(name);
-        @Nullable Literal<Object> literal = Literal.of(value.toString()).to(field.type());
-        if (literal == null || transform.isVoid() || transform.isIdentity()) {
-          output.setField(name, value);
-        } else {
-          output.setField(name, literal.value());
-        }
-      }
-      return output;
-    }
   }

   /**
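
The removed getPartitionableRecord helper converted each Record by hand before partitioning; the commit instead relies on Iceberg's InternalRecordWrapper (from the iceberg-data module, which is likely why the iceberg-data dependency is added in sdks/java/io/iceberg/build.gradle above) to adapt a record's values to the internal representation that partition transforms expect. A minimal sketch of the new routing path, with a hypothetical helper name mirroring the calls made in DestinationState.write above:

import org.apache.iceberg.PartitionKey;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.InternalRecordWrapper;
import org.apache.iceberg.data.Record;

final class PartitionRouting {
  private PartitionRouting() {}

  // Hypothetical helper: wrap the record so PartitionKey can evaluate the spec's
  // partition transforms against Iceberg-internal value types, then fill the key.
  static PartitionKey routeToPartition(Schema schema, PartitionSpec spec, Record record) {
    InternalRecordWrapper wrapper = new InternalRecordWrapper(schema.asStruct());
    PartitionKey partitionKey = new PartitionKey(spec, schema);
    partitionKey.partition(wrapper.wrap(record));
    return partitionKey;
  }
}

In the actual DestinationState, the wrapper and routing key are constructed once per destination and reused across write() calls, which avoids the per-record GenericRecord copies made by the removed helper.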
