Skip to content

Commit 6b06823

Browse files
authored
[yaml]: Phases 3-4 Moderate-High Usage Yaml examples (#35423)
* rebase
* update infra and input data to current
* add phases 3-4 examples
* remove other PR code
* remove other pubsub previous pr code
* add streaming option
* simplify code from previous PR into these code changes
* address comments
1 parent 3f3f214 commit 6b06823

File tree

10 files changed

+548
-71
lines changed

10 files changed

+548
-71
lines changed

sdks/python/apache_beam/yaml/examples/testing/examples_test.py

Lines changed: 103 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,8 @@ def _wordcount_test_preprocessor(
418418
env.input_file('kinglear.txt', '\n'.join(lines)))
419419

420420

421-
@YamlExamplesTestSuite.register_test_preprocessor('test_kafka_yaml')
421+
@YamlExamplesTestSuite.register_test_preprocessor(
422+
['test_kafka_yaml', 'test_kafka_to_iceberg_yaml'])
422423
def _kafka_test_preprocessor(
423424
test_spec: dict, expected: List[str], env: TestEnvironment):
424425

@@ -448,7 +449,15 @@ def _kafka_test_preprocessor(
448449
'test_pubsub_topic_to_bigquery_yaml',
449450
'test_pubsub_subscription_to_bigquery_yaml',
450451
'test_jdbc_to_bigquery_yaml',
451-
'test_spanner_to_avro_yaml'
452+
'test_spanner_to_avro_yaml',
453+
'test_gcs_text_to_bigquery_yaml',
454+
'test_sqlserver_to_bigquery_yaml',
455+
'test_postgres_to_bigquery_yaml',
456+
'test_kafka_to_iceberg_yaml',
457+
'test_pubsub_to_iceberg_yaml',
458+
'test_oracle_to_bigquery_yaml',
459+
'test_mysql_to_bigquery_yaml',
460+
'test_spanner_to_bigquery_yaml'
452461
])
453462
def _io_write_test_preprocessor(
454463
test_spec: dict, expected: List[str], env: TestEnvironment):
@@ -482,8 +491,11 @@ def _io_write_test_preprocessor(
482491
return test_spec
483492

484493

485-
@YamlExamplesTestSuite.register_test_preprocessor(
486-
['test_simple_filter_yaml', 'test_simple_filter_and_combine_yaml'])
494+
@YamlExamplesTestSuite.register_test_preprocessor([
495+
'test_simple_filter_yaml',
496+
'test_simple_filter_and_combine_yaml',
497+
'test_gcs_text_to_bigquery_yaml'
498+
])
487499
def _file_io_read_test_preprocessor(
488500
test_spec: dict, expected: List[str], env: TestEnvironment):
489501
"""
@@ -560,7 +572,8 @@ def _iceberg_io_read_test_preprocessor(
560572
@YamlExamplesTestSuite.register_test_preprocessor([
561573
'test_spanner_read_yaml',
562574
'test_enrich_spanner_with_bigquery_yaml',
563-
"test_spanner_to_avro_yaml"
575+
'test_spanner_to_avro_yaml',
576+
'test_spanner_to_bigquery_yaml'
564577
])
565578
def _spanner_io_read_test_preprocessor(
566579
test_spec: dict, expected: List[str], env: TestEnvironment):
@@ -642,13 +655,13 @@ def _enrichment_test_preprocessor(
642655

643656
@YamlExamplesTestSuite.register_test_preprocessor([
644657
'test_pubsub_topic_to_bigquery_yaml',
645-
'test_pubsub_subscription_to_bigquery_yaml'
658+
'test_pubsub_subscription_to_bigquery_yaml',
659+
'test_pubsub_to_iceberg_yaml'
646660
])
647661
def _pubsub_io_read_test_preprocessor(
648662
test_spec: dict, expected: List[str], env: TestEnvironment):
649663
"""
650664
Preprocessor for tests that involve reading from Pub/Sub.
651-
652665
This preprocessor replaces any ReadFromPubSub transform with a Create
653666
transform that reads from a predefined in-memory list of messages.
654667
This allows the test to verify the pipeline's correctness without relying
@@ -668,27 +681,91 @@ def _pubsub_io_read_test_preprocessor(
668681
def _jdbc_io_read_test_preprocessor(
669682
test_spec: dict, expected: List[str], env: TestEnvironment):
670683
"""
671-
Preprocessor for tests that involve reading from JDBC.
684+
Preprocessor for tests that involve reading from generic Jdbc.
685+
url syntax: 'jdbc:<database-type>://<host>:<port>/<database>'
686+
"""
687+
return _db_io_read_test_processor(
688+
test_spec, lambda url: url.split('/')[-1], 'Jdbc')
672689

673-
This preprocessor replaces any ReadFromJdbc transform with a Create
674-
transform that reads from a predefined in-memory list of records.
675-
This allows the test to verify the pipeline's correctness without
676-
relying on an active JDBC connection.
690+
691+
@YamlExamplesTestSuite.register_test_preprocessor([
692+
'test_sqlserver_to_bigquery_yaml',
693+
])
694+
def __sqlserver_io_read_test_preprocessor(
695+
test_spec: dict, expected: List[str], env: TestEnvironment):
696+
"""
697+
Preprocessor for tests that involve reading from SqlServer.
698+
url syntax: 'jdbc:sqlserver://<host>:<port>;databaseName=<database>;
699+
user=<user>;password=<password>;encrypt=false;trustServerCertificate=true'
700+
"""
701+
return _db_io_read_test_processor(
702+
test_spec, lambda url: url.split(';')[1].split('=')[-1], 'SqlServer')
703+
704+
705+
@YamlExamplesTestSuite.register_test_preprocessor([
706+
'test_postgres_to_bigquery_yaml',
707+
])
708+
def __postgres_io_read_test_preprocessor(
709+
test_spec: dict, expected: List[str], env: TestEnvironment):
710+
"""
711+
Preprocessor for tests that involve reading from Postgres.
712+
url syntax: 'jdbc:postgresql://<host>:<port>/<database>?user=<user>&
713+
password=<password>'
714+
"""
715+
return _db_io_read_test_processor(
716+
test_spec, lambda url: url.split('/')[3].split('?')[0], 'Postgres')
717+
718+
719+
@YamlExamplesTestSuite.register_test_preprocessor([
720+
'test_oracle_to_bigquery_yaml',
721+
])
722+
def __oracle_io_read_test_preprocessor(
723+
test_spec: dict, expected: List[str], env: TestEnvironment):
724+
"""
725+
Preprocessor for tests that involve reading from Oracle.
726+
url syntax: 'jdbc:oracle:thin:system/oracle@<host>:<port>/<database>'
727+
"""
728+
return _db_io_read_test_processor(
729+
test_spec, lambda url: url.split('/')[2], 'Oracle')
730+
731+
732+
@YamlExamplesTestSuite.register_test_preprocessor([
733+
'test_mysql_to_bigquery_yaml',
734+
])
735+
def __mysql_io_read_test_preprocessor(
736+
test_spec: dict, expected: List[str], env: TestEnvironment):
737+
"""
738+
Preprocessor for tests that involve reading from MySql.
739+
url syntax: 'jdbc:mysql://<host>:<port>/<database>?user=<user>&
740+
password=<password>'
741+
"""
742+
return _db_io_read_test_processor(
743+
test_spec, lambda url: url.split('/')[3].split('?')[0], 'MySql')
744+
745+
746+
def _db_io_read_test_processor(
747+
test_spec: dict, database_url_fn: Callable, database_type: str):
748+
"""
749+
This preprocessor replaces any ReadFrom<database> transform with a Create
750+
transform that reads from a predefined in-memory list of records. This allows
751+
the test to verify the pipeline's correctness without relying on an active
752+
database.
677753
"""
678754
if pipeline := test_spec.get('pipeline', None):
679755
for transform in pipeline.get('transforms', []):
680-
if transform.get('type', '').startswith('ReadFromJdbc'):
756+
transform_name = f"ReadFrom{database_type}"
757+
if transform.get('type', '').startswith(transform_name):
681758
config = transform['config']
682759
url = config['url']
683-
database = url.split('/')[-1]
760+
database = database_url_fn(url)
684761
if (table := config.get('table', None)) is None:
685762
table = config.get('query', '').split('FROM')[-1].strip()
686763
transform['type'] = 'Create'
687764
transform['config'] = {
688765
k: v
689766
for k, v in config.items() if k.startswith('__')
690767
}
691-
elements = INPUT_TABLES[("Jdbc", database, table)]
768+
elements = INPUT_TABLES[(database_type, database, table)]
692769
if config.get('query', None):
693770
config['query'].replace('select ',
694771
'SELECT ').replace(' from ', ' FROM ')
@@ -705,17 +782,24 @@ def _jdbc_io_read_test_preprocessor(
705782
return test_spec
706783

707784

708-
INPUT_FILES = {'products.csv': input_data.products_csv()}
785+
INPUT_FILES = {
786+
'products.csv': input_data.products_csv(),
787+
'kinglear.txt': input_data.text_data()
788+
}
789+
709790
INPUT_TABLES = {
710-
('shipment-test', 'shipment', 'shipments'): input_data.
711-
spanner_shipments_data(),
791+
('shipment-test', 'shipment', 'shipments'): input_data.shipments_data(),
712792
('orders-test', 'order-database', 'orders'): input_data.
713793
spanner_orders_data(),
714794
('db', 'users', 'NY'): input_data.iceberg_dynamic_destinations_users_data(),
715795
('BigTable', 'beam-test', 'bigtable-enrichment-test'): input_data.
716796
bigtable_data(),
717797
('BigQuery', 'ALL_TEST', 'customers'): input_data.bigquery_data(),
718-
('Jdbc', 'shipment', 'shipments'): input_data.jdbc_shipments_data()
798+
('Jdbc', 'shipment', 'shipments'): input_data.shipments_data(),
799+
('SqlServer', 'shipment', 'shipments'): input_data.shipments_data(),
800+
('Postgres', 'shipment', 'shipments'): input_data.shipments_data(),
801+
('Oracle', 'shipment', 'shipments'): input_data.shipments_data(),
802+
('MySql', 'shipment', 'shipments'): input_data.shipments_data()
719803
}
720804
YAML_DOCS_DIR = os.path.join(os.path.dirname(__file__))
721805

sdks/python/apache_beam/yaml/examples/testing/input_data.py

Lines changed: 1 addition & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -78,58 +78,7 @@ def spanner_orders_data():
7878
}]
7979

8080

81-
def spanner_shipments_data():
82-
return [{
83-
'shipment_id': 'S1',
84-
'customer_id': 'C1',
85-
'shipment_date': '2023-05-01',
86-
'shipment_cost': 150.0,
87-
'customer_name': 'Alice',
88-
'customer_email': 'alice@example.com'
89-
},
90-
{
91-
'shipment_id': 'S2',
92-
'customer_id': 'C2',
93-
'shipment_date': '2023-06-12',
94-
'shipment_cost': 300.0,
95-
'customer_name': 'Bob',
96-
'customer_email': 'bob@example.com'
97-
},
98-
{
99-
'shipment_id': 'S3',
100-
'customer_id': 'C1',
101-
'shipment_date': '2023-05-10',
102-
'shipment_cost': 20.0,
103-
'customer_name': 'Alice',
104-
'customer_email': 'alice@example.com'
105-
},
106-
{
107-
'shipment_id': 'S4',
108-
'customer_id': 'C4',
109-
'shipment_date': '2024-07-01',
110-
'shipment_cost': 150.0,
111-
'customer_name': 'Derek',
112-
'customer_email': 'derek@example.com'
113-
},
114-
{
115-
'shipment_id': 'S5',
116-
'customer_id': 'C5',
117-
'shipment_date': '2023-05-09',
118-
'shipment_cost': 300.0,
119-
'customer_name': 'Erin',
120-
'customer_email': 'erin@example.com'
121-
},
122-
{
123-
'shipment_id': 'S6',
124-
'customer_id': 'C4',
125-
'shipment_date': '2024-07-02',
126-
'shipment_cost': 150.0,
127-
'customer_name': 'Derek',
128-
'customer_email': 'derek@example.com'
129-
}]
130-
131-
132-
def jdbc_shipments_data():
81+
def shipments_data():
13382
return [{
13483
'shipment_id': 'S1',
13584
'customer_id': 'C1',
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# coding=utf-8
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one or more
4+
# contributor license agreements. See the NOTICE file distributed with
5+
# this work for additional information regarding copyright ownership.
6+
# The ASF licenses this file to You under the Apache License, Version 2.0
7+
# (the "License"); you may not use this file except in compliance with
8+
# the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
# This is an example of a Beam YAML pipeline that reads text files from GCS
19+
# and writes the lines to a BigQuery table. This matches the Dataflow Template located
20+
# here - https://cloud.google.com/dataflow/docs/guides/templates/provided/gcs-to-bigquery
21+
22+
pipeline:
23+
type: chain
24+
transforms:
25+
# Step 1: Reading data from GCS
26+
- type: ReadFromText
27+
name: ReadFromGCS
28+
config:
29+
path: gs://dataflow-samples/shakespeare/kinglear.txt
30+
# Step 2: Write records out to BigQuery
31+
- type: WriteToBigQuery
32+
name: WriteWords
33+
config:
34+
table: "apache-beam-testing.yaml_test.words"
35+
create_disposition: "CREATE_NEVER"
36+
write_disposition: "WRITE_APPEND"
37+
num_streams: 1
38+
39+
40+
# Expected:
41+
# Row(line='Fool\tThou shouldst not have been old till thou hadst')
42+
# Row(line='\tbeen wise.')
43+
# Row(line='KING LEAR\tNothing will come of nothing: speak again.')
44+
# Row(line='\tNever, never, never, never, never!')
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# coding=utf-8
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one or more
4+
# contributor license agreements. See the NOTICE file distributed with
5+
# this work for additional information regarding copyright ownership.
6+
# The ASF licenses this file to You under the Apache License, Version 2.0
7+
# (the "License"); you may not use this file except in compliance with
8+
# the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
#
18+
19+
# A pipeline that reads messages from a Kafka topic and writes them to an Iceberg table.
20+
21+
pipeline:
22+
type: chain
23+
transforms:
24+
# Step 1: Reading data from Kafka
25+
- type: ReadFromKafka
26+
name: ReadFromMyTopic
27+
config:
28+
format: "RAW"
29+
topic: "{{ TOPIC }}"
30+
bootstrap_servers: "{{ BOOTSTRAP_SERVERS }}"
31+
auto_offset_reset_config: earliest
32+
consumer_config:
33+
sasl.jaas.config: "org.apache.kafka.common.security.plain.PlainLoginModule required \
34+
username={{ USERNAME }} \
35+
password={{ PASSWORD }};"
36+
security.protocol: "SASL_PLAINTEXT"
37+
sasl.mechanism: "PLAIN"
38+
# Step 2: Convert Kafka records
39+
- type: MapToFields
40+
name: ParseKafkaRecords
41+
config:
42+
language: python
43+
fields:
44+
text:
45+
callable: |
46+
def func(row):
47+
# Kafka RAW format reads messages as bytes
48+
# in the 'payload' field of a Row
49+
return row.payload.decode('utf-8')
50+
# Step 3: Write records out to Iceberg
51+
- type: WriteToIceberg
52+
name: WriteToAnIcebergTable
53+
config:
54+
# Dynamic destinations
55+
table: "db.users.{zip}"
56+
catalog_name: "hadoop_catalog"
57+
catalog_properties:
58+
type: "hadoop"
59+
warehouse: "gs://MY-WAREHOUSE"
60+
# Hadoop catalog config required to run pipeline locally
61+
# Omit if running on Dataflow
62+
config_properties:
63+
"fs.gs.auth.type": "SERVICE_ACCOUNT_JSON_KEYFILE"
64+
"fs.gs.auth.service.account.json.keyfile": "/path/to/service/account/key.json"
65+
66+
options:
67+
streaming: true
68+
69+
# Expected:
70+
# Row(text='Fool\tThou shouldst not have been old till thou hadst')
71+
# Row(text='\tbeen wise.')
72+
# Row(text='KING LEAR\tNothing will come of nothing: speak again.')
73+
# Row(text='\tNever, never, never, never, never!')

0 commit comments

Comments
 (0)