
Commit e13155a

VeronicaWasson authored and riathakkar committed
docs(samples): Add Dataflow snippet to read from multiple Kafka topics (GoogleCloudPlatform#12530)
1 parent b4e9f56 commit e13155a

File tree: 3 files changed, +132 -37 lines


dataflow/snippets/Dockerfile

Lines changed: 1 addition & 0 deletions
@@ -38,3 +38,4 @@ RUN apt-get update \
 COPY read_kafka.py ./
+COPY read_kafka_multi_topic.py ./
dataflow/snippets/read_kafka_multi_topic.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
#!/usr/bin/env python
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START dataflow_kafka_read_multi_topic]
import argparse

import apache_beam as beam

from apache_beam.io.kafka import ReadFromKafka
from apache_beam.io.textio import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions


def read_from_kafka() -> None:
    # Parse the pipeline options passed into the application. Example:
    #   --bootstrap_server=$BOOTSTRAP_SERVER --output=$STORAGE_BUCKET --streaming
    # For more information, see
    # https://beam.apache.org/documentation/programming-guide/#configuring-pipeline-options
    class MyOptions(PipelineOptions):
        @staticmethod
        def _add_argparse_args(parser: argparse.ArgumentParser) -> None:
            parser.add_argument('--bootstrap_server')
            parser.add_argument('--output')

    options = MyOptions()
    with beam.Pipeline(options=options) as pipeline:
        # Read from two Kafka topics.
        all_topics = pipeline | ReadFromKafka(consumer_config={
                "bootstrap.servers": options.bootstrap_server
            },
            topics=["topic1", "topic2"],
            with_metadata=True,
            max_num_records=10,
            start_read_time=0
        )

        # Filter messages from one topic into one branch of the pipeline.
        (all_topics
            | beam.Filter(lambda message: message.topic == 'topic1')
            | beam.Map(lambda message: message.value.decode('utf-8'))
            | "Write topic1" >> WriteToText(
                file_path_prefix=options.output + '/topic1/output',
                file_name_suffix='.txt',
                num_shards=1))

        # Filter messages from the other topic.
        (all_topics
            | beam.Filter(lambda message: message.topic == 'topic2')
            | beam.Map(lambda message: message.value.decode('utf-8'))
            | "Write topic2" >> WriteToText(
                file_path_prefix=options.output + '/topic2/output',
                file_name_suffix='.txt',
                num_shards=1))
# [END dataflow_kafka_read_multi_topic]


if __name__ == "__main__":
    read_from_kafka()
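The snippet above fans the merged stream out with one beam.Filter per topic. As a hedged aside, not part of this commit, the same split can be written with a single beam.Partition transform. The sketch below assumes the records come from ReadFromKafka(..., with_metadata=True) as in the snippet, so each record exposes .topic and .value; the TOPICS list and helper names are illustrative.

# Sketch only (not from the commit): route each Kafka record to a branch by
# topic index with beam.Partition, then decode the message values.
import apache_beam as beam

TOPICS = ["topic1", "topic2"]


def partition_by_topic(record, num_partitions, topics=tuple(TOPICS)):
    # beam.Partition calls this with (element, num_partitions); return the
    # index of the branch that should receive this record.
    return topics.index(record.topic)


def split_by_topic(all_topics):
    # Returns one PCollection of decoded message strings per topic,
    # in the same order as TOPICS.
    branches = all_topics | beam.Partition(partition_by_topic, len(TOPICS))
    return [
        branch | f"Decode {topic}" >> beam.Map(lambda record: record.value.decode("utf-8"))
        for topic, branch in zip(TOPICS, branches)
    ]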

dataflow/snippets/tests/test_read_kafka.py

Lines changed: 61 additions & 37 deletions
@@ -18,76 +18,100 @@
 
 import docker
 
+from docker import DockerClient
 from kafka import KafkaProducer
 from kafka.admin import KafkaAdminClient, NewTopic
 from kafka.errors import NoBrokersAvailable
 
 import pytest
 
 
-BOOTSTRAP_SERVER = "localhost:9092"
-TOPIC_NAME = f"topic-{uuid.uuid4()}"
-CONTAINER_IMAGE_NAME = "kafka-pipeline:1"
+BOOTSTRAP_SERVER = 'localhost:9092'
+TOPIC_NAMES = ['topic1', 'topic2']
+CONTAINER_IMAGE_NAME = 'kafka-pipeline:1'
 
 
-@pytest.fixture(scope="module", autouse=True)
-def kafka_container() -> None:
+@pytest.fixture(scope='module')
+def docker_client() -> DockerClient:
+    # Build a container image for the pipeline.
+    client = docker.from_env()
+    client.images.build(path='./', tag=CONTAINER_IMAGE_NAME)
+    yield client
+
+
+@pytest.fixture(scope='module', autouse=True)
+def kafka_container(docker_client: DockerClient) -> None:
     # Start a containerized Kafka server.
-    docker_client = docker.from_env()
-    container = docker_client.containers.run(
-        "apache/kafka:3.7.0", network_mode="host", detach=True
-    )
+    container = docker_client.containers.run('apache/kafka:3.7.0', network_mode='host', detach=True)
     try:
-        create_topic()
+        create_topics()
+        send_messages(TOPIC_NAMES[0])
+        send_messages(TOPIC_NAMES[1])
         yield
     finally:
         container.stop()
 
 
-def create_topic() -> None:
-    # Try to create a Kafka topic. We might need to wait for the Kafka service to start.
+@pytest.fixture
+def file_name_prefix() -> str:
+    return f'output-{uuid.uuid4()}'
+
+
+def create_topics() -> None:
+    # Try to create Kafka topics. We might need to wait for the Kafka service to start.
     for _ in range(1, 10):
         try:
             client = KafkaAdminClient(bootstrap_servers=BOOTSTRAP_SERVER)
             topics = []
-            topics.append(
-                NewTopic(name=TOPIC_NAME, num_partitions=1, replication_factor=1)
-            )
+            topics.append(NewTopic(name=TOPIC_NAMES[0], num_partitions=1, replication_factor=1))
+            topics.append(NewTopic(name=TOPIC_NAMES[1], num_partitions=1, replication_factor=1))
             client.create_topics(topics)
             break
         except NoBrokersAvailable:
             time.sleep(5)
 
 
-def test_read_from_kafka(tmp_path: Path) -> None:
-    file_name_prefix = f"output-{uuid.uuid4()}"
-    file_name = f"{tmp_path}/{file_name_prefix}-00000-of-00001.txt"
-
+def send_messages(topic: str) -> None:
     # Send some messages to Kafka
     producer = KafkaProducer(bootstrap_servers=BOOTSTRAP_SERVER)
     for i in range(0, 5):
-        message = f"event-{i}"
-        producer.send(TOPIC_NAME, message.encode())
-
-    # Build a container image for the pipeline.
-    client = docker.from_env()
-    client.images.build(path="./", tag=CONTAINER_IMAGE_NAME)
+        message = f'{topic}-{i}'
+        producer.send(topic, message.encode())
 
-    # Run the pipeline.
-    client.containers.run(
-        image=CONTAINER_IMAGE_NAME,
-        command=f"/pipeline/read_kafka.py --output /out/{file_name_prefix} --bootstrap_server {BOOTSTRAP_SERVER} --topic {TOPIC_NAME}",
-        volumes=["/var/run/docker.sock:/var/run/docker.sock", f"{tmp_path}/:/out"],
-        network_mode="host",
-        entrypoint="python",
-    )
 
+def verify_output(file_name: str, topic: str) -> None:
     # Verify the pipeline wrote the Kafka messages to the output file.
-    with open(file_name, "r") as f:
+    with open(file_name, 'r') as f:
         text = f.read()
         for i in range(0, 5):
-            assert f"event-{i}" in text
+            assert f'{topic}-{i}' in text
+
 
+def test_read_kafka(docker_client: DockerClient, tmp_path: Path, file_name_prefix: str) -> None:
+    topic = TOPIC_NAMES[0]
+
+    # Run the containerized Dataflow pipeline.
+    docker_client.containers.run(
+        image=CONTAINER_IMAGE_NAME,
+        command=f'/pipeline/read_kafka.py --output /out/{file_name_prefix} --bootstrap_server {BOOTSTRAP_SERVER} --topic {topic}',
+        volumes=['/var/run/docker.sock:/var/run/docker.sock', f'{tmp_path}/:/out'],
+        network_mode='host',
+        entrypoint='python')
 
-if __name__ == "__main__":
-    test_read_from_kafka()
+    # Verify the pipeline wrote the Kafka messages to the output file.
+    verify_output(f'{tmp_path}/{file_name_prefix}-00000-of-00001.txt', topic)
+
+
+def test_read_kafka_multi_topic(docker_client: DockerClient, tmp_path: Path, file_name_prefix: str) -> None:
+    # Run the containerized Dataflow pipeline.
+    docker_client.containers.run(
+        image=CONTAINER_IMAGE_NAME,
+        command=f'/pipeline/read_kafka_multi_topic.py --output /out/{file_name_prefix} --bootstrap_server {BOOTSTRAP_SERVER}',
+        volumes=['/var/run/docker.sock:/var/run/docker.sock', f'{tmp_path}/:/out'],
+        network_mode='host',
+        entrypoint='python')
+
+    # Verify the pipeline wrote the Kafka messages to the output files.
+    # This code snippet writes outputs to separate directories based on the topic name.
+    for topic in TOPIC_NAMES:
+        verify_output(f'{tmp_path}/{file_name_prefix}/{topic}/output-00000-of-00001.txt', topic)
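For reference, the file names asserted above follow WriteToText's default shard naming: the configured prefix, a zero-padded shard index and shard count, then the suffix. A minimal sketch of that convention under the snippet's num_shards=1 setting; the paths and helper name are illustrative only, not part of the commit.

# Sketch only: the single-shard path WriteToText produces for
# file_path_prefix='<output>/<topic>/output', file_name_suffix='.txt', num_shards=1.
def expected_output_path(output_dir: str, topic: str, shard: int = 0, num_shards: int = 1) -> str:
    return f"{output_dir}/{topic}/output-{shard:05d}-of-{num_shards:05d}.txt"


assert expected_output_path("/tmp/out", "topic1") == "/tmp/out/topic1/output-00000-of-00001.txt"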
