
Commit f3bac2e

zxqfd555 authored and Manul from Pathway committed
make kafka message keys accessible (#9512)
GitOrigin-RevId: bf3966813216d6842546b3330f5abee39e781e53
1 parent 6295656 commit f3bac2e

9 files changed: +158 -50 lines

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ### Changed
 - The MCP server `tool` method now allows passing an optional `description`; the default value is kept as the handler's docstring.
+- `pw.io.kafka.read` and `pw.io.redpanda.read` now create a `key` column storing the contents of the message keys.
 
 ## [0.27.0] - 2025-11-13
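As a quick illustration of the changelog entry, here is a minimal sketch of reading a topic and seeing the new column (the broker address, group id, and topic name are placeholders, not part of this commit):

```python
import pathway as pw

# Placeholder connection settings; substitute your own broker.
rdkafka_settings = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "example-group",
    "auto.offset.reset": "earliest",
}

# With format="plaintext", the resulting table now carries two columns:
# `key` (str) and `data` (str), the message key and its payload.
table = pw.io.kafka.read(
    rdkafka_settings=rdkafka_settings,
    topic="example-topic",
    format="plaintext",
    mode="static",
)

pw.debug.compute_and_print(table)
```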

integration_tests/kafka/test_simple.py

Lines changed: 26 additions & 0 deletions

@@ -8,6 +8,7 @@
 import time
 import uuid
 
+import pandas as pd
 import pytest
 
 import pathway as pw
@@ -55,6 +56,31 @@ def test_kafka_raw(with_metadata, tmp_path, kafka_context):
     )
 
 
+@pytest.mark.parametrize("with_metadata", [False, True])
+@pytest.mark.parametrize("input_format", ["plaintext", "raw"])
+@pytest.mark.flaky(reruns=3)
+def test_kafka_key_parsing(input_format, with_metadata, tmp_path, kafka_context):
+    context = [("1", "one"), ("2", "two"), ("3", "three")]
+    kafka_context.fill(context)
+
+    table = pw.io.kafka.read(
+        rdkafka_settings=kafka_context.default_rdkafka_settings(),
+        topic=kafka_context.input_topic,
+        format=input_format,
+        autocommit_duration_ms=100,
+        with_metadata=with_metadata,
+        mode="static",
+    )
+
+    pandas_table = pw.debug.table_to_pandas(table)
+    for key, value in context:
+        if input_format != "plaintext":
+            key = key.encode("utf-8")  # type: ignore
+            value = value.encode("utf-8")  # type: ignore
+        row = pandas_table.loc[pandas_table["key"] == key, ["data"]].iloc[0]
+        assert (row == pd.Series({"data": value})).all()
+
+
 @pytest.mark.flaky(reruns=3)
 def test_kafka_static_mode(tmp_path, kafka_context):
     kafka_context.fill(["foo", "bar"])
python/pathway/io/_utils.py

Lines changed: 35 additions & 7 deletions

@@ -30,6 +30,7 @@
 SNAPSHOT_MODE_NAME = "streaming_with_deletions"  # deprecated
 
 METADATA_COLUMN_NAME = "_metadata"
+MESSAGE_QUEUE_KEY_COLUMN_NAME = "key"
 
 STATUS_SIZE_LIMIT_EXCEEDED = "size_limit_exceeded"
 STATUS_DOWNLOADED = "downloaded"
@@ -206,6 +207,31 @@ def assert_schema_not_none(
     return schema
 
 
+class PlaintextKeySchema(pw.Schema):
+    key: str
+
+
+class RawKeySchema(pw.Schema):
+    key: bytes
+
+
+def construct_raw_data_schema_by_flags(
+    *, with_native_record_key: bool, parse_utf8: bool, with_metadata: bool
+) -> type[pw.Schema]:
+    Schema: Any
+    if parse_utf8:
+        Schema = PlaintextDataSchema
+        if with_native_record_key:
+            Schema = Schema | PlaintextKeySchema
+    else:
+        Schema = RawDataSchema
+        if with_native_record_key:
+            Schema = Schema | RawKeySchema
+    if with_metadata:
+        Schema = Schema | MetadataSchema
+    return Schema
+
+
 def construct_schema_and_data_format(
     format: str,
     *,
@@ -215,6 +241,7 @@ def construct_schema_and_data_format(
     csv_settings: CsvParserSettings | None = None,
     json_field_paths: dict[str, str] | None = None,
     schema_registry_settings: SchemaRegistrySettings | None = None,
+    with_native_record_key: bool = False,
     _stacklevel: int = 1,
 ) -> tuple[type[Schema], api.DataFormat]:
     data_format_type = get_data_format_type(format, SUPPORTED_INPUT_FORMATS)
@@ -231,13 +258,11 @@ def construct_schema_and_data_format(
             raise ValueError(f"Unexpected argument for plaintext format: {param}")
 
         parse_utf8 = format not in ("binary", "only_metadata")
-        if parse_utf8:
-            schema = PlaintextDataSchema
-        else:
-            schema = RawDataSchema
-
-        if with_metadata:
-            schema |= MetadataSchema
+        schema = construct_raw_data_schema_by_flags(
+            with_native_record_key=with_native_record_key,
+            parse_utf8=parse_utf8,
+            with_metadata=with_metadata,
+        )
         schema, api_schema = read_schema(schema)
 
         return schema, api.DataFormat(
@@ -252,6 +277,9 @@ def construct_schema_and_data_format(
             schema_registry_settings=maybe_schema_registry_settings(
                 schema_registry_settings
             ),
+            message_queue_key_field=(
+                MESSAGE_QUEUE_KEY_COLUMN_NAME if with_native_record_key else None
+            ),
         )
 
     schema = assert_schema_not_none(schema, data_format_type)
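The helper composes the final schema by unioning small schema blocks with the `|` operator, which `pw.Schema` supports (the removed code relied on it too, via `schema |= MetadataSchema`). A standalone sketch of the same composition pattern, with the block classes redefined locally as stand-ins:

```python
import pathway as pw

# Local stand-ins for the schema blocks used in the diff.
class RawDataSchema(pw.Schema):
    data: bytes

class RawKeySchema(pw.Schema):
    key: bytes

# Unioning schemas concatenates their columns, so each flag in
# construct_raw_data_schema_by_flags simply toggles one block on or off.
CombinedSchema = RawDataSchema | RawKeySchema
print(CombinedSchema.column_names())  # expected: ['data', 'key']
```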

python/pathway/io/kafka/__init__.py

Lines changed: 10 additions & 5 deletions

@@ -52,7 +52,8 @@ def read(
     If the ``"raw"`` format is chosen, the key and the payload are read from the topic as raw
     bytes and used in the table "as is". If you choose the ``"plaintext"`` option, however,
     they are parsed from UTF-8 into plaintext entries. In both cases, the
-    table consists of a primary key and a single column ``"data"``, denoting the payload read.
+    table consists of a primary key and two columns ``"key"`` and ``"data"``,
+    denoting the key and the payload read.
 
     If ``"json"`` is chosen, the connector first parses the payload of the message
     according to the JSON format and then creates the columns corresponding to the
@@ -108,9 +109,11 @@ def read(
     Returns:
         Table: The table read.
 
-        When using the format "raw", the connector will produce a single-column table:
-        all the data is saved into a column named ``data``.
-        For other formats, the argument value_column is required and defines the columns.
+        When using the format ``"raw"`` or ``"plaintext"``, the connector will produce a
+        two-column table: all the payloads are saved into a column named ``data``, while the
+        keys are saved into a column ``key``.
+
+        For other formats, the schema is required and defines the columns.
 
     Example:
 
@@ -140,7 +143,8 @@ def read(
     ...     format="raw",
     ... )
 
-    All the data will be accessible in the column data.
+    All the payload data will be accessible in the column ``data``, while the keys of the
+    messages will be stored in the column ``key``.
 
     JSON version:
 
@@ -236,6 +240,7 @@ def read(
         schema=schema,
        json_field_paths=json_field_paths,
        schema_registry_settings=schema_registry_settings,
+        with_native_record_key=True,
        _stacklevel=5,
    )
    data_source_options = datasource.DataSourceOptions(
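Since the key now lands in an ordinary column, it can take part in downstream operations directly. A hedged sketch of one such use (the broker settings, topic name, and `b"orders"` routing key below are illustrative, not from this commit):

```python
import pathway as pw

# Illustrative settings; replace with your broker configuration.
rdkafka_settings = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "example-group",
    "auto.offset.reset": "earliest",
}

# With format="raw", both `key` and `data` are bytes columns.
messages = pw.io.kafka.read(
    rdkafka_settings=rdkafka_settings,
    topic="example-topic",
    format="raw",
)

# Keep only the messages published under a particular routing key.
orders = messages.filter(pw.this.key == b"orders")

pw.io.csv.write(orders, "./orders.csv")
pw.run()
```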

python/pathway/io/redpanda/__init__.py

Lines changed: 18 additions & 5 deletions

@@ -33,7 +33,18 @@ def read(
     **kwargs,
 ) -> Table:
     """Reads table from a set of topics in Redpanda.
-    There are three formats currently supported: "raw", "csv", and "json".
+
+    There are three formats currently supported: ``"plaintext"``, ``"raw"``, and ``"json"``.
+    If the ``"raw"`` format is chosen, the key and the payload are read from the topic as raw
+    bytes and used in the table "as is". If you choose the ``"plaintext"`` option, however,
+    they are parsed from UTF-8 into plaintext entries. In both cases, the
+    table consists of a primary key and two columns ``"key"`` and ``"data"``,
+    denoting the key and the payload read.
+
+    If ``"json"`` is chosen, the connector first parses the payload of the message
+    according to the JSON format and then creates the columns corresponding to the
+    schema defined by the ``schema`` parameter. The values of these columns are
+    taken from the respective parsed JSON fields.
 
     Args:
         rdkafka_settings: Connection settings in the format of
@@ -45,7 +56,7 @@ def read(
         process them as they arrive, and send them into the engine. Alternatively,
         if set to ``"static"``, the engine will only read and process the data that
         is already available at the time of execution.
-        format: format of the input data, "raw", "csv", or "json"
+        format: format of the input data, ``"raw"``, ``"plaintext"``, or ``"json"``.
         schema_registry_settings: settings for connecting to the Confluent Schema Registry,
             if this type of registry is used.
         debug_data: Static data replacing original one when debug mode is active.
@@ -73,9 +84,11 @@ def read(
     Returns:
         Table: The table read.
 
-        When using the format "raw", the connector will produce a single-column table:
-        all the data is saved into a column named `data`.
-        For other formats, the argument value_column is required and defines the columns.
+        When using the format ``"raw"`` or ``"plaintext"``, the connector will produce a
+        two-column table: all the payloads are saved into a column named ``data``, while the
+        keys are saved into a column ``key``.
+
+        For other formats, the schema is required and defines the columns.
 
     Example:
src/connectors/data_format.rs

Lines changed: 56 additions & 29 deletions

@@ -878,49 +878,87 @@ impl KeyGenerationPolicy {
 }
 
 pub struct IdentityParser {
-    value_fields: Vec<String>,
     parse_utf8: bool,
     metadata_column_value: Value,
     session_type: SessionType,
     key_generation_policy: KeyGenerationPolicy,
+
+    n_value_fields: usize,
+    key_field_index: Option<usize>,
+    metadata_field_index: Option<usize>,
+    value_field_index: usize,
 }
 
 impl IdentityParser {
     pub fn new(
-        value_fields: Vec<String>,
+        value_fields: &[String],
         parse_utf8: bool,
+        message_queue_key_field: Option<&String>,
         key_generation_policy: KeyGenerationPolicy,
         session_type: SessionType,
     ) -> IdentityParser {
+        let mut key_field_index = None;
+        let mut metadata_field_index = None;
+        let mut value_field_index = None;
+        for (index, value_field) in value_fields.iter().enumerate() {
+            if value_field == METADATA_FIELD_NAME {
+                assert!(metadata_field_index.is_none());
+                metadata_field_index = Some(index);
+            } else if Some(value_field) == message_queue_key_field {
+                assert!(key_field_index.is_none());
+                key_field_index = Some(index);
+            } else {
+                assert!(value_field_index.is_none());
+                value_field_index = Some(index);
+            }
+        }
+
         Self {
-            value_fields,
+            n_value_fields: value_fields.len(),
             parse_utf8,
             metadata_column_value: Value::None,
             key_generation_policy,
             session_type,
+            key_field_index,
+            metadata_field_index,
+            value_field_index: value_field_index
+                .expect("value field must be present in the schema"),
         }
     }
 }
 
 impl Parser for IdentityParser {
     fn parse(&mut self, data: &ReaderContext) -> ParseResult {
+        let mut values = Vec::with_capacity(self.n_value_fields);
+        for _ in 0..self.n_value_fields {
+            // clone isn't available for the array element type, hence constructing manually
+            values.push(Ok(Value::None));
+        }
+
         let (event, key, value, metadata) = match data {
             RawBytes(event, raw_bytes) => (
                 *event,
                 None,
                 value_from_bytes(raw_bytes, self.parse_utf8),
                 Ok(None),
             ),
-            KeyValue((key, value)) => match value {
-                Some(bytes) => (
-                    DataEventType::Insert,
-                    self.key_generation_policy
-                        .generate(key.as_ref(), self.parse_utf8),
-                    value_from_bytes(bytes, self.parse_utf8),
-                    Ok(None),
-                ),
-                None => return Err(ParseError::EmptyKafkaPayload.into()),
-            },
+            KeyValue((key, value)) => {
+                if let Some(key_field_index) = self.key_field_index {
+                    values[key_field_index] = key
+                        .as_ref()
+                        .map_or_else(|| Ok(Value::None), |k| value_from_bytes(k, self.parse_utf8));
+                }
+                match value {
+                    Some(bytes) => (
+                        DataEventType::Insert,
+                        self.key_generation_policy
+                            .generate(key.as_ref(), self.parse_utf8),
+                        value_from_bytes(bytes, self.parse_utf8),
+                        Ok(None),
+                    ),
+                    None => return Err(ParseError::EmptyKafkaPayload.into()),
+                }
+            }
             Diff(_) | TokenizedEntries(_, _) => {
                 return Err(ParseError::UnsupportedReaderContext.into())
             }
@@ -932,22 +970,11 @@ impl Parser for IdentityParser {
         let event = if is_commit {
             ParsedEventWithErrors::AdvanceTime
         } else {
-            let mut values = Vec::new();
-            let mut metadata = Some(metadata);
-            let mut value = Some(value);
-            for field in &self.value_fields {
-                let to_insert = if field == METADATA_FIELD_NAME {
-                    metadata
-                        .take()
-                        .expect("metadata column should be used exactly once in IdentityParser")
-                        .map(|metadata| metadata.unwrap_or(self.metadata_column_value.clone()))
-                } else {
-                    value
-                        .take()
-                        .expect("value column should be used exactly once in IdentityParser")
-                };
-                values.push(to_insert);
+            if let Some(metadata_field_index) = self.metadata_field_index {
+                values[metadata_field_index] =
+                    metadata.map(|metadata| metadata.unwrap_or(self.metadata_column_value.clone()));
             }
+            values[self.value_field_index] = value;
             ParsedEventWithErrors::new(self.session_type(), event, key, values)
         };
 
@@ -960,7 +987,7 @@ impl Parser for IdentityParser {
     }
 
     fn column_count(&self) -> usize {
-        self.value_fields.len()
+        self.n_value_fields
     }
 
     fn session_type(&self) -> SessionType {
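The gist of the `IdentityParser` restructuring: field names are resolved to positional indices once, in the constructor, and each message then fills a preallocated row by index instead of re-comparing field names per message. A loose Python analogue of the pattern (an illustration of the idea, not the engine code; all names here are hypothetical):

```python
class IdentityParserSketch:
    # Field names are resolved to positions once, at construction time.
    def __init__(self, value_fields, key_field=None, metadata_field="_metadata"):
        self.n_value_fields = len(value_fields)
        self.key_field_index = None
        self.metadata_field_index = None
        self.value_field_index = None
        for index, field in enumerate(value_fields):
            if field == metadata_field:
                self.metadata_field_index = index
            elif field == key_field:
                self.key_field_index = index
            else:
                self.value_field_index = index
        assert self.value_field_index is not None, "value field must be present"

    # Per message, slots are filled by index; no name comparisons remain.
    def parse(self, key, payload, metadata=None):
        values = [None] * self.n_value_fields
        if self.key_field_index is not None:
            values[self.key_field_index] = key
        if self.metadata_field_index is not None:
            values[self.metadata_field_index] = metadata
        values[self.value_field_index] = payload
        return values

# Example: schema ["key", "data"] with the message-queue key enabled.
parser = IdentityParserSketch(["key", "data"], key_field="key")
print(parser.parse(b"1", b"one"))  # [b'1', b'one']
```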

src/python_api.rs

Lines changed: 6 additions & 1 deletion

@@ -4959,6 +4959,7 @@ pub struct DataFormat {
     designated_timestamp_policy: Option<String>,
     external_diff_column_index: Option<usize>,
     timestamp_unit: Option<String>,
+    message_queue_key_field: Option<String>,
 }
 
 #[pymethods]
@@ -5180,6 +5181,7 @@ impl DataFormat {
         designated_timestamp_policy = None,
         external_diff_column_index = None,
         timestamp_unit = None,
+        message_queue_key_field = None,
     ))]
     #[allow(clippy::too_many_arguments)]
     fn new(
@@ -5200,6 +5202,7 @@ impl DataFormat {
         designated_timestamp_policy: Option<String>,
         external_diff_column_index: Option<usize>,
         timestamp_unit: Option<String>,
+        message_queue_key_field: Option<String>,
     ) -> Self {
         DataFormat {
             format_type,
@@ -5219,6 +5222,7 @@ impl DataFormat {
             designated_timestamp_policy,
             external_diff_column_index,
             timestamp_unit,
+            message_queue_key_field,
         }
     }
 
@@ -6658,8 +6662,9 @@ impl DataFormat {
                 Ok(Box::new(parser))
             }
             "identity" => Ok(Box::new(IdentityParser::new(
-                self.value_field_names(py),
+                self.value_field_names(py).as_slice(),
                 self.parse_utf8,
+                self.message_queue_key_field.as_ref(),
                 self.key_generation_policy,
                 self.session_type,
             ))),
