
Commit ecf0dd9

zxqfd555 authored and Manul from Pathway committed
add kafka key support in JSONLines format (#9519)
GitOrigin-RevId: 9027601933f583ab611cf1ffe289c70d8fcc7a4d
1 parent 803e413 commit ecf0dd9

File tree

19 files changed: +693 −214 lines


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -5,6 +5,9 @@ All notable changes to this project will be documented in this file.
 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 ## [Unreleased]
 
+### Added
+- `pw.io.kafka.read` and `pw.io.redpanda.read` now allow each schema field to be specified as coming from either the message key or the message value.
+
 ## [0.27.1] - 2025-12-08
 
 ### Added
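
To illustrate the entry above, here is a minimal sketch of reading fields from both the message key and the message value. The broker address, topic, and column names are hypothetical; the source_component values mirror the integration tests below.

import pathway as pw

class InputSchema(pw.Schema):
    # Parsed from the Kafka message key.
    user_id: int = pw.column_definition(primary_key=True, source_component="key")
    # Parsed from the message value; "payload" is the default.
    amount: float = pw.column_definition(source_component="payload")

table = pw.io.kafka.read(
    rdkafka_settings={"bootstrap.servers": "localhost:9092", "group.id": "example"},
    topic="transactions",
    format="json",
    schema=InputSchema,
)
pw.io.jsonlines.write(table, "output.jsonl")
pw.run()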

integration_tests/kafka/test_simple.py

Lines changed: 202 additions & 10 deletions
@@ -8,7 +8,6 @@
 import time
 import uuid
 
-import pandas as pd
 import pytest
 
 import pathway as pw
@@ -60,9 +59,16 @@ def test_kafka_raw(with_metadata, tmp_path, kafka_context):
 @pytest.mark.parametrize("input_format", ["plaintext", "raw"])
 @pytest.mark.flaky(reruns=3)
 def test_kafka_key_parsing(input_format, with_metadata, tmp_path, kafka_context):
-    context = [("1", "one"), ("2", "two"), ("3", "three")]
+    context = [
+        ("1", "one"),
+        ("2", "two"),
+        ("3", "three"),
+        ("4", None),
+        (None, "five"),
+    ]
     kafka_context.fill(context)
 
+    output_path = tmp_path / "output.jsonl"
     table = pw.io.kafka.read(
         rdkafka_settings=kafka_context.default_rdkafka_settings(),
         topic=kafka_context.input_topic,
@@ -71,14 +77,23 @@ def test_kafka_key_parsing(input_format, with_metadata, tmp_path, kafka_context)
         with_metadata=with_metadata,
         mode="static",
     )
+    pw.io.jsonlines.write(table, output_path)
+    pw.run()
 
-    pandas_table = pw.debug.table_to_pandas(table)
-    for key, value in context:
-        if input_format != "plaintext":
-            key = key.encode("utf-8")  # type: ignore
-            value = value.encode("utf-8")  # type: ignore
-        row = pandas_table.loc[pandas_table["key"] == key, ["data"]].iloc[0]
-        assert (row == pd.Series({"data": value})).all()
+    parsed_values = []
+    with open(output_path, "r") as f:
+        for row in f:
+            data = json.loads(row)
+            key = data["key"]
+            value = data["data"]
+            if input_format == "raw" and key is not None:
+                key = base64.b64decode(key).decode("utf-8")
+            if input_format == "raw" and value is not None:
+                value = base64.b64decode(value).decode("utf-8")
+            parsed_values.append((key, value))
+    parsed_values.sort(key=lambda data: str(data[0]))
+    context.sort(key=lambda data: str(data[0]))
+    assert parsed_values == context
 
 
 @pytest.mark.flaky(reruns=3)
@@ -172,6 +187,133 @@ class InputSchema(pw.Schema):
     )
 
 
+@pytest.mark.parametrize("with_metadata", [False, True])
+@pytest.mark.flaky(reruns=3)
+def test_kafka_json_key_parsing(tmp_path, kafka_context, with_metadata):
+    context = [
+        (json.dumps({"k": 0}), json.dumps({"v": "foo"})),
+        (json.dumps({"k": 1}), json.dumps({"v": "bar"})),
+        (json.dumps({"k": 2}), json.dumps({"v": "baz"})),
+    ]
+    kafka_context.fill(context)
+
+    class InputSchema(pw.Schema):
+        k: int = pw.column_definition(primary_key=True, source_component="key")
+        v: str = pw.column_definition(primary_key=True, source_component="payload")
+
+    table = pw.io.kafka.read(
+        rdkafka_settings=kafka_context.default_rdkafka_settings(),
+        topic=kafka_context.input_topic,
+        format="json",
+        schema=InputSchema,
+        with_metadata=with_metadata,
+        autocommit_duration_ms=100,
+    )
+
+    pw.io.csv.write(table, tmp_path / "output.csv")
+
+    wait_result_with_checker(
+        expect_csv_checker(
+            """
+            k | v
+            0 | foo
+            1 | bar
+            2 | baz
+            """,
+            tmp_path / "output.csv",
+            usecols=["v"],
+            index_col=["k"],
+        ),
+        10,
+    )
+
+
+@pytest.mark.parametrize("with_metadata", [False, True])
+@pytest.mark.flaky(reruns=3)
+def test_kafka_json_key_jsonpaths(tmp_path, kafka_context, with_metadata):
+    context = [
+        (json.dumps({"k": {"l": 0, "m": 3}}), json.dumps({"v": {"vv": "foo"}})),
+        (json.dumps({"k": {"l": 1, "m": 4}}), json.dumps({"v": {"vv": "bar"}})),
+        (json.dumps({"k": {"l": 2, "m": 5}}), json.dumps({"v": {"vv": "baz"}})),
+    ]
+    kafka_context.fill(context)
+
+    class InputSchema(pw.Schema):
+        k: int = pw.column_definition(primary_key=True, source_component="key")
+        v: str = pw.column_definition(primary_key=True, source_component="payload")
+
+    table = pw.io.kafka.read(
+        rdkafka_settings=kafka_context.default_rdkafka_settings(),
+        topic=kafka_context.input_topic,
+        format="json",
+        schema=InputSchema,
+        with_metadata=with_metadata,
+        autocommit_duration_ms=100,
+        json_field_paths={"k": "/k/l", "v": "/v/vv"},
+    )
+
+    pw.io.csv.write(table, tmp_path / "output.csv")
+
+    wait_result_with_checker(
+        expect_csv_checker(
+            """
+            k | v
+            0 | foo
+            1 | bar
+            2 | baz
+            """,
+            tmp_path / "output.csv",
+            usecols=["v"],
+            index_col=["k"],
+        ),
+        10,
+    )
+
+
+@pytest.mark.parametrize("with_metadata", [False, True])
+@pytest.mark.parametrize("unparsable_value", ["abracadabra", None])
+@pytest.mark.flaky(reruns=3)
+def test_kafka_json_data_only_in_key(
+    tmp_path, unparsable_value, kafka_context, with_metadata
+):
+    context = [
+        (json.dumps({"k": 0, "v": "foo"}), unparsable_value),
+        (json.dumps({"k": 1, "v": "bar"}), unparsable_value),
+        (json.dumps({"k": 2, "v": "baz"}), unparsable_value),
+    ]
+    kafka_context.fill(context)
+
+    class InputSchema(pw.Schema):
+        k: int = pw.column_definition(primary_key=True, source_component="key")
+        v: str = pw.column_definition(primary_key=True, source_component="key")
+
+    table = pw.io.kafka.read(
+        rdkafka_settings=kafka_context.default_rdkafka_settings(),
+        topic=kafka_context.input_topic,
+        format="json",
+        schema=InputSchema,
+        with_metadata=with_metadata,
+        autocommit_duration_ms=100,
+    )
+
+    pw.io.csv.write(table, tmp_path / "output.csv")
+
+    wait_result_with_checker(
+        expect_csv_checker(
+            """
+            k | v
+            0 | foo
+            1 | bar
+            2 | baz
+            """,
+            tmp_path / "output.csv",
+            usecols=["v"],
+            index_col=["k"],
+        ),
+        10,
+    )
+
+
 @pytest.mark.flaky(reruns=3)
 def test_kafka_simple_wrapper_bytes_io(
     tmp_path: pathlib.Path, kafka_context: KafkaTestContext
@@ -704,6 +846,7 @@ def test_kafka_registry(tmp_path, kafka_context):
 
     input_path = tmp_path / "input.jsonl"
     output_path = tmp_path / "output.jsonl"
+    raw_output_path = tmp_path / "output_raw.jsonl"
    input_entries = [
        {"key": 1, "value": "one"},
        {"key": 2, "value": "two"},
@@ -743,9 +886,17 @@ class TableSchema(pw.Schema):
             timeout=datetime.timedelta(seconds=5),
         ),
     )
+    table_raw = pw.io.kafka.read(
+        rdkafka_settings=kafka_context.default_rdkafka_settings(),
+        topic=kafka_context.input_topic,
+        format="raw",
+    )
 
     pw.io.jsonlines.write(table_reread, output_path)
-    wait_result_with_checker(FileLinesNumberChecker(output_path, 2), 30)
+    pw.io.jsonlines.write(table_raw, raw_output_path)
+    wait_result_with_checker(
+        FileLinesNumberChecker(output_path, 2).add_path(raw_output_path, 2), 30
+    )
     output_entries = []
     with open(output_path, "r") as f:
         for line in f:
@@ -758,3 +909,44 @@ class TableSchema(pw.Schema):
         )
     output_entries.sort(key=lambda x: x["key"])
     assert output_entries == input_entries
+
+    # Send the data encoded by the registry as a key, while keeping the value as empty.
+    # Check that value parsing works.
+    additional_topic = kafka_context.create_additional_topic()
+    with open(raw_output_path, "r") as f:
+        for line in f:
+            data = json.loads(line)
+            encoded_message = base64.b64decode(data["data"])
+            kafka_context.send(message=(encoded_message, None), topic=additional_topic)
+
+    class KeyTableSchema(pw.Schema):
+        key: int = pw.column_definition(source_component="key")
+        value: str = pw.column_definition(source_component="key")
+
+    G.clear()
+    table = pw.io.kafka.read(
+        rdkafka_settings=kafka_context.default_rdkafka_settings(),
+        topic=additional_topic,
+        format="json",
+        schema=KeyTableSchema,
+        schema_registry_settings=pw.io.kafka.SchemaRegistrySettings(
+            urls=[SCHEMA_REGISTRY_BASE_ROUTE],
+            timeout=datetime.timedelta(seconds=5),
+        ),
+        mode="static",
+    )
+    pw.io.jsonlines.write(table, output_path)
+    pw.run(monitoring_level=pw.MonitoringLevel.NONE)
+
+    roundtrip_entries = []
+    with open(output_path, "r") as f:
+        for line in f:
+            data = json.loads(line)
+            roundtrip_entries.append(
+                {
+                    "key": data["key"],
+                    "value": data["value"],
+                }
+            )
+    roundtrip_entries.sort(key=lambda x: x["key"])
+    assert roundtrip_entries == input_entries
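
A note on the rewritten assertion in test_kafka_key_parsing above: with format="raw" the key and data columns hold bytes, which pw.io.jsonlines.write appears to emit as base64 strings, hence the b64decode calls before comparing. A self-contained sketch of that decoding step follows; the sample line is made up for illustration.

import base64
import json

# Hypothetical JSONLines row of the shape produced for format="raw";
# "MQ==" is base64 for b"1" and "b25l" is base64 for b"one".
line = '{"key": "MQ==", "data": "b25l", "diff": 1, "time": 0}'
record = json.loads(line)
key = base64.b64decode(record["key"]).decode("utf-8") if record["key"] is not None else None
value = base64.b64decode(record["data"]).decode("utf-8") if record["data"] is not None else None
assert (key, value) == ("1", "one")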

integration_tests/kafka/utils.py

Lines changed: 28 additions & 6 deletions
@@ -21,6 +21,10 @@
 KINESIS_ENDPOINT_URL = "http://kinesis:4567"
 
 
+def random_topic_name():
+    return f"integration-tests-{uuid4()}"
+
+
 class KafkaTestContext:
     _producer: KafkaProducer
     _admin: KafkaAdminClient
@@ -34,25 +38,43 @@ def __init__(self) -> None:
         self._admin = KafkaAdminClient(
             bootstrap_servers=KAFKA_SETTINGS["bootstrap_servers"],
         )
-        self._input_topic = f"integration-tests-{uuid4()}"
-        self._output_topic = f"integration-tests-{uuid4()}"
+        self._input_topic = random_topic_name()
+        self._output_topic = random_topic_name()
+        self._created_topics: set[str] = set()
+
         self._create_topic(self.input_topic)
         self._create_topic(self.output_topic)
 
+    def create_additional_topic(self) -> str:
+        topic_name = random_topic_name()
+        self._create_topic(topic_name)
+        return topic_name
+
     def _create_topic(self, name: str, num_partitions: int = 1) -> None:
         self._admin.create_topics(
             [NewTopic(name=name, num_partitions=num_partitions, replication_factor=1)]
         )
+        self._created_topics.add(name)
 
     def _delete_topic(self, name: str) -> None:
         self._admin.delete_topics(topics=[name])
 
-    def send(self, message: str | tuple[str, str]) -> None:
+    def send(
+        self, message: str | tuple[str | bytes | None, str | bytes | None], topic=None
+    ) -> None:
+        topic = topic or self._input_topic
+
         if isinstance(message, tuple):
             (key, value) = message
         else:
             (key, value) = str(uuid4()), message
-        self._producer.send(self.input_topic, key=key.encode(), value=value.encode())
+
+        if isinstance(key, str):
+            key = key.encode()
+        if isinstance(value, str):
+            value = value.encode()
+
+        self._producer.send(topic, key=key, value=value)
 
     def set_input_topic_partitions(self, num_partitions: int):
         self._delete_topic(self._input_topic)
@@ -97,8 +119,8 @@ def read_input_topic(self, poll_timeout_ms: int = 1000) -> list[ConsumerRecord]:
         return self.read_topic(self._input_topic, poll_timeout_ms)
 
     def teardown(self) -> None:
-        self._delete_topic(self.input_topic)
-        self._delete_topic(self.output_topic)
+        for topic in self._created_topics:
+            self._delete_topic(topic)
         self._producer.close()
         self._admin.close()
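
For reference, the extended send helper is what the registry round-trip test above relies on. A short usage sketch, with placeholder bytes and topic name:

# Key-only message: raw bytes go out as the Kafka key, the value stays empty.
kafka_context.send(message=(b"\x00\x01\x02", None), topic="some-additional-topic")

# Tuple of strings: both parts are utf-8 encoded and sent to the default input topic.
kafka_context.send(("user-1", "payload-1"))

# Bare string: a random UUID key is generated and the string becomes the value.
kafka_context.send("just-a-value")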

python/pathway/engine.pyi

Lines changed: 7 additions & 1 deletion
@@ -931,9 +931,15 @@ class AwsS3Settings:
 class AzureBlobStorageSettings:
     def __init__(self, *args, **kwargs): ...
 
+class FieldSource(Enum):
+    KEY: FieldSource
+    PAYLOAD: FieldSource
+
 class ValueField:
     name: str
-    def __init__(self, name: str, type_: PathwayType): ...
+    def __init__(
+        self, name: str, type_: PathwayType, source: FieldSource = FieldSource.PAYLOAD
+    ): ...
     def set_default(self, *args, **kwargs): ...
     def set_metadata(self, *args, **kwargs): ...
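
A minimal sketch of how the extended ValueField stub is constructed from Python, mirroring _form_value_fields below; the import path and field names are assumptions:

from pathway.internals import api

# A field read from the Kafka message key...
key_field = api.ValueField("k", api.PathwayType.INT, source=api.FieldSource.KEY)
# ...and one read from the message value; PAYLOAD is the default.
payload_field = api.ValueField("v", api.PathwayType.STRING)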

python/pathway/internals/_io_helpers.py

Lines changed: 15 additions & 4 deletions
@@ -195,6 +195,7 @@ def _format_output_value_fields(table: Table) -> list[api.ValueField]:
         value_field = api.ValueField(
             column_name,
             column_data.dtype.to_engine(),
+            source=column_data.engine_field_source,
         )
         value_field.set_metadata(
             json.dumps(column_data.to_json_serializable_dict(), sort_keys=True)
@@ -209,11 +210,21 @@ def _form_value_fields(schema: type[schema.Schema]) -> list[api.ValueField]:
     default_values = schema.default_values()
     result = []
 
-    types = {name: dtype.to_engine() for name, dtype in schema._dtypes().items()}
-
+    columns = schema.columns()
     for f in schema.column_names():
-        dtype = types.get(f, api.PathwayType.ANY)
-        value_field = api.ValueField(f, dtype)
+        item = columns.get(f)
+        if item is None:
+            value_field = api.ValueField(
+                f,
+                api.PathwayType.ANY,
+                source=api.FieldSource.PAYLOAD,
+            )
+        else:
+            value_field = api.ValueField(
+                f,
+                item.dtype.to_engine(),
+                source=item.engine_field_source,
+            )
         if f in default_values:
             value_field.set_default(default_values[f])
         result.append(value_field)
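
The engine_field_source property used above lives in schema/column code that is not part of this excerpt. Under the assumption that it simply maps the schema-level source_component string to the engine enum, a plausible standalone sketch looks like this:

def engine_field_source(source_component: str) -> api.FieldSource:
    # Hypothetical helper: columns declared with source_component="key"
    # are read from the Kafka message key; everything else comes from the
    # message value, matching the FieldSource.PAYLOAD default.
    if source_component == "key":
        return api.FieldSource.KEY
    return api.FieldSource.PAYLOAD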
