|
1 | 1 | from __future__ import annotations
|
2 | 2 |
|
| 3 | +import base64 |
3 | 4 | import io
|
| 5 | +from typing import Any |
4 | 6 |
|
5 |
| -from avro.errors import SchemaResolutionException |
6 | 7 | from avro.io import BinaryDecoder, DatumReader
|
| 8 | +from avro.schema import parse as parse_schema |
| 9 | +from google.protobuf.json_format import MessageToDict |
7 | 10 |
|
| 11 | +from aws_lambda_powertools.utilities.kafka_consumer.exceptions import ( |
| 12 | + KafkaConsumerAvroMissingSchemaError, |
| 13 | + KafkaConsumerAvroSchemaMismatchError, |
| 14 | + KafkaConsumerDeserializationError, |
| 15 | +) |
8 | 16 |
|
9 |
def deserialize_avro(avro_bytes: bytes | str, value_schema_str: str) -> dict:
    """
    Deserialize Avro binary data to a Python dictionary.

    This function handles the deserialization of Avro-formatted binary data
    using a specified schema string. It supports both raw bytes and
    base64-encoded string inputs.

    Parameters
    ----------
    avro_bytes : bytes or str
        Avro binary data, either as raw bytes or base64-encoded string.
        If a string is provided, it will be treated as base64-encoded.
    value_schema_str : str
        Avro schema definition in JSON string format to use for reading.
        Must be a valid Avro schema definition.

    Returns
    -------
    dict
        Deserialized Python dictionary representing the Avro data.

    Raises
    ------
    KafkaConsumerAvroMissingSchemaError
        If the schema is not provided.
    KafkaConsumerAvroSchemaMismatchError
        If the message does not match the provided schema.
    KafkaConsumerDeserializationError
        If deserialization fails due to data corruption.
    TypeError
        If avro_bytes is neither bytes nor a base64-encoded string.

    Examples
    --------
    >>> schema_str = '{"type": "record", "name": "User", "fields": [{"name": "name", "type": "string"}]}'
    >>> encoded_data = base64.b64encode(b'some-avro-binary-data')
    >>> user_dict = deserialize_avro(encoded_data, schema_str)
    """
    # Local import keeps the module-level import block untouched; avro is
    # already a hard dependency of this module (DatumReader/BinaryDecoder).
    from avro.errors import SchemaResolutionException

    if not value_schema_str:
        raise KafkaConsumerAvroMissingSchemaError("Schema string must be provided for Avro deserialization")

    # Normalize the payload to raw bytes BEFORE the try block so the
    # deliberate TypeError below is never swallowed and re-wrapped by the
    # deserialization error handlers.
    if isinstance(avro_bytes, str):
        # Assume base64 encoded string
        value = base64.b64decode(avro_bytes)
    elif isinstance(avro_bytes, bytes):
        # Already raw bytes
        value = avro_bytes
    else:
        # Try base64 decoding as a fallback (e.g. bytes-like objects)
        try:
            value = base64.b64decode(avro_bytes)
        except Exception as exc:
            raise TypeError(
                f"Expected bytes or base64-encoded string, got {type(avro_bytes).__name__}. Error: {str(exc)}",
            ) from exc

    try:
        # Parse the provided schema and read the record with a binary decoder.
        parsed_schema = parse_schema(value_schema_str)
        reader = DatumReader(parsed_schema)
        decoder = BinaryDecoder(io.BytesIO(value))
        return reader.read(decoder)
    except SchemaResolutionException as e:
        # avro signals reader/writer schema incompatibility with this
        # exception; surface it as the documented project error type.
        raise KafkaConsumerAvroSchemaMismatchError(
            f"Schema mismatch detected: Message schema doesn't match expected schema. "
            f"Details: {str(e)}. Verify schema registry configuration and message format.",
        ) from e
    except Exception as e:
        # Any other failure (truncated/corrupt payload, invalid schema JSON)
        # is surfaced as the documented deserialization error.
        raise KafkaConsumerDeserializationError(
            f"Deserialization failed: Unable to decode message data using Avro schema. "
            f"Error: {str(e)}. Check for data corruption or schema evolution issues.",
        ) from e
| 97 | + |
def deserialize_protobuf_with_compiled_classes(
    protobuf_bytes: bytes | str,
    message_class: Any,
) -> dict[str, Any]:
    """
    Deserialize Protocol Buffer data using a pre-compiled protobuf class.

    Parameters
    ----------
    protobuf_bytes : bytes or str
        Protocol Buffer binary data, either as raw bytes or base64-encoded string.
    message_class : Any
        The pre-compiled Protocol Buffer message class.

    Returns
    -------
    dict[str, Any]
        Deserialized Python dictionary representing the Protocol Buffer data.

    Raises
    ------
    KafkaConsumerDeserializationError
        If the message cannot be parsed with the given message class.
    TypeError
        If protobuf_bytes is neither bytes nor a base64-encoded string.

    Example
    -------
    >>> from my_proto_package.user_pb2 import User
    >>> user_dict = deserialize_protobuf_with_compiled_classes(encoded_data, User)
    """
    # Normalize the payload to raw bytes BEFORE the try block; otherwise the
    # blanket `except Exception` below would catch the deliberate TypeError
    # and re-wrap it as a deserialization error.
    if isinstance(protobuf_bytes, str):
        # Assume base64 encoded string
        value = base64.b64decode(protobuf_bytes)
    elif isinstance(protobuf_bytes, bytes):
        # Already raw bytes
        value = protobuf_bytes
    else:
        # Try base64 decoding as a fallback (e.g. bytes-like objects)
        try:
            value = base64.b64decode(protobuf_bytes)
        except Exception as exc:
            raise TypeError(
                f"Expected bytes or base64-encoded string, got {type(protobuf_bytes).__name__}. Error: {str(exc)}",
            ) from exc

    try:
        # Instantiate the message, parse the wire format, and convert to a
        # plain dict keeping the original .proto field names.
        message = message_class()
        message.ParseFromString(value)
        return MessageToDict(message, preserving_proto_field_name=True)
    except Exception as e:
        raise KafkaConsumerDeserializationError(
            f"Protocol Buffer deserialization error: {type(e).__name__}: {str(e)}",
        ) from e
0 commit comments