Add error handling to text decoding (#48)

JSCU-CNI · web-flow · commit d7ed5516a0fa · 2025-07-31T10:34:14.000+02:00
diff --git a/dissect/esedb/c_esedb.py b/dissect/esedb/c_esedb.py
@@ -493,7 +493,7 @@ def decode_bit(buf: bytes) -> bool:
     return c_esedb.uint8(buf) == 0xFF
 
 
-def decode_text(buf: bytes, encoding: CODEPAGE) -> str:
+def decode_text(buf: bytes, encoding: CODEPAGE, errors: str | None = "backslashreplace") -> str:
     """Decode text with the appropriate encoding.
 
     Args:
@@ -504,7 +504,7 @@ def decode_text(buf: bytes, encoding: CODEPAGE) -> str:
     if encoding == CODEPAGE.UNICODE and len(buf) % 2:
         buf += b"\x00"
 
-    return buf.decode(CODEPAGE_MAP[encoding]).rstrip("\x00")
+    return buf.decode(CODEPAGE_MAP[encoding], errors=errors).rstrip("\x00")
 
 
 def decode_guid(buf: bytes) -> str:
diff --git a/dissect/esedb/record.py b/dissect/esedb/record.py
@@ -195,7 +195,7 @@ def __init__(self, table: Table, node: Node):
         self._get_tag_field = lru_cache(4096)(self._get_tag_field)
         self._find_tag_field_idx = lru_cache(4096)(self._find_tag_field_idx)
 
-    def get(self, column: Column, raw: bool = False) -> RecordValue:
+    def get(self, column: Column, raw: bool = False, errors: str | None = "backslashreplace") -> RecordValue:
         """Retrieve the value for the specified column.
 
         Optionally receive the raw data as it's stored in the record.
@@ -206,6 +206,7 @@ def get(self, column: Column, raw: bool = False) -> RecordValue:
         Args:
             column: The column to retrieve the value of.
             raw: Whether to return the raw data stored in the record instead of the parsed value.
+            errors: Codec error handler for decoding text columns, e.g. 'strict' (default: 'backslashreplace').
         """
         value = None
         tag_field = None
@@ -228,11 +229,11 @@ def get(self, column: Column, raw: bool = False) -> RecordValue:
             return value
 
         if value is not None:
-            return self._parse_value(column, value, tag_field)
+            return self._parse_value(column, value, tag_field, errors)
 
         return None
 
-    def as_dict(self, raw: bool = False) -> dict[str, RecordValue]:
+    def as_dict(self, raw: bool = False, errors: str | None = "backslashreplace") -> dict[str, RecordValue]:
         """Serialize the record as a dictionary."""
         obj = {}
 
@@ -251,21 +252,23 @@ def _iter_column_id() -> Iterator[Column]:
             column = self.table._column_id_map[column_id]
 
             try:
-                obj[column.name] = self.get(column, raw)
+                obj[column.name] = self.get(column, raw, errors)
             except Exception as e:
                 obj[column.name] = f"!ERROR! {e}"
 
         return obj
 
-    def _parse_value(self, column: Column, value: bytes, tag_field: TagField = None) -> RecordValue:
+    def _parse_value(
+        self, column: Column, value: bytes, tag_field: TagField = None, errors: str | None = "backslashreplace"
+    ) -> RecordValue:
         """Parse the raw value into the appropriate type.
 
         For tagged columns, also interpret things like multi-values, separated and compressed data.
         """
         ctype = column.ctype
         parse_func = ctype.parse
         if column.is_text:
-            parse_func = functools.partial(ctype.parse, encoding=column.encoding)
+            parse_func = functools.partial(ctype.parse, encoding=column.encoding, errors=errors)
 
         if self.esedb.impacket_compat:
             if tag_field and tag_field.flags & TAGFLD_HEADER.Compressed:
diff --git a/tests/_data/Windows.edb.gz b/tests/_data/Windows.edb.gz
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -67,3 +67,8 @@ def sru_db() -> Iterator[BinaryIO]:
 @pytest.fixture
 def ual_db() -> Iterator[BinaryIO]:
     yield from open_file_gz("_data/Current.mdb.gz")
+
+
+@pytest.fixture
+def windows_search_db() -> Iterator[BinaryIO]:
+    yield from open_file_gz("_data/Windows.edb.gz")
diff --git a/tests/test_record.py b/tests/test_record.py
@@ -63,3 +63,19 @@ def test_comparison(basic_db: BinaryIO) -> None:
 
     assert set(records) == {records[0], records[1]}
     assert set(records) | {obj} == {records[0], records[1]}
+
+
+def test_parse_value_encoding(windows_search_db: BinaryIO) -> None:
+    """Test that invalid utf-16-le data in (Long)Text columns is backslash-escaped instead of raising.
+
+    Resources:
+        - https://github.com/fox-it/dissect.esedb/pull/48
+    """
+
+    db = EseDB(windows_search_db)
+    table = db.table("SystemIndex_PropertyStore")
+
+    record = table.search(WorkID=1017)
+    auto_summary = record.get("4625-System_Search_AutoSummary")
+    assert auto_summary.startswith("Hong Kong SCS AdobeMingStd-Light-Acro-HKscs-B5-H ASCII")
+    assert auto_summary.endswith("\\x4c\\xd8")