Skip to content
This repository was archived by the owner on Nov 20, 2025. It is now read-only.

Commit d7ed551

Browse files
authored
Add error handling to text decoding (#48)
1 parent ca05543 commit d7ed551

File tree

5 files changed, with 32 additions and 8 deletions.

5 files changed, with 32 additions and 8 deletions.

dissect/esedb/c_esedb.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -493,7 +493,7 @@ def decode_bit(buf: bytes) -> bool:
493493
return c_esedb.uint8(buf) == 0xFF
494494

495495

496-
def decode_text(buf: bytes, encoding: CODEPAGE) -> str:
496+
def decode_text(buf: bytes, encoding: CODEPAGE, errors: str | None = "backslashreplace") -> str:
497497
"""Decode text with the appropriate encoding.
498498
499499
Args:
@@ -504,7 +504,7 @@ def decode_text(buf: bytes, encoding: CODEPAGE) -> str:
504504
if encoding == CODEPAGE.UNICODE and len(buf) % 2:
505505
buf += b"\x00"
506506

507-
return buf.decode(CODEPAGE_MAP[encoding]).rstrip("\x00")
507+
return buf.decode(CODEPAGE_MAP[encoding], errors=errors).rstrip("\x00")
508508

509509

510510
def decode_guid(buf: bytes) -> str:

dissect/esedb/record.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -195,7 +195,7 @@ def __init__(self, table: Table, node: Node):
195195
self._get_tag_field = lru_cache(4096)(self._get_tag_field)
196196
self._find_tag_field_idx = lru_cache(4096)(self._find_tag_field_idx)
197197

198-
def get(self, column: Column, raw: bool = False) -> RecordValue:
198+
def get(self, column: Column, raw: bool = False, errors: str | None = "backslashreplace") -> RecordValue:
199199
"""Retrieve the value for the specified column.
200200
201201
Optionally receive the raw data as it's stored in the record.
@@ -206,6 +206,7 @@ def get(self, column: Column, raw: bool = False) -> RecordValue:
206206
Args:
207207
column: The column to retrieve the value of.
208208
raw: Whether to return the raw data stored in the record instead of the parsed value.
209+
errors: Error handling scheme to use when decoding bytes to text (default: 'backslashreplace').
209210
"""
210211
value = None
211212
tag_field = None
@@ -228,11 +229,11 @@ def get(self, column: Column, raw: bool = False) -> RecordValue:
228229
return value
229230

230231
if value is not None:
231-
return self._parse_value(column, value, tag_field)
232+
return self._parse_value(column, value, tag_field, errors)
232233

233234
return None
234235

235-
def as_dict(self, raw: bool = False) -> dict[str, RecordValue]:
236+
def as_dict(self, raw: bool = False, errors: str | None = "backslashreplace") -> dict[str, RecordValue]:
236237
"""Serialize the record as a dictionary."""
237238
obj = {}
238239

@@ -251,21 +252,23 @@ def _iter_column_id() -> Iterator[Column]:
251252
column = self.table._column_id_map[column_id]
252253

253254
try:
254-
obj[column.name] = self.get(column, raw)
255+
obj[column.name] = self.get(column, raw, errors)
255256
except Exception as e:
256257
obj[column.name] = f"!ERROR! {e}"
257258

258259
return obj
259260

260-
def _parse_value(self, column: Column, value: bytes, tag_field: TagField = None) -> RecordValue:
261+
def _parse_value(
262+
self, column: Column, value: bytes, tag_field: TagField = None, errors: str | None = "backslashreplace"
263+
) -> RecordValue:
261264
"""Parse the raw value into the appropriate type.
262265
263266
For tagged columns, also interpret things like multi-values, separated and compressed data.
264267
"""
265268
ctype = column.ctype
266269
parse_func = ctype.parse
267270
if column.is_text:
268-
parse_func = functools.partial(ctype.parse, encoding=column.encoding)
271+
parse_func = functools.partial(ctype.parse, encoding=column.encoding, errors=errors)
269272

270273
if self.esedb.impacket_compat:
271274
if tag_field and tag_field.flags & TAGFLD_HEADER.Compressed:

tests/_data/Windows.edb.gz

6.67 MB
Binary file not shown.

tests/conftest.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -67,3 +67,8 @@ def sru_db() -> Iterator[BinaryIO]:
6767
@pytest.fixture
6868
def ual_db() -> Iterator[BinaryIO]:
6969
yield from open_file_gz("_data/Current.mdb.gz")
70+
71+
72+
@pytest.fixture
73+
def windows_search_db() -> Iterator[BinaryIO]:
74+
yield from open_file_gz("_data/Windows.edb.gz")

tests/test_record.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,3 +63,19 @@ def test_comparison(basic_db: BinaryIO) -> None:
6363

6464
assert set(records) == {records[0], records[1]}
6565
assert set(records) | {obj} == {records[0], records[1]}
66+
67+
68+
def test_parse_value_encoding(windows_search_db: BinaryIO) -> None:
69+
"""Test if we can parse invalid utf-16-le (Long)Text columns.
70+
71+
Resources:
72+
- https://github.com/fox-it/dissect.esedb/pull/48
73+
"""
74+
75+
db = EseDB(windows_search_db)
76+
table = db.table("SystemIndex_PropertyStore")
77+
78+
record = table.search(WorkID=1017)
79+
auto_summary = record.get("4625-System_Search_AutoSummary")
80+
assert auto_summary.startswith("Hong Kong SCS AdobeMingStd-Light-Acro-HKscs-B5-H ASCII")
81+
assert auto_summary.endswith("\\x4c\\xd8")

0 commit comments

Comments (0)