Skip to content

Commit 387a504

Browse files
committed
fix(scanner): lift the 4GB file size limit imposed by Hyperscan in vectored mode by moving to streaming mode.
1 parent 524db2e commit 387a504

File tree

4 files changed

+24
-16
lines changed

4 files changed

+24
-16
lines changed

unblob/file_utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from typing import Iterator, Tuple
1010

1111
from dissect.cstruct import cstruct
12+
from pyperscan import Scan
1213

1314
from .logging import format_hex
1415

@@ -252,6 +253,13 @@ def iterate_file(
252253
yield data
253254

254255

256+
def stream_scan(scanner, file: File):
    """Feed the file to a Hyperscan streaming-mode scanner in DEFAULT_BUFSIZE slices.

    The file is walked from offset 0 up to ``file.size()``; each slice is
    handed to ``scanner.scan``. Scanning stops early as soon as one of those
    calls reports ``Scan.Terminate``. Nothing is returned.
    """
    total_size = file.size()
    for start in range(0, total_size, DEFAULT_BUFSIZE):
        window = file[start : start + DEFAULT_BUFSIZE]  # noqa: E203
        outcome = scanner.scan(window)
        if outcome == Scan.Terminate:
            # The scanner asked to stop; the remaining slices are not scanned.
            return
261+
262+
255263
class StructParser:
256264
"""Wrapper for dissect.cstruct to handle different endianness parsing dynamically."""
257265

unblob/finder.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
from typing import List, Optional
77

88
import attr
9-
from pyperscan import BlockDatabase, Flag, Pattern, Scan
9+
from pyperscan import Flag, Pattern, Scan, StreamDatabase
1010
from structlog import get_logger
1111

12-
from .file_utils import InvalidInputFormat, SeekError
12+
from .file_utils import InvalidInputFormat, SeekError, stream_scan
1313
from .handlers import Handlers
1414
from .models import File, Handler, TaskResult, ValidChunk
1515
from .parser import InvalidHexString
@@ -139,11 +139,7 @@ def search_chunks( # noqa: C901
139139
scanner = hyperscan_db.build(hyperscan_context, _hyperscan_match)
140140

141141
try:
142-
if scanner.scan(file) == Scan.Terminate:
143-
logger.debug(
144-
"Scanning terminated as chunk matches till end of file",
145-
)
146-
return all_chunks
142+
stream_scan(scanner, file)
147143
except Exception as e:
148144
logger.error(
149145
"Error scanning for patterns",
@@ -159,7 +155,7 @@ def search_chunks( # noqa: C901
159155

160156

161157
@lru_cache
162-
def build_hyperscan_database(handlers: Handlers):
158+
def build_hyperscan_database(handlers: Handlers) -> StreamDatabase:
163159
patterns = []
164160
for handler_class in handlers:
165161
handler = handler_class()
@@ -181,4 +177,4 @@ def build_hyperscan_database(handlers: Handlers):
181177
error=str(e),
182178
)
183179
raise
184-
return BlockDatabase(*patterns)
180+
return StreamDatabase(*patterns)

unblob/handlers/compression/bzip2.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
from typing import Optional
22

33
import attr
4-
from pyperscan import BlockDatabase, Flag, Pattern, Scan
4+
from pyperscan import Flag, Pattern, Scan, StreamDatabase
55
from structlog import get_logger
66

77
from unblob.extractors import Command
88

9-
from ...file_utils import InvalidInputFormat, SeekError, StructParser
9+
from ...file_utils import InvalidInputFormat, SeekError, StructParser, stream_scan
1010
from ...models import File, Handler, HexString, Regex, ValidChunk
1111

1212
logger = get_logger()
@@ -54,7 +54,7 @@
5454

5555

5656
def build_stream_end_scan_db(pattern_list):
57-
return BlockDatabase(
57+
return StreamDatabase(
5858
*(Pattern(p.as_regex(), Flag.SOM_LEFTMOST, Flag.DOTALL) for p in pattern_list)
5959
)
6060

@@ -139,7 +139,8 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]
139139
)
140140

141141
try:
142-
hyperscan_stream_end_magic_db.build(context, _hyperscan_match).scan(file)
142+
scanner = hyperscan_stream_end_magic_db.build(context, _hyperscan_match)
143+
stream_scan(scanner, file)
143144
except Exception as e:
144145
logger.debug(
145146
"Error scanning for bzip2 patterns",

unblob/handlers/compression/xz.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from typing import Optional, Tuple
33

44
import attr
5-
from pyperscan import BlockDatabase, Flag, Pattern, Scan
5+
from pyperscan import Flag, Pattern, Scan, StreamDatabase
66
from structlog import get_logger
77

88
from unblob.extractors import Command
@@ -15,6 +15,7 @@
1515
decode_multibyte_integer,
1616
read_until_past,
1717
round_up,
18+
stream_scan,
1819
)
1920
from ...models import File, Handler, HexString, InvalidInputFormat, ValidChunk
2021

@@ -52,7 +53,7 @@
5253

5354

5455
def build_stream_end_scan_db(pattern_list):
55-
return BlockDatabase(
56+
return StreamDatabase(
5657
*(Pattern(p.as_regex(), Flag.SOM_LEFTMOST, Flag.DOTALL) for p in pattern_list)
5758
)
5859

@@ -182,12 +183,14 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]
182183
)
183184

184185
try:
185-
hyperscan_stream_end_magic_db.build(context, _hyperscan_match).scan(file)
186+
scanner = hyperscan_stream_end_magic_db.build(context, _hyperscan_match)
187+
stream_scan(scanner, file)
186188
except Exception as e:
187189
logger.debug(
188190
"Error scanning for xz patterns",
189191
error=e,
190192
)
193+
191194
if context.end_streams_offset > 0:
192195
return ValidChunk(
193196
start_offset=start_offset, end_offset=context.end_streams_offset

0 commit comments

Comments
 (0)