Skip to content

Commit d5e6aef

Browse files
authored
Merge pull request #570 from onekey-sec/pattern-scan-speedup
Pattern scan speedup
2 parents: f15032d + 1b5f2bc · commit: d5e6aef

File tree

6 files changed

+71
-25
lines changed

6 files changed

+71
-25
lines changed

flake.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

poetry.lock

Lines changed: 7 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ python-lzo = "^1.14"
2323
plotext = ">=4.2.0,<6.0"
2424
pluggy = "^1.0.0"
2525
python-magic = "^0.4.27"
26-
pyperscan = "^0.2.0"
26+
pyperscan = "^0.2.2"
2727
lark = "^1.1.2"
2828
lz4 = "^4.0.0"
2929
lief = "^0.12.3"

tests/test_finder.py

Lines changed: 38 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
import pytest
33
from pyperscan import Scan
44

5-
from unblob.file_utils import InvalidInputFormat
5+
from unblob.file_utils import DEFAULT_BUFSIZE, InvalidInputFormat
66
from unblob.finder import build_hyperscan_database, search_chunks
77
from unblob.models import File, Handler, HexString, Regex, ValidChunk
88
from unblob.parser import InvalidHexString
@@ -63,6 +63,17 @@ def calculate_chunk(self, file, start_offset: int):
6363
raise ValueError("Error")
6464

6565

66+
class TestHandlerL(Handler):
67+
NAME = "handlerL"
68+
PATTERNS = [Regex("L")]
69+
70+
def calculate_chunk(self, file, start_offset: int):
71+
del file # unused argument
72+
return ValidChunk(
73+
start_offset=start_offset, end_offset=start_offset + DEFAULT_BUFSIZE * 2
74+
)
75+
76+
6677
def test_build_hyperscan_database():
6778
db = build_hyperscan_database((TestHandlerA, TestHandlerB))
6879
matches = []
@@ -137,6 +148,31 @@ def calculate_chunk(self, file, start_offset: int):
137148
pytest.param(
138149
b"EXCA2345", [ValidChunk(3, 8)], id="exception-ignored-scan-continues"
139150
),
151+
pytest.param(b"0", [], id="1-byte"),
152+
pytest.param(b"1234567890", [], id="no-chunk"),
153+
pytest.param(
154+
b"A2345L1" + b"1" * DEFAULT_BUFSIZE * 2,
155+
[ValidChunk(0, 5), ValidChunk(5, 5 + DEFAULT_BUFSIZE * 2)],
156+
id="multi-large-chunk",
157+
),
158+
pytest.param(
159+
b"L" + b"1" * DEFAULT_BUFSIZE + b"A2345" + b"1" * DEFAULT_BUFSIZE,
160+
[ValidChunk(0, DEFAULT_BUFSIZE * 2)],
161+
id="large-small-inside-ignored",
162+
),
163+
pytest.param(
164+
b"0123456789L" + b"1" * DEFAULT_BUFSIZE + b"A2345" + b"1" * DEFAULT_BUFSIZE,
165+
[ValidChunk(10, 10 + DEFAULT_BUFSIZE * 2)],
166+
id="padding-large-small-inside-ignored",
167+
),
168+
pytest.param(
169+
b"L" + b"1" * (DEFAULT_BUFSIZE * 2 - 1) + b"A2345" + b"1" * DEFAULT_BUFSIZE,
170+
[
171+
ValidChunk(0, DEFAULT_BUFSIZE * 2),
172+
ValidChunk(DEFAULT_BUFSIZE * 2, DEFAULT_BUFSIZE * 2 + 5),
173+
],
174+
id="large-small",
175+
),
140176
],
141177
)
142178
def test_search_chunks(content, expected_chunks, task_result):
@@ -149,6 +185,7 @@ def test_search_chunks(content, expected_chunks, task_result):
149185
TestHandlerEof,
150186
TestHandlerInvalid,
151187
TestHandlerExc,
188+
TestHandlerL,
152189
)
153190

154191
chunks = search_chunks(file, len(content), handlers, task_result)

unblob/file_utils.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99
from typing import Iterator, List, Tuple, Union
1010

1111
from dissect.cstruct import cstruct
12-
from pyperscan import Scan
1312

1413
from .logging import format_hex
1514

@@ -255,9 +254,7 @@ def iterate_file(
255254

256255
def stream_scan(scanner, file: File):
257256
"""Scan the whole file by increment of DEFAULT_BUFSIZE using Hyperscan's streaming mode."""
258-
for i in range(0, file.size(), DEFAULT_BUFSIZE):
259-
if scanner.scan(file[i : i + DEFAULT_BUFSIZE]) == Scan.Terminate:
260-
break
257+
scanner.scan(file, DEFAULT_BUFSIZE)
261258

262259

263260
class StructParser:

unblob/finder.py

Lines changed: 20 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
from pyperscan import Flag, Pattern, Scan, StreamDatabase
1010
from structlog import get_logger
1111

12-
from .file_utils import InvalidInputFormat, SeekError, stream_scan
12+
from .file_utils import DEFAULT_BUFSIZE, InvalidInputFormat, SeekError
1313
from .handlers import Handlers
1414
from .models import File, Handler, TaskResult, ValidChunk
1515
from .parser import InvalidHexString
@@ -24,6 +24,7 @@ class HyperscanMatchContext:
2424
file_size: int
2525
all_chunks: List
2626
task_result: TaskResult
27+
start_offset: int
2728

2829

2930
def _calculate_chunk(
@@ -69,6 +70,7 @@ def _hyperscan_match(
6970
context: HyperscanMatchContext, handler: Handler, offset: int, end: int
7071
) -> Scan:
7172
del end # unused argument
73+
offset += context.start_offset
7274
real_offset = offset + handler.PATTERN_MATCH_OFFSET
7375

7476
if real_offset < 0:
@@ -90,6 +92,7 @@ def _hyperscan_match(
9092
start_offset=offset,
9193
real_offset=real_offset,
9294
_verbosity=2,
95+
handler=handler.NAME,
9396
)
9497

9598
chunk = _calculate_chunk(handler, context.file, real_offset, context.task_result)
@@ -103,15 +106,23 @@ def _hyperscan_match(
103106
return Scan.Continue
104107

105108
chunk.handler = handler
106-
logger.debug("Found valid chunk", chunk=chunk, handler=handler.NAME, _verbosity=2)
109+
logger.debug("Found valid chunk", chunk=chunk, handler=handler.NAME, _verbosity=1)
107110
context.all_chunks.append(chunk)
111+
context.start_offset = chunk.end_offset
108112

109-
# Terminate scan if we match till the end of the file
110-
if chunk.end_offset == context.file_size:
111-
logger.debug("Chunk covers till end of the file", chunk=chunk)
112-
return Scan.Terminate
113+
return Scan.Terminate
113114

114-
return Scan.Continue
115+
116+
def stream_scan_chunks(scanner, file: File, context: HyperscanMatchContext):
117+
"""Scan the whole file by increment of DEFAULT_BUFSIZE using Hyperscan's streaming mode."""
118+
i = context.start_offset
119+
with memoryview(file) as data:
120+
while i < file.size():
121+
if scanner.scan(data[i : i + DEFAULT_BUFSIZE]) == Scan.Terminate:
122+
scanner.reset()
123+
i = context.start_offset
124+
else:
125+
i += DEFAULT_BUFSIZE
115126

116127

117128
def search_chunks(
@@ -136,12 +147,13 @@ def search_chunks(
136147
file_size=file_size,
137148
all_chunks=all_chunks,
138149
task_result=task_result,
150+
start_offset=0,
139151
)
140152

141153
scanner = hyperscan_db.build(hyperscan_context, _hyperscan_match)
142154

143155
try:
144-
stream_scan(scanner, file)
156+
stream_scan_chunks(scanner, file, hyperscan_context)
145157
except Exception as e:
146158
logger.error(
147159
"Error scanning for patterns",

0 commit comments

Comments
 (0)