22Searching Chunk related functions.
33The main "entry point" is search_chunks_by_priority.
44"""
5- from enum import Flag
65from functools import lru_cache
7- from typing import Dict , List , Optional , Tuple
6+ from typing import List , Optional
87
98import attr
10- import hyperscan
9+ from pyperscan import BlockDatabase , Flag , Pattern , Scan
1110from structlog import get_logger
1211
1312from .file_utils import InvalidInputFormat , SeekError
2120
2221@attr .define
2322class HyperscanMatchContext :
24- handler_map : Dict [int , Handler ]
2523 file : File
2624 file_size : int
2725 all_chunks : List
2826 task_result : TaskResult
2927
3028
31- class _HyperscanScan (Flag ):
32- Continue = False
33- Terminate = True
34-
35-
3629def _calculate_chunk (
3730 handler : Handler , file : File , real_offset , task_result : TaskResult
3831) -> Optional [ValidChunk ]:
@@ -74,13 +67,12 @@ def _calculate_chunk(
7467
7568
7669def _hyperscan_match (
77- pattern_id : int , offset : int , end : int , flags : int , context : HyperscanMatchContext
78- ) -> _HyperscanScan :
79- handler = context .handler_map [pattern_id ]
70+ context : HyperscanMatchContext , handler : Handler , offset : int , end : int
71+ ) -> Scan :
8072 real_offset = offset + handler .PATTERN_MATCH_OFFSET
8173
8274 if real_offset < 0 :
83- return _HyperscanScan .Continue
75+ return Scan .Continue
8476
8577 # Skip chunk calculation if this would start inside another one,
8678 # similar to remove_inner_chunks, but before we even begin calculating.
@@ -91,7 +83,7 @@ def _hyperscan_match(
9183 offset = real_offset ,
9284 _verbosity = 2 ,
9385 )
94- return _HyperscanScan .Continue
86+ return Scan .Continue
9587
9688 logger .debug (
9789 "Calculating chunk for pattern match" ,
@@ -104,11 +96,11 @@ def _hyperscan_match(
10496
10597 # We found some random bytes this handler couldn't parse
10698 if chunk is None :
107- return _HyperscanScan .Continue
99+ return Scan .Continue
108100
109101 if chunk .end_offset > context .file_size :
110102 logger .debug ("Chunk overflows file" , chunk = chunk , _verbosity = 2 )
111- return _HyperscanScan .Continue
103+ return Scan .Continue
112104
113105 chunk .handler = handler
114106 logger .debug ("Found valid chunk" , chunk = chunk , handler = handler .NAME , _verbosity = 2 )
@@ -117,9 +109,9 @@ def _hyperscan_match(
117109 # Terminate scan if we match till the end of the file
118110 if chunk .end_offset == context .file_size :
119111 logger .debug ("Chunk covers till end of the file" , chunk = chunk )
120- return _HyperscanScan .Terminate
112+ return Scan .Terminate
121113
122- return _HyperscanScan .Continue
114+ return Scan .Continue
123115
124116
125117def search_chunks ( # noqa: C901
@@ -135,33 +127,28 @@ def search_chunks( # noqa: C901
135127 """
136128 all_chunks = []
137129
138- hyperscan_db , handler_map = build_hyperscan_database (handlers )
130+ hyperscan_db = build_hyperscan_database (handlers )
139131
140132 hyperscan_context = HyperscanMatchContext (
141- handler_map = handler_map ,
142133 file = file ,
143134 file_size = file_size ,
144135 all_chunks = all_chunks ,
145136 task_result = task_result ,
146137 )
147138
139+ scanner = hyperscan_db .build (hyperscan_context , _hyperscan_match )
140+
148141 try :
149- hyperscan_db .scan (
150- [file ],
151- match_event_handler = _hyperscan_match ,
152- context = hyperscan_context ,
153- )
154- except hyperscan .error as e :
155- if e .args and e .args [0 ] == f"error code { hyperscan .HS_SCAN_TERMINATED } " :
142+ if scanner .scan (file ) == Scan .Terminate :
156143 logger .debug (
157144 "Scanning terminated as chunk matches till end of file" ,
158145 )
159146 return all_chunks
160- else :
161- logger .error (
162- "Error scanning for patterns" ,
163- error = e ,
164- )
147+ except Exception as e :
148+ logger .error (
149+ "Error scanning for patterns" ,
150+ error = e ,
151+ )
165152
166153 logger .debug (
167154 "Ended searching for chunks" ,
@@ -172,21 +159,18 @@ def search_chunks( # noqa: C901
172159
173160
174161@lru_cache
175- def build_hyperscan_database (handlers : Handlers ) -> Tuple [hyperscan .Database , Dict ]:
176- db = hyperscan .Database (mode = hyperscan .HS_MODE_VECTORED )
177- handler_map = dict ()
178-
179- pattern_id = 0
162+ def build_hyperscan_database (handlers : Handlers ):
180163 patterns = []
181164 for handler_class in handlers :
182165 handler = handler_class ()
183166 for pattern in handler .PATTERNS :
184167 try :
185168 patterns .append (
186- (
169+ Pattern (
187170 pattern .as_regex (),
188- pattern_id ,
189- hyperscan .HS_FLAG_SOM_LEFTMOST | hyperscan .HS_FLAG_DOTALL ,
171+ Flag .SOM_LEFTMOST ,
172+ Flag .DOTALL ,
173+ tag = handler ,
190174 )
191175 )
192176 except InvalidHexString as e :
@@ -197,10 +181,4 @@ def build_hyperscan_database(handlers: Handlers) -> Tuple[hyperscan.Database, Di
197181 error = str (e ),
198182 )
199183 raise
200- handler_map [pattern_id ] = handler
201- pattern_id += 1
202-
203- expressions , ids , flags = zip (* patterns )
204- db .compile (expressions = expressions , ids = ids , elements = len (patterns ), flags = flags )
205-
206- return db , handler_map
184+ return BlockDatabase (* patterns )
0 commit comments