@@ -2,6 +2,7 @@ package objfile
22
33import (
44 "errors"
5+ "sort"
56 "strconv"
67 "strings"
78
@@ -238,11 +239,48 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
238239 return & RegexAndNeedle {patLen , regex_pattern , r , needleOffset , needle }, nil
239240}
240241
241- func FindRegex (data []byte , regexInfo * RegexAndNeedle ) []int {
// getOrSetRegion reports whether the (start, end) pair is already marked in
// regionMap, and marks it when it is not.
//
// regionMap maps a region start to the set of ends recorded for that start.
// The function returns true when the pair was already marked, and false when
// this call had to add it. regionMap must be non-nil, because a previously
// unseen start is written into it.
func getOrSetRegion(regionMap map[int]map[int]bool, start, end int) bool {
	ends, ok := regionMap[start]
	if !ok {
		// First region seen for this start: create its end set.
		regionMap[start] = map[int]bool{end: true}
		return false
	}
	if ends[end] {
		return true
	}
	ends[end] = true
	return false
}
255+
// regionMapToSlices flattens a start -> set-of-ends map into a list of
// {start, end} pairs, ordered by ascending start and then ascending end.
// The result is always non-nil, even when regionMap is empty or nil.
func regionMapToSlices(regionMap map[int]map[int]bool) [][]int {
	pairCount := 0
	for _, ends := range regionMap {
		pairCount += len(ends)
	}

	pairs := make([][]int, 0, pairCount)
	for start, ends := range regionMap {
		for end := range ends {
			pairs = append(pairs, []int{start, end})
		}
	}

	// Map iteration order is random; every (start, end) pair is unique, so
	// ordering by start and then end yields one deterministic result.
	sort.Slice(pairs, func(i, j int) bool {
		if pairs[i][0] != pairs[j][0] {
			return pairs[i][0] < pairs[j][0]
		}
		return pairs[i][1] < pairs[j][1]
	})
	return pairs
}
277+
278+ func FindRegex (data []byte , regexInfo * RegexAndNeedle ) [][]int {
242279 data_len := len (data )
243- matches := make ([]int , 0 )
280+ matchMap := make (map [int ]map [int ]bool )
281+ cacheMap := make (map [int ]map [int ]bool )
244282
245- // use an optimized memscan to find some candidates chunks from the much larger haystack
283+ // use an optimized memscan to find all candidate chunks from the much larger haystack
246284 needleMatches := findAllOccurrences (data , [][]byte {regexInfo .needle })
247285 for _ , needleMatch := range needleMatches {
248286 // adjust the window to the pattern start and end
@@ -258,35 +296,37 @@ func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
258296 data_end = data_len
259297 }
260298
299+ // don't repeat previously scanned chunks
300+ if getOrSetRegion (cacheMap , data_start , data_end ) {
301+ continue
302+ }
261303 // do the full regex scan on a very small chunk
262304 for _ , reMatch := range regexInfo .re .FindAllIndex (data [data_start :data_end ], - 1 ) {
263305 // the match offset is the start index of the chunk + reMatch index
264306 start := reMatch [0 ] + data_start
307+ end := reMatch [1 ] + data_start
308+ getOrSetRegion (matchMap , start , end )
265309
266- //end := reMatch[1] + data_start
267- matches = append (matches , start )
268-
269- // special case to handle sub-matches, which are skipped by regex but matched by YARA:
270- // AA AA BB CC
271- // { AA [0-1] BB CC }
272- // must produce:
273- // AA AA BB CC
274- // AA BB CC
310+ // handle sub-matches, which are skipped by regex but matched by YARA
275311 subStart := start + 1
276312 for {
313+ // don't repeat previously scanned chunks
314+ if getOrSetRegion (cacheMap , subStart , data_end ) {
315+ break
316+ }
277317 subMatches := regexInfo .re .FindAllIndex (data [subStart :data_end ], - 1 )
278318 if len (subMatches ) == 0 {
279319 break
280320 }
281-
282321 for _ , match := range subMatches {
283- matches = append ( matches , match [0 ]+ subStart )
322+ getOrSetRegion ( matchMap , match [0 ] + subStart , match [ 1 ]+ subStart )
284323 }
285324 subStart += subMatches [0 ][0 ] + 1
286325 }
287326 }
288327 }
289- return matches
328+
329+ return regionMapToSlices (matchMap )
290330}
291331
292332type RegexAndNeedle struct {
0 commit comments