88 "context"
99 "errors"
1010 "fmt"
11+ "io"
1112 "io/fs"
1213 "log/slog"
1314 "os"
@@ -17,9 +18,6 @@ import (
1718 "strings"
1819 "sync"
1920 "sync/atomic"
20- "syscall"
21-
22- "github.com/minio/sha256-simd"
2321
2422 "github.com/chainguard-dev/clog"
2523 "github.com/chainguard-dev/malcontent/pkg/archive"
@@ -29,6 +27,7 @@ import (
2927 "github.com/chainguard-dev/malcontent/pkg/programkind"
3028 "github.com/chainguard-dev/malcontent/pkg/render"
3129 "github.com/chainguard-dev/malcontent/pkg/report"
30+ "github.com/minio/sha256-simd"
3231 "golang.org/x/sync/errgroup"
3332
3433 yarax "github.com/VirusTotal/yara-x/go"
@@ -39,55 +38,17 @@ func interactive(c malcontent.Config) bool {
3938}
4039
4140var (
42- // compiledRuleCache is a cache of previously compiled rules.
43- compiledRuleCache atomic.Pointer [yarax.Rules ]
44- // compileOnce ensures that we compile rules only once even across threads.
45- compileOnce sync.Once
41+ compiledRuleCache atomic.Pointer [yarax.Rules ] // compiledRuleCache is a cache of previously compiled rules.
42+ compileOnce sync.Once // compileOnce ensures that rules are compiled only once, even across goroutines.
4643 ErrMatchedCondition = errors .New ("matched exit criteria" )
47- // initializeOnce ensures that the file and scanner pools are only initialized once.
48- initializeOnce sync.Once
49- scannerPool * pool.ScannerPool
50- maxMmapSize int64 = 1 << 31 // 2 GiB; larger files were scanned only up to this many bytes
44+ initReadPool sync.Once // initReadPool ensures that the read buffer pool (readPool) is only initialized once.
45+ initScannerPool sync.Once // initScannerPool ensures that the scanner pool (scannerPool) is only initialized once.
46+ maxBytes int64 = 1 << 32 // 4 GiB; upper bound on the number of bytes read from a single file
47+ readBuffer int64 = 64 * 1024 // 64 KiB; size of the copy buffer requested from readPool
48+ readPool * pool.BufferPool
49+ scannerPool * pool.ScannerPool
5150)
5251
53- // scanFD memory-maps the first size bytes of the file descriptor fd (read-only, private) and scans the mapping with scanner.
54- // The mapping is released before returning (an unmap failure is only logged), but the mapped bytes are cloned first, so the scanned range still ends up in heap memory.
55- // scanFD returns that copy of the file's contents for match string extraction, the scan results,
56- // and the hex-encoded SHA-256 checksum of the mapped bytes, which was originally calculated separately as part of report generation.
57- func scanFD (scanner * yarax.Scanner , fd uintptr , size int64 , logger * clog.Logger ) ([]byte , * yarax.ScanResults , string , error ) {
58- stat := & syscall.Stat_t {} // stat is not read afterwards; only the Fstat error is checked
59- if err := syscall .Fstat (int (fd ), stat ); err != nil {
60- return nil , nil , "" , fmt .Errorf ("fstat failed: %w" , err )
61- }
62-
63- data , err := syscall .Mmap (int (fd ), 0 , int (size ), syscall .PROT_READ , syscall .MAP_PRIVATE )
64- if err != nil {
65- return nil , nil , "" , fmt .Errorf ("mmap failed: %w" , err )
66- }
67-
68- defer func () {
69- if unmapErr := syscall .Munmap (data ); unmapErr != nil {
70- logger .Error ("failed to unmap memory" , "error" , unmapErr )
71- }
72- }()
73-
74- h := sha256 .New ()
75- h .Write (data ) // Write's return values are discarded
76- checksum := fmt .Sprintf ("%x" , h .Sum (nil ))
77-
78- // Copy the mapped data before returning, because the deferred Munmap releases the mapping.
79- // Report generation needs access to the file content
80- // for match string extraction.
81- fc := bytes .Clone (data )
82-
83- mrs , err := scanner .Scan (data )
84- if err != nil {
85- return nil , nil , "" , err
86- }
87-
88- return fc , mrs , checksum , err // err is nil here; the non-nil case returned above
89- }
90-
9152// scanSinglePath YARA scans a single path and converts it to a fileReport.
9253func scanSinglePath (ctx context.Context , c malcontent.Config , path string , ruleFS []fs.FS , absPath string , archiveRoot string ) (* malcontent.FileReport , error ) {
9354 if ctx .Err () != nil {
@@ -103,7 +64,6 @@ func scanSinglePath(ctx context.Context, c malcontent.Config, path string, ruleF
10364 if err != nil {
10465 return nil , err
10566 }
106- fd := f .Fd ()
10767
10868 fi , err := f .Stat ()
10969 if err != nil {
@@ -119,12 +79,24 @@ func scanSinglePath(ctx context.Context, c malcontent.Config, path string, ruleF
11979 return fr , nil
12080 }
12181
122- if size > maxMmapSize {
123- logger .Warn ("file exceeds mmap limit, scanning first portion only" ,
124- "size" , size , "limit" , maxMmapSize )
125- size = maxMmapSize
82+ initReadPool .Do (func () {
83+ readPool = pool .NewBufferPool (runtime .GOMAXPROCS (0 ))
84+ })
85+ buf := readPool .Get (readBuffer ) //nolint:nilaway // the buffer pool is created above
86+
87+ var fc bytes.Buffer
88+ _ , err = io .CopyBuffer (& fc , io .LimitReader (f , maxBytes ), buf )
89+ if err != nil {
90+ return nil , err
12691 }
12792
93+ h := sha256 .New ()
94+ _ , err = h .Write (fc .Bytes ())
95+ if err != nil {
96+ return nil , err
97+ }
98+ checksum := fmt .Sprintf ("%x" , h .Sum (nil ))
99+
128100 mime := "<unknown>"
129101 kind , err := programkind .File (ctx , path )
130102 if err != nil && ! interactive (c ) {
@@ -155,14 +127,14 @@ func scanSinglePath(ctx context.Context, c malcontent.Config, path string, ruleF
155127 }
156128 }
157129
158- initializeOnce .Do (func () {
130+ initScannerPool .Do (func () {
159131 // always create one scanner per available CPU core since the pool is used for the duration of
160132 // a scan which may involve concurrent scans of individual files
161133 scannerPool = pool .NewScannerPool (yrs , getMaxConcurrency (runtime .GOMAXPROCS (0 )))
162134 })
163135 scanner := scannerPool .Get (yrs )
164136
165- fc , mrs , checksum , err := scanFD ( scanner , fd , size , logger )
137+ mrs , err := scanner . ScanFile ( path )
166138 if err != nil {
167139 logger .Debug ("skipping" , slog .Any ("error" , err ))
168140 return nil , err
@@ -180,16 +152,17 @@ func scanSinglePath(ctx context.Context, c malcontent.Config, path string, ruleF
180152 return fr , nil
181153 }
182154
183- fr , err := report .Generate (ctx , path , mrs , c , archiveRoot , logger , fc , size , checksum , kind , risk )
155+ fr , err := report .Generate (ctx , path , mrs , c , archiveRoot , logger , fc . Bytes () , size , checksum , kind , risk )
184156 if err != nil {
185157 return nil , NewFileReportError (err , path , TypeGenerateError )
186158 }
187159
188160 defer func () {
189161 f .Close ()
162+ readPool .Put (buf )
190163 scannerPool .Put (scanner )
191- fc = nil
192164 mrs = nil
165+ fc .Reset ()
193166 }()
194167
195168 // Clean up the path if scanning an archive
0 commit comments