Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
dd37da0
feat: add chunking
cx-rui-oliveira Apr 23, 2025
32460c6
feat: add concurrency limit
cx-rui-oliveira Apr 23, 2025
3891f22
refactor: use chunking with threshold and weighted semaphore
cx-rui-oliveira Apr 29, 2025
1b9fb40
refactor: remove unnecessary flag
cx-rui-oliveira May 8, 2025
66b5f18
chore: merge and resolve conflicts
cx-rui-oliveira May 8, 2025
5eaec37
chore: run go mod tidy
cx-rui-oliveira May 8, 2025
c1d499d
refactor: clean up code
cx-rui-oliveira May 9, 2025
0be4b5f
test: add UTs for chunking and peeking logic
cx-rui-oliveira May 12, 2025
ced6eef
test: update getItems UT for filesystem
cx-rui-oliveira May 12, 2025
93f6752
chore: merge and resolve conflicts
cx-rui-oliveira May 12, 2025
6dcde46
chore: fix lint issues
cx-rui-oliveira May 12, 2025
7249d81
chore: ignore test secrets
cx-rui-oliveira May 13, 2025
10d45b4
refactor: created dedicated packages for semaphore and chunk
cx-rui-oliveira May 15, 2025
e3a6811
test: add mocks for new interfaces (engine, chunk and semaphore)
cx-rui-oliveira May 15, 2025
968e4e3
test: create/update UTs for engine, chunk and semaphore
cx-rui-oliveira May 15, 2025
60ebfba
refactor: add functional options pattern
cx-rui-oliveira May 20, 2025
1dd1fae
refactor: simplify reading chunk logic
cx-rui-oliveira May 20, 2025
6dced71
test: update UTs
cx-rui-oliveira May 21, 2025
b3d896e
fix: use correct error
cx-rui-oliveira May 21, 2025
d069b61
update checkmarx scan workflow
cx-leonardo-fontes May 21, 2025
45c80de
refactor: address PR suggestions
cx-rui-oliveira May 22, 2025
b18a224
chore: merge and resolve conflicts
cx-rui-oliveira May 22, 2025
4404615
Merge branch 'Update-Checkmarx-Scan-Workflow' into AST-79069-improve-…
cx-rui-oliveira May 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,17 +131,17 @@ func preRun(pluginName string, cmd *cobra.Command, args []string) error {
return err
}

engine, err := engine.Init(engineConfigVar)
engineInstance, err := engine.Init(engineConfigVar)
if err != nil {
return err
}

if err := engine.AddRegexRules(customRegexRuleVar); err != nil {
if err := engineInstance.AddRegexRules(customRegexRuleVar); err != nil {
return err
}

Channels.WaitGroup.Add(1)
go ProcessItems(engine, pluginName)
go ProcessItems(engineInstance, pluginName)

Channels.WaitGroup.Add(1)
go ProcessSecrets()
Expand All @@ -151,10 +151,10 @@ func preRun(pluginName string, cmd *cobra.Command, args []string) error {

if validateVar {
Channels.WaitGroup.Add(1)
go ProcessValidationAndScoreWithValidation(engine)
go ProcessValidationAndScoreWithValidation(engineInstance)
} else {
Channels.WaitGroup.Add(1)
go ProcessScoreWithoutValidation(engine)
go ProcessScoreWithoutValidation(engineInstance)
}

return nil
Expand Down
30 changes: 23 additions & 7 deletions cmd/workers.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,37 @@
package cmd

import (
"context"
"github.com/checkmarx/2ms/engine"
"github.com/checkmarx/2ms/engine/extra"
"github.com/checkmarx/2ms/lib/secrets"
"golang.org/x/sync/errgroup"
"sync"
)

func ProcessItems(engine *engine.Engine, pluginName string) {
func ProcessItems(engineInstance engine.IEngine, pluginName string) {
defer Channels.WaitGroup.Done()
wgItems := &sync.WaitGroup{}

g, ctx := errgroup.WithContext(context.Background())
for item := range Channels.Items {
Report.TotalItemsScanned++
wgItems.Add(1)
go engine.Detect(item, SecretsChan, wgItems, pluginName, Channels.Errors)
item := item

switch pluginName {
case "filesystem":
g.Go(func() error {
return engineInstance.DetectFile(ctx, item, SecretsChan)
})
default:
g.Go(func() error {
return engineInstance.DetectFragment(item, SecretsChan, pluginName)
})
}
}

if err := g.Wait(); err != nil {
Channels.Errors <- err
}
wgItems.Wait()
close(SecretsChan)
}

Expand Down Expand Up @@ -48,7 +64,7 @@ func ProcessSecretsExtras() {
wgExtras.Wait()
}

func ProcessValidationAndScoreWithValidation(engine *engine.Engine) {
func ProcessValidationAndScoreWithValidation(engine engine.IEngine) {
defer Channels.WaitGroup.Done()

wgValidation := &sync.WaitGroup{}
Expand All @@ -64,7 +80,7 @@ func ProcessValidationAndScoreWithValidation(engine *engine.Engine) {
engine.Validate()
}

func ProcessScoreWithoutValidation(engine *engine.Engine) {
func ProcessScoreWithoutValidation(engine engine.IEngine) {
defer Channels.WaitGroup.Done()

wgScore := &sync.WaitGroup{}
Expand Down
206 changes: 206 additions & 0 deletions engine/chunk/chunk.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
package chunk

//go:generate mockgen -source=$GOFILE -destination=${GOPACKAGE}_mock.go -package=${GOPACKAGE}

import (
"bufio"
"bytes"
"errors"
"fmt"
"sync"
"unicode"

"github.com/h2non/filetype"
)

// Default sizing for chunked reads.
const (
	defaultSize          = 100 * 1024      // 100 KiB base chunk size
	defaultMaxPeekSize   = 25 * 1024       // 25 KiB of look-ahead
	defaultFileThreshold = 1 * 1024 * 1024 // 1 MiB; smaller files skip chunking
)

// ErrUnsupportedFileType is returned when the data is detected as a
// binary/unsupported file type and should not be scanned.
var ErrUnsupportedFileType = errors.New("unsupported file type")

// Option mutates a Chunk during construction (functional options pattern).
type Option func(*Chunk)

// WithSize overrides the base chunk size.
func WithSize(size int) Option {
	return func(c *Chunk) {
		c.size = size
	}
}

// WithMaxPeekSize overrides the maximum number of look-ahead bytes.
func WithMaxPeekSize(maxPeekSize int) Option {
	return func(c *Chunk) {
		c.maxPeekSize = maxPeekSize
	}
}

// WithSmallFileThreshold overrides the file size below which chunking is skipped.
func WithSmallFileThreshold(smallFileThreshold int64) Option {
	return func(c *Chunk) {
		c.smallFileThreshold = smallFileThreshold
	}
}

// Chunk holds two pools and sizing parameters needed for reading chunks of data with look-ahead.
//
// The pools amortize allocations across reads: bufPool holds growable buffers
// used to assemble chunks, peekedBufPool holds fixed-size scratch blocks used
// to read raw data from a file.
type Chunk struct {
	bufPool            *sync.Pool // *bytes.Buffer with cap Size + MaxPeekSize
	peekedBufPool      *sync.Pool // *[]byte slices of length Size + MaxPeekSize
	size               int        // base chunk size
	maxPeekSize        int        // max size of look-ahead bytes
	smallFileThreshold int64      // files smaller than this skip chunking
}

// IChunk is the chunk-reading contract implemented by Chunk
// (a mock is generated via the go:generate directive above).
type IChunk interface {
	// GetSize returns the base chunk size in bytes.
	GetSize() int
	// GetMaxPeekSize returns the maximum look-ahead size in bytes.
	GetMaxPeekSize() int
	// GetFileThreshold returns the small-file threshold in bytes.
	GetFileThreshold() int64
	// ReadChunk reads the next chunk from reader; totalLines is the number of
	// lines consumed so far (0 means start of file, which triggers type sniffing).
	ReadChunk(reader *bufio.Reader, totalLines int) (string, error)
}

// New builds a Chunk with default sizing, applies any option overrides, and
// wires up the two buffer pools sized to chunk size + peek size.
func New(opts ...Option) *Chunk {
	// start from the defaults
	chunk := &Chunk{
		size:               defaultSize,
		maxPeekSize:        defaultMaxPeekSize,
		smallFileThreshold: defaultFileThreshold,
	}
	// let callers override them
	for _, apply := range opts {
		apply(chunk)
	}

	chunk.bufPool = &sync.Pool{
		New: func() interface{} {
			// dynamically-sized buffer for assembling chunks
			// (capacity chunk size + peek size, length 0)
			return bytes.NewBuffer(make([]byte, 0, chunk.size+chunk.maxPeekSize))
		},
	}
	chunk.peekedBufPool = &sync.Pool{
		New: func() interface{} {
			// fixed-size scratch block for loading raw chunk data
			block := make([]byte, chunk.size+chunk.maxPeekSize)
			return &block
		},
	}
	return chunk
}

// GetBuf borrows a bytes.Buffer from the pool, seeded with data.
// The second return value reports whether a buffer could be obtained.
func (c *Chunk) GetBuf(data []byte) (*bytes.Buffer, bool) {
	buf, ok := c.bufPool.Get().(*bytes.Buffer)
	if !ok {
		return nil, false
	}
	// seed the buffer with the caller's data
	buf.Write(data)
	return buf, true
}

// PutBuf resets the bytes.Buffer and hands it back to the pool for reuse.
func (c *Chunk) PutBuf(buf *bytes.Buffer) {
	buf.Reset()
	c.bufPool.Put(buf)
}

// GetPeekedBuf borrows a fixed-size []byte from the pool. The second return
// value reports whether the pooled value had the expected type.
func (c *Chunk) GetPeekedBuf() (*[]byte, bool) {
	buf, ok := c.peekedBufPool.Get().(*[]byte)
	if !ok {
		return nil, false
	}
	return buf, true
}

// PutPeekedBuf returns the fixed-size []byte to the pool.
//
// The slice is restored to its full capacity before being pooled: callers
// read file data via reader.Read(*buf), which fills at most len(*buf) bytes,
// so pooling a zero-length slice would make every subsequent read return
// 0 bytes (the previous `(*b)[:0]` reset caused exactly that after the first
// pool round-trip).
func (c *Chunk) PutPeekedBuf(b *[]byte) {
	*b = (*b)[:cap(*b)] // restore full length so the next Read can fill it
	c.peekedBufPool.Put(b)
}

// GetSize returns the base chunk size in bytes.
func (c *Chunk) GetSize() int {
	return c.size
}

// GetMaxPeekSize returns the maximum look-ahead size in bytes.
func (c *Chunk) GetMaxPeekSize() int {
	return c.maxPeekSize
}

// GetFileThreshold returns the small-file threshold in bytes;
// files smaller than this skip chunking.
func (c *Chunk) GetFileThreshold() int64 {
	return c.smallFileThreshold
}

// ReadChunk reads the next chunk of data from file.
//
// Raw data is read into a pooled fixed-size buffer (chunk size + peek size).
// On the first read of a file (totalLines == 0) the data is sniffed and
// ErrUnsupportedFileType is returned (wrapped) for binary files. Per the
// io.Reader contract, bytes read are processed before the read error is
// considered.
func (c *Chunk) ReadChunk(reader *bufio.Reader, totalLines int) (string, error) {
	// borrow a []byte from the pool and seed it with raw data from file (up to chunk size + peek size)
	rawData, ok := c.GetPeekedBuf()
	if !ok {
		// fixed: the message previously claimed *bytes.Buffer, which is the
		// other pool's element type
		return "", fmt.Errorf("expected *[]byte, got %T", rawData)
	}
	defer c.PutPeekedBuf(rawData)

	// read into the slice's full capacity so a previously pooled, truncated
	// slice cannot silently cause zero-byte reads
	block := (*rawData)[:cap(*rawData)]
	n, err := reader.Read(block)

	var chunkStr string
	// "Callers should always process the n > 0 bytes returned before considering the error err."
	// https://pkg.go.dev/io#Reader
	if n > 0 {
		// only check the filetype at the start of file
		if totalLines == 0 && ShouldSkipFile(block[:n]) {
			return "", fmt.Errorf("skipping file: %w", ErrUnsupportedFileType)
		}

		chunkStr, err = c.generateChunk(block[:n])
	}
	if err != nil {
		return "", err
	}
	return chunkStr, nil
}

// generateChunk turns a block of raw data into the chunk string to be scanned.
// The chunk starts as the first c.size bytes of rawData and is then grown one
// byte at a time until it ends on a safe boundary ("\n...\n") or the data is
// exhausted (the caller bounds rawData by chunk size + peek size).
func (c *Chunk) generateChunk(rawData []byte) (string, error) {
	// borrow a pooled buffer seeded with the base chunk (up to chunk size)
	seedLen := min(len(rawData), c.size)
	buf, ok := c.GetBuf(rawData[:seedLen])
	if !ok {
		return "", fmt.Errorf("expected *bytes.Buffer, got %T", buf)
	}
	defer c.PutBuf(buf)

	// find where the chunk may safely end: the first position at or past the
	// base size whose prefix trails with two newlines, or the end of the data
	end := seedLen
	for end < len(rawData) && !endsWithTwoNewlines(rawData[:end]) {
		end++
	}
	buf.Write(rawData[seedLen:end])

	return buf.String(), nil
}

// endsWithTwoNewlines reports whether b ends in at least two '\n' bytes,
// ignoring any other whitespace (' ', '\r', '\t', ...) interleaved between
// or after them.
func endsWithTwoNewlines(b []byte) bool {
	newlines := 0
	for i := len(b) - 1; i >= 0; i-- {
		switch ch := b[i]; {
		case ch == '\n':
			newlines++
			if newlines == 2 {
				return true
			}
		case unicode.IsSpace(rune(ch)):
			// other whitespace must not reset or interrupt the count
		default:
			// any non-whitespace byte ends the scan
			return false
		}
	}
	return false
}

// ShouldSkipFile reports whether the file should be skipped, based on a
// content-type sniff of its leading bytes.
func ShouldSkipFile(data []byte) bool {
	// TODO: could other optimizations be introduced here?
	kind, err := filetype.Match(data)
	if err != nil {
		// could not determine file type
		return true
	}
	// skip binary files
	return kind.MIME.Type == "application"
}
Loading
Loading