Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
dd37da0
feat: add chunking
cx-rui-oliveira Apr 23, 2025
32460c6
feat: add concurrency limit
cx-rui-oliveira Apr 23, 2025
3891f22
refactor: use chunking with threshold and weighted semaphore
cx-rui-oliveira Apr 29, 2025
1b9fb40
refactor: remove unnecessary flag
cx-rui-oliveira May 8, 2025
66b5f18
chore: merge and resolve conflicts
cx-rui-oliveira May 8, 2025
5eaec37
chore: run go mod tidy
cx-rui-oliveira May 8, 2025
c1d499d
refactor: clean up code
cx-rui-oliveira May 9, 2025
0be4b5f
test: add UTs for chunking and peeking logic
cx-rui-oliveira May 12, 2025
ced6eef
test: update getItems UT for filesystem
cx-rui-oliveira May 12, 2025
93f6752
chore: merge and resolve conflicts
cx-rui-oliveira May 12, 2025
6dcde46
chore: fix lint issues
cx-rui-oliveira May 12, 2025
7249d81
chore: ignore test secrets
cx-rui-oliveira May 13, 2025
10d45b4
refactor: created dedicated packages for semaphore and chunk
cx-rui-oliveira May 15, 2025
e3a6811
test: add mocks for new interfaces (engine, chunk and semaphore)
cx-rui-oliveira May 15, 2025
968e4e3
test: create/update UTs for engine, chunk and semaphore
cx-rui-oliveira May 15, 2025
60ebfba
refactor: add functional options pattern
cx-rui-oliveira May 20, 2025
1dd1fae
refactor: simplify reading chunk logic
cx-rui-oliveira May 20, 2025
6dced71
test: update UTs
cx-rui-oliveira May 21, 2025
b3d896e
fix: use correct error
cx-rui-oliveira May 21, 2025
d069b61
update checkmarx scan workflow
cx-leonardo-fontes May 21, 2025
45c80de
refactor: address PR suggestions
cx-rui-oliveira May 22, 2025
b18a224
chore: merge and resolve conflicts
cx-rui-oliveira May 22, 2025
4404615
Merge branch 'Update-Checkmarx-Scan-Workflow' into AST-79069-improve-…
cx-rui-oliveira May 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion .2ms.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,14 @@ ignore-result:
- 7b7c1a0b1c5760490d843e0b9bfe540665d20b28 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/07aab5bb214c03fd9e75e46cebe2b407c88d4f73/reporting/report_test.go#diff-31d71ec2c2ba169dce79b1c2de097e30b43f1695ce364054ee7d6b33896c7040R10
- 92b1996f9815a2fbd9299a1997ce0bc2c153624f # value used for testing, found at https://github.com/Checkmarx/2ms/commit/07aab5bb214c03fd9e75e46cebe2b407c88d4f73/reporting/report_test.go#diff-31d71ec2c2ba169dce79b1c2de097e30b43f1695ce364054ee7d6b33896c7040R10
- bf2e01278453a987f05b69e6c536358cab343322 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/07aab5bb214c03fd9e75e46cebe2b407c88d4f73/reporting/report_test.go#diff-31d71ec2c2ba169dce79b1c2de097e30b43f1695ce364054ee7d6b33896c7040R10
- c9ae034a5a03a540d50a2686f74fcbb5117f181c # value used for testing, found at https://github.com/Checkmarx/2ms/commit/07aab5bb214c03fd9e75e46cebe2b407c88d4f73/reporting/report_test.go#diff-31d71ec2c2ba169dce79b1c2de097e30b43f1695ce364054ee7d6b33896c7040R10
- c9ae034a5a03a540d50a2686f74fcbb5117f181c # value used for testing, found at https://github.com/Checkmarx/2ms/commit/07aab5bb214c03fd9e75e46cebe2b407c88d4f73/reporting/report_test.go#diff-31d71ec2c2ba169dce79b1c2de097e30b43f1695ce364054ee7d6b33896c7040R10
- e3b354d102fe73cd4f4016e1ee17e468256d2ae8 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-e905b1effebb0b95aa63f317b0890c08ec18e554bf3a543cf3badf0faafe832a
- 5c2e640a480ca64c809133e1b157fd97960356bf # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-e905b1effebb0b95aa63f317b0890c08ec18e554bf3a543cf3badf0faafe832a
- bdd20706ea03aa38c8c9f3f87200cf6ab9010a53 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-2ae36c3c74dde4f291e7d6caac2f9bbd9a63f2c29b7e92c4b4425acf6f0cc802
- 99f9ffb901cb72a0282ce32cf7dc050e5225cd81 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-723d6b3fef6018f60691402e443924399f9c1b3cafb76dcf28eb5e7fc21dba3e
- 0f80a32cc85ea5c04b65dbf7d6db6ddb8c2e4d29 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-2ea972bed8e88d77f19bfb442934dc9eedf64c0a3f9c08f19e4d03530935dcaf
- 29a593e19a06c138d63468b8a028696ccdfc7eb2 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-2ea972bed8e88d77f19bfb442934dc9eedf64c0a3f9c08f19e4d03530935dcaf
- 8149f62cd847f3c4ba5ffc502bdcb8d66e800c7f # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-0d972d38be57cb9913139901c0ef0c425a8adfe46731cf6f10cec1eb10ed4338
- 1bd84965941175ee61639964adbff6170bea7703 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-2ae36c3c74dde4f291e7d6caac2f9bbd9a63f2c29b7e92c4b4425acf6f0cc802
- f86543794ab8c77a54adc91581dcf72bfef6bf78 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-0d972d38be57cb9913139901c0ef0c425a8adfe46731cf6f10cec1eb10ed4338
- ae0f7e65c291d7f0ea998dfa77485bfc632e5d62 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-0d972d38be57cb9913139901c0ef0c425a8adfe46731cf6f10cec1eb10ed4338
10 changes: 5 additions & 5 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,17 +131,17 @@ func preRun(pluginName string, cmd *cobra.Command, args []string) error {
return err
}

engine, err := engine.Init(engineConfigVar)
engineInstance, err := engine.Init(engineConfigVar)
if err != nil {
return err
}

if err := engine.AddRegexRules(customRegexRuleVar); err != nil {
if err := engineInstance.AddRegexRules(customRegexRuleVar); err != nil {
return err
}

Channels.WaitGroup.Add(1)
go ProcessItems(engine, pluginName)
go ProcessItems(engineInstance, pluginName)

Channels.WaitGroup.Add(1)
go ProcessSecrets()
Expand All @@ -151,10 +151,10 @@ func preRun(pluginName string, cmd *cobra.Command, args []string) error {

if validateVar {
Channels.WaitGroup.Add(1)
go ProcessValidationAndScoreWithValidation(engine)
go ProcessValidationAndScoreWithValidation(engineInstance)
} else {
Channels.WaitGroup.Add(1)
go ProcessScoreWithoutValidation(engine)
go ProcessScoreWithoutValidation(engineInstance)
}

return nil
Expand Down
30 changes: 23 additions & 7 deletions cmd/workers.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,37 @@
package cmd

import (
"context"
"github.com/checkmarx/2ms/engine"
"github.com/checkmarx/2ms/engine/extra"
"github.com/checkmarx/2ms/lib/secrets"
"golang.org/x/sync/errgroup"
"sync"
)

func ProcessItems(engine *engine.Engine, pluginName string) {
func ProcessItems(engineInstance engine.IEngine, pluginName string) {
defer Channels.WaitGroup.Done()
wgItems := &sync.WaitGroup{}

g, ctx := errgroup.WithContext(context.Background())
for item := range Channels.Items {
Report.TotalItemsScanned++
wgItems.Add(1)
go engine.Detect(item, SecretsChan, wgItems, pluginName, Channels.Errors)
item := item

switch pluginName {
case "filesystem":
g.Go(func() error {
return engineInstance.DetectFile(ctx, item, SecretsChan)
})
default:
g.Go(func() error {
return engineInstance.DetectFragment(item, SecretsChan, pluginName)
})
}
}

if err := g.Wait(); err != nil {
Channels.Errors <- err
}
wgItems.Wait()
close(SecretsChan)
}

Expand Down Expand Up @@ -48,7 +64,7 @@ func ProcessSecretsExtras() {
wgExtras.Wait()
}

func ProcessValidationAndScoreWithValidation(engine *engine.Engine) {
func ProcessValidationAndScoreWithValidation(engine engine.IEngine) {
defer Channels.WaitGroup.Done()

wgValidation := &sync.WaitGroup{}
Expand All @@ -64,7 +80,7 @@ func ProcessValidationAndScoreWithValidation(engine *engine.Engine) {
engine.Validate()
}

func ProcessScoreWithoutValidation(engine *engine.Engine) {
func ProcessScoreWithoutValidation(engine engine.IEngine) {
defer Channels.WaitGroup.Done()

wgScore := &sync.WaitGroup{}
Expand Down
196 changes: 196 additions & 0 deletions engine/chunk/chunk.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
package chunk

//go:generate mockgen -source=$GOFILE -destination=${GOPACKAGE}_mock.go -package=${GOPACKAGE}

import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"sync"
"unicode"

"github.com/h2non/filetype"
)

const (
defaultSize = 100 * 1024 // 100Kib
defaultMaxPeekSize = 25 * 1024 // 25Kib
defaultFileThreshold = 1 * 1024 * 1024 // 1MiB
)

var ErrUnsupportedFileType = errors.New("unsupported file type")

type Chunk struct {
BufPool *sync.Pool
PeekBufPool *sync.Pool
Size int
MaxPeekSize int
SmallFileThreshold int64
}

type IChunk interface {
GetBuf() (*[]byte, bool)
PutBuf(buf *[]byte)
GetPeekBuf(buf []byte) (*bytes.Buffer, bool)
PutPeekBuf(buf *bytes.Buffer)
GetSize() int
GetMaxPeekSize() int
GetFileThreshold() int64
ReadChunk(reader *bufio.Reader, totalLines int) (string, error)
}

func NewChunk() *Chunk {
return NewChunkWithSize(defaultSize, defaultMaxPeekSize, defaultFileThreshold)
}

func NewChunkWithSize(size, maxPeekSize, smallFileThreshold int) *Chunk {
return &Chunk{
BufPool: &sync.Pool{
New: func() interface{} {
b := make([]byte, size)
return &b
},
},
PeekBufPool: &sync.Pool{
New: func() interface{} {
// pre-allocate enough capacity for initial chunk + peek
return bytes.NewBuffer(make([]byte, 0, size+maxPeekSize))
},
},
Size: size,
MaxPeekSize: maxPeekSize,
SmallFileThreshold: int64(smallFileThreshold),
}
}

func (c *Chunk) GetBuf() (*[]byte, bool) {
buf, ok := c.BufPool.Get().(*[]byte)
return buf, ok
}

func (c *Chunk) PutBuf(buf *[]byte) {
c.BufPool.Put(buf)
}

func (c *Chunk) GetPeekBuf(buf []byte) (*bytes.Buffer, bool) {
peekBuf, ok := c.PeekBufPool.Get().(*bytes.Buffer)
peekBuf.Reset()
peekBuf.Write(buf) // seed with buf
return peekBuf, ok
}

func (c *Chunk) PutPeekBuf(buf *bytes.Buffer) {
c.PeekBufPool.Put(buf)
}

func (c *Chunk) GetSize() int {
return c.Size
}

func (c *Chunk) GetMaxPeekSize() int {
return c.MaxPeekSize
}

func (c *Chunk) GetFileThreshold() int64 {
return c.SmallFileThreshold
}

// ReadChunk reads the next chunk of data from file
func (c *Chunk) ReadChunk(reader *bufio.Reader, totalLines int) (string, error) {
chunk, ok := c.GetBuf()
if !ok {
return "", fmt.Errorf("expected *[]byte, got %T", chunk)
}
defer c.PutBuf(chunk)

n, err := reader.Read(*chunk)
var chunkStr string
// "Callers should always process the n > 0 bytes returned before considering the error err."
// https://pkg.go.dev/io#Reader
if n > 0 {
// only check the filetype at the start of file
if totalLines == 0 && ShouldSkipFile((*chunk)[:n]) {
return "", fmt.Errorf("skipping file: %w", ErrUnsupportedFileType)
}

chunkStr, err = c.processChunk(reader, (*chunk)[:n])
if err != nil {
return "", err
}
}
if err != nil {
return "", err
}
return chunkStr, nil
}

// processChunk processes the chunk, reading until a safe boundary
func (c *Chunk) processChunk(reader *bufio.Reader, chunk []byte) (string, error) {
peekBuf, ok := c.GetPeekBuf(chunk)
if !ok {
return "", fmt.Errorf("expected *bytes.Buffer, got %T", peekBuf)
}
defer c.PutPeekBuf(peekBuf)

if readErr := c.readUntilSafeBoundary(reader, len(chunk), peekBuf); readErr != nil {
return "", fmt.Errorf("failed to read until safe boundary for file: %w", readErr)
}

return peekBuf.String(), nil
}

// readUntilSafeBoundary (hopefully) avoids splitting (https://github.com/gitleaks/gitleaks/issues/1651)
func (c *Chunk) readUntilSafeBoundary(r *bufio.Reader, n int, peekBuf *bytes.Buffer) error {
if peekBuf.Len() == 0 {
return nil
}

// keep reading until see our “\n…\n” boundary or hit limits
for peekBuf.Len()-n < c.MaxPeekSize {
if endsWithTwoNewlines(peekBuf.Bytes()) {
return nil
}

b, err := r.ReadByte()
if err != nil {
if err == io.EOF {
return nil
}
return fmt.Errorf("failed to read byte: %w", err)
}
peekBuf.WriteByte(b)
}

return nil
}

// endsWithTwoNewlines returns true if b ends in at least two '\n's (ignoring any number of ' ', '\r', or '\t' between them)
func endsWithTwoNewlines(b []byte) bool {
count := 0
for i := len(b) - 1; i >= 0; i-- {
if b[i] == '\n' {
count++
if count >= 2 {
return true
}
} else if unicode.IsSpace(rune(b[i])) {
// the presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count
continue
} else {
return false
}
}
return false
}

// ShouldSkipFile checks if the file should be skipped based on its content type
func ShouldSkipFile(data []byte) bool {
// TODO: could other optimizations be introduced here?
mimetype, err := filetype.Match(data)
if err != nil {
return true // could not determine file type
}
return mimetype.MIME.Type == "application" // skip binary files
}
Loading
Loading