Commit 771130e

Abdeali/intm 2073 support combo detectors in nightfall code scanner (#75)
* adding sensitive content
* Added detector
* credit card and ssn
* changed config
* Changing logical op
* adding in single line
* Added in different line
* adding cc and ssn in single line
* Added range map data structure
* Added logic for combo detectors
* reviewdog comments
* trying with different sensitive info
* handling exception
* Skipping files greater than 512kb
* fixed unit test cases
* testing with phi detector
* reverting config
* remove unused code
* resolving comments for rangemap
* Addressed review comment
* using new line as content splitter
* reverting to space
* special character support

---------

Co-authored-by: Abdeali Adenwala <[email protected]>
1 parent c07aa5e commit 771130e

4 files changed (+257, -56 lines)

internal/clients/nightfall/nightfall.go

Lines changed: 114 additions & 25 deletions
@@ -17,13 +17,13 @@ import (
     nf "github.com/nightfallai/nightfall-go-sdk"
     "github.com/nightfallai/nightfall_code_scanner/internal/clients/diffreviewer"
     "github.com/nightfallai/nightfall_code_scanner/internal/clients/logger"
+    "github.com/nightfallai/nightfall_code_scanner/internal/datastructs"
     "github.com/nightfallai/nightfall_code_scanner/internal/nightfallconfig"
     "golang.org/x/text/cases"
     "golang.org/x/text/language"
 )

 const (
-    contentChunkByteSize = 1024
     // max number of items that can be sent to Nightfall API at a time
     maxItemsForAPIReq = 479
     // timeout for the total time spent sending scan requests and receiving responses for a diff
@@ -32,6 +32,8 @@ const (
     maxScanAttempts = 5
     // initial delay before re-attempting scan request
     initialDelay = time.Second
+
+    maxAPIRequestSize = 500 * 1024 // 500KB
 )

 // Client uses the Nightfall API to scan text for findings
@@ -73,6 +75,12 @@ type contentToScan struct {
     LineNumber int
 }

+type fileToScan struct {
+    Content          string
+    FilePath         string
+    ContentToLineMap *datastructs.RangeMap
+}
+
 func getCommentMsg(finding *nf.Finding) string {
     if finding.Finding == "" && finding.RedactedFinding == "" {
         return ""
@@ -175,6 +183,34 @@ func sliceListBySize(index, numItemsForMaxSize int, contentToScanList []*content
     return contentToScanList[startIndex:endIndex]
 }

+func createCommentsFromScanRespForFiles(inputContent []*fileToScan, resp *nf.ScanTextResponse, tokenExclusionList []string) []*diffreviewer.Comment {
+    comments := make([]*diffreviewer.Comment, 0)
+    for j, findingList := range resp.Findings {
+        for _, finding := range findingList {
+            if finding.Finding != "" && !isFindingInTokenExclusionList(finding.Finding, tokenExclusionList) {
+                // Found sensitive info
+                // Create comment if fragment is not in exclusion set
+                correspondingContent := inputContent[j]
+                exists, lineNumber, _ := correspondingContent.ContentToLineMap.Find(int(finding.Location.CodepointRange.Start))
+                if !exists {
+                    // should not come here
+                    continue
+                }
+                findingMsg := getCommentMsg(finding)
+                findingTitle := getCommentTitle(finding)
+                c := diffreviewer.Comment{
+                    FilePath:   correspondingContent.FilePath,
+                    LineNumber: lineNumber,
+                    Body:       findingMsg,
+                    Title:      findingTitle,
+                }
+                comments = append(comments, &c)
+            }
+        }
+    }
+    return comments
+}
+
 func createCommentsFromScanResp(inputContent []*contentToScan, resp *nf.ScanTextResponse, tokenExclusionList []string) []*diffreviewer.Comment {
     comments := make([]*diffreviewer.Comment, 0)
     for j, findingList := range resp.Findings {
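> Note: the `datastructs.RangeMap` used above lives in one of the other changed files and is not shown in this diff. A minimal sketch consistent with the calls made in this commit (`NewRangeMap`, `AddRange(start, end, lineNumber)`, and `Find` returning an exists flag plus the mapped line number) might look like the following; the field names and linear-scan lookup are illustrative assumptions, not the actual implementation.

```go
package datastructs

import "fmt"

// rangeEntry holds one inclusive [start, end] codepoint range and the value
// (here: a line number) mapped to it.
type rangeEntry struct {
	start, end int
	value      int
}

// RangeMap maps non-overlapping integer ranges to values.
type RangeMap struct {
	entries []rangeEntry
}

// NewRangeMap returns an empty RangeMap.
func NewRangeMap() *RangeMap {
	return &RangeMap{}
}

// AddRange records that positions start..end (inclusive) map to value.
// Ranges are expected to be added in increasing, non-overlapping order.
func (m *RangeMap) AddRange(start, end, value int) error {
	if start > end {
		return fmt.Errorf("invalid range [%d, %d]", start, end)
	}
	if n := len(m.entries); n > 0 && start <= m.entries[n-1].end {
		return fmt.Errorf("range [%d, %d] overlaps previous range", start, end)
	}
	m.entries = append(m.entries, rangeEntry{start: start, end: end, value: value})
	return nil
}

// Find reports whether pos falls inside a recorded range and, if so, the mapped value.
func (m *RangeMap) Find(pos int) (bool, int, error) {
	for _, e := range m.entries {
		if pos >= e.start && pos <= e.end {
			return true, e.value, nil
		}
	}
	return false, 0, nil
}
```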
@@ -230,9 +266,9 @@ func (n *Client) buildScanRequest(items []string) *nf.ScanTextRequest {
     }
 }

-func (n *Client) scanContent(
+func (n *Client) scanFileContent(
     ctx context.Context,
-    cts []*contentToScan,
+    cts []*fileToScan,
     requestNum int,
     logger logger.Logger,
 ) ([]*diffreviewer.Comment, error) {
@@ -250,45 +286,70 @@ func (n *Client) scanContent(
     }

     // Determine findings from response and create comments
-    createdComments := createCommentsFromScanResp(cts, resp, n.TokenExclusionList)
+    createdComments := createCommentsFromScanRespForFiles(cts, resp, n.TokenExclusionList)
     logger.Info(fmt.Sprintf("Got %d annotations for request #%d", len(createdComments), requestNum))
     return createdComments, nil
 }

-func (n *Client) scanAllContent(
+func (n *Client) scanAllFiles(
     ctx context.Context,
     logger logger.Logger,
-    cts []*contentToScan,
+    cts []*fileToScan,
     commentCh chan<- []*diffreviewer.Comment,
 ) {
     defer close(commentCh)
     blockingCh := make(chan struct{}, n.MaxNumberRoutines)
     var wg sync.WaitGroup
+    requestBatches := make([][]*fileToScan, 0)
+    endIndex := 0
+    for endIndex < len(cts) {
+        requestBatch := make([]*fileToScan, 0)
+        startIndex := endIndex
+        currentSize := 0
+        for {
+            if endIndex >= len(cts) {
+                break
+            }
+            size := len(cts[endIndex].Content)
+            if size > maxAPIRequestSize {
+                // file diff size is greater than supported by API platform, should not come here
+                logger.Error("terminating the scan early as large file scanning is not supported by API platform")
+                return
+            }
+            if (size + currentSize) > maxAPIRequestSize {
+                break
+            }

-    // Integer round up division
-    numRequestsRequired := (len(cts) + maxItemsForAPIReq - 1) / maxItemsForAPIReq
+            if (endIndex - startIndex) >= maxItemsForAPIReq {
+                break
+            }
+            currentSize += size
+            endIndex++
+        }
+        requestBatch = append(requestBatch, cts[startIndex:endIndex]...)
+        requestBatches = append(requestBatches, requestBatch)
+    }

+    numRequestsRequired := len(requestBatches)
     logger.Info(fmt.Sprintf("Sending %d requests to Nightfall API", numRequestsRequired))
     for i := 0; i < numRequestsRequired; i++ {
         // Use max number of items to determine content to send in request
-        contentSlice := sliceListBySize(i, maxItemsForAPIReq, cts)
-
         wg.Add(1)
         blockingCh <- struct{}{}
-        go func(loopCount int, cts []*contentToScan) {
+        go func(loopCount int, cts []*fileToScan) {
             defer wg.Done()
             if ctx.Err() != nil {
                 return
             }

-            c, err := n.scanContent(ctx, cts, loopCount+1, logger)
+            c, err := n.scanFileContent(ctx, cts, loopCount+1, logger)
             if err != nil {
                 logger.Error(fmt.Sprintf("Unable to scan %d content items", len(cts)))
             } else {
                 commentCh <- c
             }
             <-blockingCh
-        }(i, contentSlice)
+        }(i, requestBatches[i])
     }
     wg.Wait()
 }
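> Note: the new batching loop in `scanAllFiles` packs files into requests under two caps: the combined content size must stay within `maxAPIRequestSize`, and a batch may hold at most `maxItemsForAPIReq` files. The standalone sketch below applies the same partitioning rule to plain byte counts with made-up sizes; unlike the real code, it skips an oversized entry instead of aborting the scan.

```go
package main

import "fmt"

// batchSizes mirrors the batching rule above: each batch stays within maxBytes
// total and holds at most maxItems entries.
func batchSizes(sizes []int, maxBytes, maxItems int) [][]int {
	var batches [][]int
	end := 0
	for end < len(sizes) {
		if sizes[end] > maxBytes {
			// An oversized entry can never fit; skip it here (scanAllFiles aborts instead).
			end++
			continue
		}
		start, current := end, 0
		for end < len(sizes) && current+sizes[end] <= maxBytes && end-start < maxItems {
			current += sizes[end]
			end++
		}
		batches = append(batches, sizes[start:end])
	}
	return batches
}

func main() {
	// 300 KB and 150 KB fit in one 500 KB request; the next 100 KB starts a new batch.
	fmt.Println(batchSizes([]int{300 << 10, 150 << 10, 100 << 10}, 500<<10, 479))
}
```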
@@ -303,26 +364,25 @@ func (n *Client) Scan(ctx context.Context, items []string) (*nf.ScanTextResponse
 // contains sensitive data
 func (n *Client) ReviewDiff(ctx context.Context, logger logger.Logger, fileDiffs []*diffreviewer.FileDiff) ([]*diffreviewer.Comment, error) {
     fileDiffs = filterFileDiffs(fileDiffs, n.FileInclusionList, n.FileExclusionList, logger)
-    contentToScanList := make([]*contentToScan, 0, len(fileDiffs))
-    // Chunk fileDiffs content and store chunk and its metadata
+    fileToScanList := make([]*fileToScan, 0, len(fileDiffs))
+
     for _, fd := range fileDiffs {
-        for _, hunk := range fd.Hunks {
-            for _, line := range hunk.Lines {
-                chunkedContent, err := chunkContent(contentChunkByteSize, line, fd.PathNew)
-                if err != nil {
-                    logger.Error("Error chunking git diff")
-                    return nil, err
-                }
-                contentToScanList = append(contentToScanList, chunkedContent...)
-            }
+        file, err := getFileToScan(fd)
+        if err != nil {
+            return nil, err
         }
+        if len(file.Content) > maxAPIRequestSize {
+            logger.Warning(fmt.Sprintf("unable to scan file %s as its size exceeds the supported limit of %d Kbs", file.FilePath, maxAPIRequestSize/1024))
+            continue
+        }
+        fileToScanList = append(fileToScanList, file)
     }

     commentCh := make(chan []*diffreviewer.Comment)
     newCtx, cancel := context.WithDeadline(ctx, time.Now().Add(defaultTimeout))
     defer cancel()

-    go n.scanAllContent(newCtx, logger, contentToScanList, commentCh)
+    go n.scanAllFiles(newCtx, logger, fileToScanList, commentCh)

     comments := make([]*diffreviewer.Comment, 0)
     for {
@@ -338,6 +398,35 @@ func (n *Client) ReviewDiff(ctx context.Context, logger logger.Logger, fileDiffs
     }
 }

+func getFileToScan(fd *diffreviewer.FileDiff) (*fileToScan, error) {
+    fts := &fileToScan{
+        FilePath:         fd.PathNew,
+        ContentToLineMap: datastructs.NewRangeMap(),
+    }
+
+    bufferString := bytes.NewBufferString("")
+    startCodePointRange, endCodePointRange := 0, -1
+    for _, hunk := range fd.Hunks {
+        for _, line := range hunk.Lines {
+            startCodePointRange = endCodePointRange + 1
+            // adding space between each line
+            strToAdd := fmt.Sprintf("%s ", line.Content)
+            _, err := bufferString.WriteString(strToAdd)
+            if err != nil {
+                return nil, err
+            }
+            endCodePointRange += len([]rune(strToAdd))
+            err = fts.ContentToLineMap.AddRange(startCodePointRange, endCodePointRange, line.LnumNew)
+            if err != nil {
+                return nil, err
+            }
+        }
+    }
+
+    fts.Content = bufferString.String()
+    return fts, nil
+}
+
 func filterFileDiffs(fileDiffs []*diffreviewer.FileDiff, fileIncludeList, fileExcludeList []string, logger logger.Logger) []*diffreviewer.FileDiff {
     if len(fileIncludeList) > 0 {
         fileDiffs = filterByFilePath(fileDiffs, fileIncludeList, true, logger)
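> Note: because `getFileToScan` concatenates all diff lines of a file with a single space, a combo detector can match content that spans lines (for example a credit card number on one line and an SSN on another), and the codepoint ranges recorded in the RangeMap translate each finding's offset back to the line it starts on. The self-contained illustration below mirrors that bookkeeping with made-up line contents and line numbers; it is not part of the commit.

```go
package main

import "fmt"

func main() {
	// Two hypothetical diff lines as they might appear in a FileDiff hunk.
	lines := []struct {
		content string
		lnum    int
	}{
		{"card: 4242-4242-4242-4242", 12},
		{"ssn: 111-22-3333", 13},
	}

	// Mirror getFileToScan: join the lines with a single space and remember
	// which codepoint range belongs to which new-file line number.
	type span struct{ start, end, lnum int }
	var content string
	var spans []span
	end := -1
	for _, l := range lines {
		start := end + 1
		chunk := l.content + " "
		content += chunk
		end += len([]rune(chunk))
		spans = append(spans, span{start, end, l.lnum})
	}

	// A finding that starts inside the second line, e.g. at the SSN.
	findingStart := len([]rune("card: 4242-4242-4242-4242 ssn: "))
	for _, s := range spans {
		if findingStart >= s.start && findingStart <= s.end {
			fmt.Printf("content %q: finding at codepoint %d maps to line %d\n", content, findingStart, s.lnum)
		}
	}
}
```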
