Skip to content

Commit f67831c

Browse files
authored
fix: reduce misidentifications with manual filetype detection (#1263)
* fix: reduce misidentifications with manual filetype detection
  Signed-off-by: egibs <[email protected]>
* add vim to supportedKind map for future rule tuning
  Signed-off-by: egibs <[email protected]>
---------
Signed-off-by: egibs <[email protected]>
1 parent 2742b9c commit f67831c

File tree

13 files changed

+442
-264
lines changed

13 files changed

+442
-264
lines changed

pkg/action/archive_test.go

Lines changed: 0 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ import (
1313
"github.com/chainguard-dev/clog"
1414
"github.com/chainguard-dev/malcontent/pkg/archive"
1515
"github.com/chainguard-dev/malcontent/pkg/malcontent"
16-
"github.com/chainguard-dev/malcontent/pkg/programkind"
1716
"github.com/chainguard-dev/malcontent/pkg/render"
1817
"github.com/chainguard-dev/malcontent/rules"
1918
thirdparty "github.com/chainguard-dev/malcontent/third_party"
@@ -398,80 +397,6 @@ func TestScanConflictingArchiveFiles(t *testing.T) {
398397
}
399398
}
400399

401-
func TestGetExt(t *testing.T) {
402-
tests := []struct {
403-
path string
404-
want string
405-
}{
406-
{
407-
path: "testdata/file.apk",
408-
want: ".apk",
409-
}, {
410-
path: "testdata/file.jar",
411-
want: ".jar",
412-
}, {
413-
path: "testdata/file.tar",
414-
want: ".tar",
415-
}, {
416-
path: "testdata/file.tgz",
417-
want: ".tgz",
418-
}, {
419-
path: "testdata/file.tar.gz",
420-
want: ".tar.gz",
421-
}, {
422-
path: "testdata/file.tar.xz",
423-
want: ".tar.xz",
424-
}, {
425-
path: "testdata/file.zip",
426-
want: ".zip",
427-
}, {
428-
path: "testdata/file_1.0.0",
429-
want: "",
430-
}, {
431-
path: "testdata/file_1.0.0.apk",
432-
want: ".apk",
433-
}, {
434-
path: "testdata/file_1.0.0.jar",
435-
want: ".jar",
436-
}, {
437-
path: "testdata/file_1.0.0.tar",
438-
want: ".tar",
439-
}, {
440-
path: "testdata/file_1.0.0.tgz",
441-
want: ".tgz",
442-
}, {
443-
path: "testdata/file_1.0.0.tar.gz",
444-
want: ".tar.gz",
445-
}, {
446-
path: "testdata/file_1.0.0.tar.xz",
447-
want: ".tar.xz",
448-
}, {
449-
path: "testdata/file_1.0.0.zip",
450-
want: ".zip",
451-
}, {
452-
path: "testdata/file.a.b.c.tar.gz",
453-
want: ".tar.gz",
454-
}, {
455-
path: "testdata/file_a.b.c.tar.xz",
456-
want: ".tar.xz",
457-
}, {
458-
path: "testdata/file_a.b.0.tar",
459-
want: ".tar",
460-
}, {
461-
path: "testdata/file_no_ext",
462-
want: "",
463-
},
464-
}
465-
for _, tt := range tests {
466-
t.Run(tt.path, func(t *testing.T) {
467-
t.Parallel()
468-
if got := programkind.GetExt(tt.path); got != tt.want {
469-
t.Errorf("Ext() = %v, want %v", got, tt.want)
470-
}
471-
})
472-
}
473-
}
474-
475400
func TestIsValidPath(t *testing.T) {
476401
tmpRoot, err := os.MkdirTemp("", "isValidPath-*")
477402
if err != nil {

pkg/programkind/programkind.go

Lines changed: 89 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"path/filepath"
1515
"regexp"
1616
"runtime"
17+
"slices"
1718
"strings"
1819
"sync"
1920

@@ -74,6 +75,7 @@ var supportedKind = map[string]string{
7475
"html": "",
7576
"java": "text/x-java",
7677
"js": "application/javascript",
78+
"json": "",
7779
"ko": "application/x-object",
7880
"lnk": "application/x-ms-shortcut",
7981
"lua": "text/x-lua",
@@ -99,7 +101,10 @@ var supportedKind = map[string]string{
99101
"sh": "text/x-shellscript",
100102
"so": "application/x-sharedlib",
101103
"ts": "application/typescript",
104+
"txt": "",
102105
"upx": "application/x-upx",
106+
"vbs": "text/x-vbscript",
107+
"vim": "text/x-vim",
103108
"yaml": "",
104109
"yara": "",
105110
"yml": "",
@@ -116,9 +121,41 @@ var (
116121
initializeOnce sync.Once
117122
versionRegex = regexp.MustCompile(`\d+\.\d+\.\d+$`)
118123
// Magic byte constants for common file signatures.
119-
elfMagic = []byte{0x7f, 'E', 'L', 'F'}
120-
gzipMagic = []byte{0x1f, 0x8b}
121-
ZMagic = []byte{0x78, 0x5E}
124+
elfMagic = []byte{0x7f, 'E', 'L', 'F'}
125+
gzipMagic = []byte{0x1f, 0x8b}
126+
ZMagic = []byte{0x78, 0x5E}
127+
shellShebangs = [][]byte{
128+
[]byte("#!/bin/ash"),
129+
[]byte("#!/bin/bash"),
130+
[]byte("#!/bin/dash"),
131+
[]byte("#!/bin/fish"),
132+
[]byte("#!/bin/ksh"),
133+
[]byte("#!/bin/sh"),
134+
[]byte("#!/bin/zsh"),
135+
[]byte("#!/usr/bin/env bash"),
136+
[]byte("#!/usr/bin/env sh"),
137+
[]byte("#!/usr/bin/env zsh"),
138+
}
139+
shellPatterns = [][]byte{
140+
[]byte("; then\n"),
141+
[]byte("; do\n"),
142+
[]byte("esac"),
143+
[]byte("fi\n"),
144+
[]byte("done\n"),
145+
[]byte("$(("),
146+
[]byte("$("),
147+
[]byte("${"),
148+
[]byte("<<EOF"),
149+
[]byte("<<-EOF"),
150+
[]byte("<<'EOF'"),
151+
[]byte("|| exit"),
152+
[]byte("&& exit"),
153+
[]byte("set -e"),
154+
[]byte("set -x"),
155+
[]byte("set -u"),
156+
[]byte("set -o "),
157+
[]byte("export PATH"),
158+
}
122159
)
123160

124161
// IsSupportedArchive returns whether a path can be processed by our archive extractor.
@@ -203,12 +240,15 @@ func makeFileType(path string, ext string, mime string) *FileType {
203240
ext = strings.TrimPrefix(ext, ".")
204241

205242
// Archives are supported
243+
if _, ok := ArchiveMap[ext]; ok {
244+
return &FileType{Ext: ext, MIME: mime}
245+
}
206246
if _, ok := ArchiveMap[GetExt(path)]; ok {
207247
return &FileType{Ext: ext, MIME: mime}
208248
}
209249

210250
// typically, JSON and YAML files are data files only scanned via --all, but we want to support the NPM ecosystem
211-
if strings.HasSuffix(path, "package.json") || strings.HasSuffix(path, "package-lock.json") {
251+
if strings.HasSuffix(path, "package.json") || strings.HasSuffix(path, "package-lock.json") || strings.Contains(path, ".js.map") {
212252
return &FileType{
213253
Ext: ext,
214254
MIME: "application/json",
@@ -248,6 +288,36 @@ func makeFileType(path string, ext string, mime string) *FileType {
248288
return nil
249289
}
250290

291+
// isLikelyShellScript determines if file content is likely a shell script
292+
// and focuses on multiple criteria to reduce false-positives.
293+
func isLikelyShellScript(fc []byte, path string) bool {
294+
if slices.ContainsFunc(shellShebangs, func(shebang []byte) bool {
295+
return bytes.HasPrefix(fc, shebang)
296+
}) {
297+
return true
298+
}
299+
300+
if strings.HasSuffix(path, "profile") ||
301+
strings.HasSuffix(path, ".bashrc") ||
302+
strings.HasSuffix(path, ".bash_profile") ||
303+
strings.HasSuffix(path, ".zshrc") ||
304+
strings.HasSuffix(path, ".zsh_profile") {
305+
return true
306+
}
307+
308+
matches := 0
309+
for _, pattern := range shellPatterns {
310+
if bytes.Contains(fc, pattern) {
311+
matches++
312+
if matches >= 2 {
313+
return true
314+
}
315+
}
316+
}
317+
318+
return false
319+
}
320+
251321
// File detects what kind of program this file might be.
252322
func File(ctx context.Context, path string) (*FileType, error) {
253323
// Follow symlinks and return cleanly if the target does not exist
@@ -293,7 +363,8 @@ func File(ctx context.Context, path string) (*FileType, error) {
293363
// default strategy: mimetype (no limit for improved magic type detection)
294364
mimetype.SetLimit(0) // a limit of 0 means the whole input file will be used
295365
mtype := mimetype.Detect(fc)
296-
if ft := makeFileType(path, mtype.Extension(), mtype.String()); ft != nil {
366+
ext, mime := mtype.Extension(), mtype.String()
367+
if ft := makeFileType(path, ext, mime); ft != nil {
297368
return ft, nil
298369
}
299370

@@ -302,6 +373,17 @@ func File(ctx context.Context, path string) (*FileType, error) {
302373
return mtype, nil
303374
}
304375

376+
pathExt := strings.TrimPrefix(GetExt(path), ".")
377+
378+
if _, pathExtKnown := supportedKind[pathExt]; pathExtKnown {
379+
return nil, nil
380+
}
381+
382+
if mime == "application/octet-stream" && len(pathExt) >= 2 {
383+
return nil, nil
384+
}
385+
386+
// Content-based detection for files with no recognized extension or mimetype
305387
switch {
306388
case bytes.HasPrefix(fc, elfMagic):
307389
return Path(".elf"), nil
@@ -311,18 +393,7 @@ func File(ctx context.Context, path string) (*FileType, error) {
311393
return Path(".py"), nil
312394
case bytes.Contains(fc, []byte(" = require(")):
313395
return Path(".js"), nil
314-
case bytes.HasPrefix(fc, []byte("#!/bin/ash")) ||
315-
bytes.HasPrefix(fc, []byte("#!/bin/bash")) ||
316-
bytes.HasPrefix(fc, []byte("#!/bin/fish")) ||
317-
bytes.HasPrefix(fc, []byte("#!/bin/sh")) ||
318-
bytes.HasPrefix(fc, []byte("#!/bin/zsh")) ||
319-
bytes.Contains(fc, []byte("if [")) ||
320-
bytes.Contains(fc, []byte("if !")) ||
321-
bytes.Contains(fc, []byte("echo ")) ||
322-
bytes.Contains(fc, []byte("grep ")) ||
323-
bytes.Contains(fc, []byte("; then")) ||
324-
bytes.Contains(fc, []byte("export ")) ||
325-
strings.HasSuffix(path, "profile"):
396+
case isLikelyShellScript(fc, path):
326397
return Path(".sh"), nil
327398
case bytes.HasPrefix(fc, []byte("#!")):
328399
return Path(".script"), nil
@@ -347,7 +418,7 @@ func initializeHeaderPool() {
347418

348419
// Path returns a filetype based strictly on file path.
349420
func Path(path string) *FileType {
350-
ext := strings.ReplaceAll(filepath.Ext(path), ".", "")
421+
ext := strings.TrimPrefix(GetExt(path), ".")
351422
mime := supportedKind[ext]
352423
return makeFileType(path, ext, mime)
353424
}

0 commit comments

Comments
 (0)