Skip to content

Commit 8aef903

Browse files
authored
feat: improve detection of man pages; improve clarity of programkind code (#1286)
* feat: improve detection of man pages; improve clarity of programkind code Signed-off-by: egibs <[email protected]> * move man page check into isLikelyShellScript Signed-off-by: egibs <[email protected]> --------- Signed-off-by: egibs <[email protected]>
1 parent 9668626 commit 8aef903

File tree

1 file changed

+97
-60
lines changed

1 file changed

+97
-60
lines changed

pkg/programkind/programkind.go

Lines changed: 97 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"regexp"
1616
"runtime"
1717
"slices"
18+
"strconv"
1819
"strings"
1920
"sync"
2021

@@ -121,24 +122,28 @@ type FileType struct {
121122
}
122123

123124
var (
125+
ZMagic = []byte{0x78, 0x5E} // Z magic bytes
126+
// partial MIME types (matched as substrings) that we consider valid by default.
127+
defaultMIME = []string{
128+
"application",
129+
"executable",
130+
"text/x-",
131+
}
132+
elfMagic = []byte{0x7f, 'E', 'L', 'F'} // ELF magic bytes
133+
gzipMagic = []byte{0x1f, 0x8b} // gZip magic bytes
124134
headerPool *pool.BufferPool
125135
initializeOnce sync.Once
126-
versionRegex = regexp.MustCompile(`\d+\.\d+\.\d+$`)
127-
// Magic byte constants for common file signatures.
128-
elfMagic = []byte{0x7f, 'E', 'L', 'F'}
129-
gzipMagic = []byte{0x1f, 0x8b}
130-
ZMagic = []byte{0x78, 0x5E}
131-
shellShebangs = [][]byte{
132-
[]byte("#!/bin/ash"),
133-
[]byte("#!/bin/bash"),
134-
[]byte("#!/bin/dash"),
135-
[]byte("#!/bin/fish"),
136-
[]byte("#!/bin/ksh"),
137-
[]byte("#!/bin/sh"),
138-
[]byte("#!/bin/zsh"),
139-
[]byte("#!/usr/bin/env bash"),
140-
[]byte("#!/usr/bin/env sh"),
141-
[]byte("#!/usr/bin/env zsh"),
136+
// supported NPM JSON extensions or file names we want to avoid classifying as data files.
137+
npmJSON = []string{
138+
".js.map",
139+
"package-lock.json",
140+
"package.json",
141+
}
142+
// supported NPM YAML file names we want to avoid classifying as data files.
143+
npmYAML = []string{
144+
"pnpm-lock.yaml",
145+
"pnpm-workspace.yaml",
146+
"yarn.lock",
142147
}
143148
shellPatterns = [][]byte{
144149
[]byte("; then\n"),
@@ -160,6 +165,19 @@ var (
160165
[]byte("set -o "),
161166
[]byte("export PATH"),
162167
}
168+
shellShebangs = [][]byte{
169+
[]byte("#!/bin/ash"),
170+
[]byte("#!/bin/bash"),
171+
[]byte("#!/bin/dash"),
172+
[]byte("#!/bin/fish"),
173+
[]byte("#!/bin/ksh"),
174+
[]byte("#!/bin/sh"),
175+
[]byte("#!/bin/zsh"),
176+
[]byte("#!/usr/bin/env bash"),
177+
[]byte("#!/usr/bin/env sh"),
178+
[]byte("#!/usr/bin/env zsh"),
179+
}
180+
versionRegex = regexp.MustCompile(`\d+\.\d+\.\d+$`)
163181
)
164182

165183
// IsSupportedArchive returns whether a path can be processed by our archive extractor.
@@ -251,51 +269,43 @@ func makeFileType(path string, ext string, mime string) *FileType {
251269
return &FileType{Ext: ext, MIME: mime}
252270
}
253271

254-
// typically, JSON and YAML files are data files only scanned via --all, but we want to support the NPM ecosystem
255-
if strings.HasSuffix(path, "package.json") || strings.HasSuffix(path, "package-lock.json") || strings.Contains(path, ".js.map") {
256-
return &FileType{
257-
Ext: ext,
258-
MIME: "application/json",
259-
}
260-
}
261-
262-
if strings.HasSuffix(path, "pnpm-lock.yaml") ||
263-
strings.HasSuffix(path, "pnpm-workspace.yaml") ||
264-
strings.HasSuffix(path, "yarn.lock") ||
265-
strings.HasSuffix(path, ".policy") {
266-
return &FileType{
267-
Ext: ext,
268-
MIME: "application/x-yaml",
269-
}
270-
}
271-
272-
if supportedKind[ext] == "" {
272+
switch {
273+
// by default, JSON files will not have a defined MIME type,
274+
// but we want to specifically target the NPM ecosystem
275+
// using --all or --include-data-files will override these distinctions
276+
case containsSuffix(path, npmJSON):
277+
return &FileType{Ext: ext, MIME: "application/json"}
278+
// by default, YAML files will also not have a defined MIME type,
279+
// but we want to specifically target the NPM ecosystem
280+
// using --all or --include-data-files will override these distinctions
281+
case containsSuffix(path, npmYAML):
282+
return &FileType{Ext: ext, MIME: "application/x-yaml"}
283+
// the ordering of this statement is important
284+
// placing it first would prevent the preceding JSON/YAML statements from taking effect
285+
case supportedKind[ext] == "":
273286
return nil
274-
}
275-
287+
// the following statements are not at risk of being preempted by the preceding statement
276288
// fix mimetype bug that defaults elf binaries to x-sharedlib
277-
if mime == "application/x-sharedlib" && !strings.Contains(path, ".so") {
289+
case mime == "application/x-sharedlib" && !strings.Contains(path, ".so"):
278290
return Path(".elf")
279-
}
280-
281291
// fix mimetype bug that detects certain .js files as shellscript
282-
if mime == "text/x-shellscript" && strings.Contains(path, ".js") {
292+
case mime == "text/x-shellscript" && strings.Contains(path, ".js"):
283293
return Path(".js")
294+
// accept any remaining MIME type that contains one of the defaultMIME substrings
295+
case containsValue(mime, defaultMIME):
296+
return &FileType{Ext: ext, MIME: mime}
297+
default:
298+
return nil
284299
}
285-
286-
if strings.Contains(mime, "application") || strings.Contains(mime, "text/x-") || strings.Contains(mime, "executable") {
287-
return &FileType{
288-
Ext: ext,
289-
MIME: mime,
290-
}
291-
}
292-
293-
return nil
294300
}
295301

296-
// isLikelyShellScript determines if file content is likely a shell script
302+
// isLikelyShellScript determines if a file's content resembles a shell script
297303
// and focuses on multiple criteria to reduce false-positives.
298304
func isLikelyShellScript(fc []byte, path string) bool {
305+
if isLikelyManPage(path) {
306+
return false
307+
}
308+
299309
if slices.ContainsFunc(shellShebangs, func(shebang []byte) bool {
300310
return bytes.HasPrefix(fc, shebang)
301311
}) {
@@ -323,6 +333,31 @@ func isLikelyShellScript(fc []byte, path string) bool {
323333
return false
324334
}
325335

336+
// isLikelyManPage checks a file's path and its extension to determine
337+
// if it is a man page (e.g., usr/share/man/man7/parallel_examples.7).
338+
func isLikelyManPage(path string) bool {
339+
if strings.Contains(path, "usr/share/man/") {
340+
if _, err := strconv.Atoi(strings.TrimPrefix(GetExt(path), ".")); err == nil {
341+
return true
342+
}
343+
}
344+
return false
345+
}
346+
347+
// containsSuffix determines whether a value ends with any of the specified suffixes.
348+
func containsSuffix(value string, slice []string) bool {
349+
return slices.ContainsFunc(slice, func(s string) bool {
350+
return strings.HasSuffix(value, s)
351+
})
352+
}
353+
354+
// containsValue determines whether a value contains any of the specified substrings.
355+
func containsValue(value string, slice []string) bool {
356+
return slices.ContainsFunc(slice, func(s string) bool {
357+
return strings.Contains(value, s)
358+
})
359+
}
360+
326361
// File detects what kind of program this file might be.
327362
func File(ctx context.Context, path string) (*FileType, error) {
328363
// Follow symlinks and return cleanly if the target does not exist
@@ -380,16 +415,18 @@ func File(ctx context.Context, path string) (*FileType, error) {
380415

381416
pathExt := strings.TrimPrefix(GetExt(path), ".")
382417

383-
if _, pathExtKnown := supportedKind[pathExt]; pathExtKnown {
384-
return nil, nil
385-
}
386-
387-
if mime == "application/octet-stream" && len(pathExt) >= 2 {
388-
return nil, nil
389-
}
418+
_, pathExtKnown := supportedKind[pathExt]
390419

391420
// Content-based detection for files with no recognized extension or mimetype
392421
switch {
422+
// if we track an extension in our supportedKind map and the file's type is still nil,
423+
// return nil (e.g., valid JSON or YAML files that we want to treat as data files by default)
424+
case pathExtKnown:
425+
return nil, nil
426+
case mime == "application/octet-stream" && len(pathExt) >= 2:
427+
return nil, nil
428+
case strings.Contains(mime, "text/plain") && isLikelyManPage(path):
429+
return nil, nil
393430
case bytes.HasPrefix(fc, elfMagic):
394431
return Path(".elf"), nil
395432
case bytes.Contains(fc, []byte("<?php")):
@@ -410,9 +447,9 @@ func File(ctx context.Context, path string) (*FileType, error) {
410447
return Path(".gzip"), nil
411448
case bytes.HasPrefix(fc, ZMagic):
412449
return Path(".Z"), nil
450+
default:
451+
return nil, nil
413452
}
414-
415-
return nil, nil
416453
}
417454

418455
func initializeHeaderPool() {

0 commit comments

Comments
 (0)