@@ -15,6 +15,7 @@ import (
1515 "regexp"
1616 "runtime"
1717 "slices"
18+ "strconv"
1819 "strings"
1920 "sync"
2021
@@ -121,24 +122,28 @@ type FileType struct {
121122}
122123
123124var (
125+ ZMagic = []byte {0x78 , 0x5E } // Z magic bytes
126+ // partial MIME types we want to consider as valid by default.
127+ defaultMIME = []string {
128+ "application" ,
129+ "executable" ,
130+ "text/x-" ,
131+ }
132+ elfMagic = []byte {0x7f , 'E' , 'L' , 'F' } // ELF magic bytes
133+ gzipMagic = []byte {0x1f , 0x8b } // gZip magic bytes
124134 headerPool * pool.BufferPool
125135 initializeOnce sync.Once
126- versionRegex = regexp .MustCompile (`\d+\.\d+\.\d+$` )
127- // Magic byte constants for common file signatures.
128- elfMagic = []byte {0x7f , 'E' , 'L' , 'F' }
129- gzipMagic = []byte {0x1f , 0x8b }
130- ZMagic = []byte {0x78 , 0x5E }
131- shellShebangs = [][]byte {
132- []byte ("#!/bin/ash" ),
133- []byte ("#!/bin/bash" ),
134- []byte ("#!/bin/dash" ),
135- []byte ("#!/bin/fish" ),
136- []byte ("#!/bin/ksh" ),
137- []byte ("#!/bin/sh" ),
138- []byte ("#!/bin/zsh" ),
139- []byte ("#!/usr/bin/env bash" ),
140- []byte ("#!/usr/bin/env sh" ),
141- []byte ("#!/usr/bin/env zsh" ),
136+ // supported NPM JSON extensions or file names we want to avoid classifying as data files.
137+ npmJSON = []string {
138+ ".js.map" ,
139+ "package-lock.json" ,
140+ "package.json" ,
141+ }
142+ // supported NPM YAML file names we want to avoid classifying as data files.
143+ npmYAML = []string {
144+ "pnpm-lock.yaml" ,
145+ "pnpm-workspace.yaml" ,
146+ "yarn.lock" ,
142147 }
143148 shellPatterns = [][]byte {
144149 []byte ("; then\n " ),
@@ -160,6 +165,19 @@ var (
160165 []byte ("set -o " ),
161166 []byte ("export PATH" ),
162167 }
168+ shellShebangs = [][]byte {
169+ []byte ("#!/bin/ash" ),
170+ []byte ("#!/bin/bash" ),
171+ []byte ("#!/bin/dash" ),
172+ []byte ("#!/bin/fish" ),
173+ []byte ("#!/bin/ksh" ),
174+ []byte ("#!/bin/sh" ),
175+ []byte ("#!/bin/zsh" ),
176+ []byte ("#!/usr/bin/env bash" ),
177+ []byte ("#!/usr/bin/env sh" ),
178+ []byte ("#!/usr/bin/env zsh" ),
179+ }
180+ versionRegex = regexp .MustCompile (`\d+\.\d+\.\d+$` )
163181)
164182
165183// IsSupportedArchive returns whether a path can be processed by our archive extractor.
@@ -251,51 +269,43 @@ func makeFileType(path string, ext string, mime string) *FileType {
251269 return & FileType {Ext : ext , MIME : mime }
252270 }
253271
254- // typically, JSON and YAML files are data files only scanned via --all, but we want to support the NPM ecosystem
255- if strings .HasSuffix (path , "package.json" ) || strings .HasSuffix (path , "package-lock.json" ) || strings .Contains (path , ".js.map" ) {
256- return & FileType {
257- Ext : ext ,
258- MIME : "application/json" ,
259- }
260- }
261-
262- if strings .HasSuffix (path , "pnpm-lock.yaml" ) ||
263- strings .HasSuffix (path , "pnpm-workspace.yaml" ) ||
264- strings .HasSuffix (path , "yarn.lock" ) ||
265- strings .HasSuffix (path , ".policy" ) {
266- return & FileType {
267- Ext : ext ,
268- MIME : "application/x-yaml" ,
269- }
270- }
271-
272- if supportedKind [ext ] == "" {
272+ switch {
273+ // by default, JSON files will not have a defined MIME type,
274+ // but we want to specifically target the NPM ecosystem
275+ // using --all or --include-data-files will override these distinctions
276+ case containsSuffix (path , npmJSON ):
277+ return & FileType {Ext : ext , MIME : "application/json" }
278+ // by default, YAML files will also not have a defined MIME type,
279+ // but we want to specifically target the NPM ecosystem
280+ // using --all or --include-data-files will override these distinctions
281+ case containsSuffix (path , npmYAML ):
282+ return & FileType {Ext : ext , MIME : "application/x-yaml" }
283+ // the ordering of this statement is important
284+ // placing it first would prevent the preceding JSON/YAML statements from taking effect
285+ case supportedKind [ext ] == "" :
273286 return nil
274- }
275-
287+ // the following statements are not at risk of being preempted by the preceding statement
276288 // fix mimetype bug that defaults elf binaries to x-sharedlib
277- if mime == "application/x-sharedlib" && ! strings .Contains (path , ".so" ) {
289+ case mime == "application/x-sharedlib" && ! strings .Contains (path , ".so" ):
278290 return Path (".elf" )
279- }
280-
281291 // fix mimetype bug that detects certain .js files as shellscript
282- if mime == "text/x-shellscript" && strings .Contains (path , ".js" ) {
292+ case mime == "text/x-shellscript" && strings .Contains (path , ".js" ):
283293 return Path (".js" )
294+ // treat any remaining MIME type that matches one of the defaultMIME substrings as valid
295+ case containsValue (mime , defaultMIME ):
296+ return & FileType {Ext : ext , MIME : mime }
297+ default :
298+ return nil
284299 }
285-
286- if strings .Contains (mime , "application" ) || strings .Contains (mime , "text/x-" ) || strings .Contains (mime , "executable" ) {
287- return & FileType {
288- Ext : ext ,
289- MIME : mime ,
290- }
291- }
292-
293- return nil
294300}
295301
296- // isLikelyShellScript determines if file content is likely a shell script
302+ // isLikelyShellScript determines if a file's content resembles a shell script
297303// and focuses on multiple criteria to reduce false-positives.
298304func isLikelyShellScript (fc []byte , path string ) bool {
305+ if isLikelyManPage (path ) {
306+ return false
307+ }
308+
299309 if slices .ContainsFunc (shellShebangs , func (shebang []byte ) bool {
300310 return bytes .HasPrefix (fc , shebang )
301311 }) {
@@ -323,6 +333,31 @@ func isLikelyShellScript(fc []byte, path string) bool {
323333 return false
324334}
325335
336+ // isLikelyManPage checks a file's path and its extension to determine
337+ // if it is a man page (e.g., usr/share/man/man7/parallel_examples.7).
338+ func isLikelyManPage (path string ) bool {
339+ if strings .Contains (path , "usr/share/man/" ) {
340+ if _ , err := strconv .Atoi (strings .TrimPrefix (GetExt (path ), "." )); err == nil {
341+ return true
342+ }
343+ }
344+ return false
345+ }
346+
// containsSuffix reports whether value ends with at least one of the strings in slice.
// A nil or empty slice never matches.
func containsSuffix(value string, slice []string) bool {
	// IndexFunc yields -1 when no candidate is a suffix of value.
	match := slices.IndexFunc(slice, func(suffix string) bool {
		return strings.HasSuffix(value, suffix)
	})
	return match >= 0
}
353+
// containsValue reports whether at least one of the strings in slice occurs
// anywhere inside value. A nil or empty slice never matches.
func containsValue(value string, slice []string) bool {
	occursInValue := func(fragment string) bool {
		return strings.Contains(value, fragment)
	}
	return slices.IndexFunc(slice, occursInValue) != -1
}
360+
326361// File detects what kind of program this file might be.
327362func File (ctx context.Context , path string ) (* FileType , error ) {
328363 // Follow symlinks and return cleanly if the target does not exist
@@ -380,16 +415,18 @@ func File(ctx context.Context, path string) (*FileType, error) {
380415
381416 pathExt := strings .TrimPrefix (GetExt (path ), "." )
382417
383- if _ , pathExtKnown := supportedKind [pathExt ]; pathExtKnown {
384- return nil , nil
385- }
386-
387- if mime == "application/octet-stream" && len (pathExt ) >= 2 {
388- return nil , nil
389- }
418+ _ , pathExtKnown := supportedKind [pathExt ]
390419
391420 // Content-based detection for files with no recognized extension or mimetype
392421 switch {
422+ // if we track an extension in our supportedKind map and the file's type is still nil,
423+ // return nil (e.g., valid JSON or YAML files that we want to treat as data files by default)
424+ case pathExtKnown :
425+ return nil , nil
426+ case mime == "application/octet-stream" && len (pathExt ) >= 2 :
427+ return nil , nil
428+ case strings .Contains (mime , "text/plain" ) && isLikelyManPage (path ):
429+ return nil , nil
393430 case bytes .HasPrefix (fc , elfMagic ):
394431 return Path (".elf" ), nil
395432 case bytes .Contains (fc , []byte ("<?php" )):
@@ -410,9 +447,9 @@ func File(ctx context.Context, path string) (*FileType, error) {
410447 return Path (".gzip" ), nil
411448 case bytes .HasPrefix (fc , ZMagic ):
412449 return Path (".Z" ), nil
450+ default :
451+ return nil , nil
413452 }
414-
415- return nil , nil
416453}
417454
418455func initializeHeaderPool () {
0 commit comments