|
3 | 3 | * Handles text file detection, reading, and validation |
4 | 4 | */ |
5 | 5 |
|
| 6 | +import { |
| 7 | + DEFAULT_BINARY_DETECTION_OPTIONS, |
| 8 | + type BinaryDetectionOptions |
| 9 | +} from '$lib/constants/binary-detection'; |
6 | 10 | import { FileExtensionText } from '$lib/enums/files'; |
7 | 11 |
|
8 | 12 | /** |
@@ -43,41 +47,51 @@ export async function readFileAsText(file: File): Promise<string> { |
/**
 * Heuristic check to determine if content is likely from a text file.
 *
 * Only the first `prefixLength` UTF-16 code units of `content` are inspected.
 * Within that sample two counters are kept:
 * - null characters (code 0), compared against `maxAbsoluteNullBytes`;
 * - "suspicious" characters: control codes 1-7 and 14-26, plus the Unicode
 *   replacement character U+FFFD (which indicates a decoding problem). Their
 *   share of the sample is compared against `suspiciousCharThresholdRatio`.
 *
 * Tab (9), newline (10) and carriage return (13) are treated as ordinary
 * whitespace. Codes 8, 11, 12 and 27-31 are deliberately not counted either.
 * Null characters are not included in the suspicious ratio.
 *
 * @param content - The file content to analyze
 * @param options - Optional overrides for the detection parameters. Any field
 *   that is omitted or explicitly `undefined` falls back to the value in
 *   `DEFAULT_BINARY_DETECTION_OPTIONS`.
 * @returns True if the content appears to be text-based (empty content is
 *   considered text)
 */
export function isLikelyTextFile(
	content: string,
	options: Partial<BinaryDetectionOptions> = {}
): boolean {
	if (!content) return true;

	// Resolve each setting individually. Spreading `options` over the defaults
	// would let an explicit `undefined` overwrite a default, which silently
	// disables the corresponding check (comparisons against undefined are false).
	const prefixLength = options.prefixLength ?? DEFAULT_BINARY_DETECTION_OPTIONS.prefixLength;
	const maxAbsoluteNullBytes =
		options.maxAbsoluteNullBytes ?? DEFAULT_BINARY_DETECTION_OPTIONS.maxAbsoluteNullBytes;
	const suspiciousCharThresholdRatio =
		options.suspiciousCharThresholdRatio ??
		DEFAULT_BINARY_DETECTION_OPTIONS.suspiciousCharThresholdRatio;

	const sample = content.substring(0, prefixLength);

	let nullCount = 0;
	let suspiciousCount = 0;

	for (let i = 0; i < sample.length; i++) {
		const charCode = sample.charCodeAt(i);

		// Null characters are tracked separately and never count as "suspicious".
		if (charCode === 0) {
			nullCount++;
			continue;
		}

		// Control codes 1-7 and 14-26; this range already excludes tab (9),
		// newline (10) and carriage return (13).
		const isSuspiciousControl = charCode < 8 || (charCode > 13 && charCode < 27);

		// U+FFFD shows up when the bytes could not be decoded as text.
		const isReplacementChar = charCode === 0xfffd;

		if (isSuspiciousControl || isReplacementChar) {
			suspiciousCount++;
		}
	}

	// Reject if too many null characters
	if (nullCount > maxAbsoluteNullBytes) return false;

	// A non-positive prefixLength yields an empty sample; there is nothing to
	// measure, so avoid the 0 / 0 division and treat the content as text.
	if (sample.length === 0) return true;

	// Reject if the share of suspicious characters exceeds the threshold
	return suspiciousCount / sample.length <= suspiciousCharThresholdRatio;
}
0 commit comments