|
| 1 | +import { describe, it, expect, beforeAll, afterAll, vi } from "vitest" |
1 | 2 | import { promises as fs } from "fs" |
2 | 3 | import path from "path" |
3 | 4 | import { readLines } from "../read-lines" |
4 | 5 |
|
5 | 6 | describe("nthline", () => { |
6 | 7 | const testFile = path.join(__dirname, "test.txt") |
7 | 8 |
|
| 9 | + // Helper function to create a temporary file, run a test, and clean up |
| 10 | + async function withTempFile(filename: string, content: string, testFn: (filepath: string) => Promise<void>) { |
| 11 | + const filepath = path.join(__dirname, filename) |
| 12 | + await fs.writeFile(filepath, content) |
| 13 | + try { |
| 14 | + await testFn(filepath) |
| 15 | + } finally { |
| 16 | + await fs.unlink(filepath) |
| 17 | + } |
| 18 | + } |
| 19 | + |
8 | 20 | beforeAll(async () => { |
9 | 21 | // Create a test file with numbered lines |
10 | 22 | const content = Array.from({ length: 10 }, (_, i) => `Line ${i + 1}`).join("\n") |
@@ -71,17 +83,6 @@ describe("nthline", () => { |
71 | 83 | await expect(readLines(testFile, 20, 15)).rejects.toThrow("does not exist") |
72 | 84 | }) |
73 | 85 |
|
74 | | - // Helper function to create a temporary file, run a test, and clean up |
75 | | - async function withTempFile(filename: string, content: string, testFn: (filepath: string) => Promise<void>) { |
76 | | - const filepath = path.join(__dirname, filename) |
77 | | - await fs.writeFile(filepath, content) |
78 | | - try { |
79 | | - await testFn(filepath) |
80 | | - } finally { |
81 | | - await fs.unlink(filepath) |
82 | | - } |
83 | | - } |
84 | | - |
85 | 86 | it("should handle empty files", async () => { |
86 | 87 | await withTempFile("empty.txt", "", async (filepath) => { |
87 | 88 | await expect(readLines(filepath, 0, 0)).rejects.toThrow("does not exist") |
@@ -129,4 +130,244 @@ describe("nthline", () => { |
129 | 130 | }) |
130 | 131 | }) |
131 | 132 | }) |
| 133 | + |
| 134 | + describe("bytesRead sampling for encoding detection", () => { |
| 135 | + it("should sample exactly 64KB for encoding detection on large files", async () => { |
| 136 | + // Create a large file with line breaks to test proper sampling |
| 137 | + const lineContent = "This is a test line for large file sampling\n" |
| 138 | + const linesNeeded = Math.ceil(100000 / lineContent.length) // Ensure > 64KB |
| 139 | + const largeContent = lineContent.repeat(linesNeeded) |
| 140 | + |
| 141 | + await withTempFile("large-file.txt", largeContent, async (filepath) => { |
| 142 | + // For large files, the function should read and process correctly |
| 143 | + // We'll verify the function works with large files that exceed 64KB |
| 144 | + const lines = await readLines(filepath, 1) // Read first 2 lines (0-1) |
| 145 | + |
| 146 | + // Verify that the content is read correctly |
| 147 | + expect(lines).toContain("This is a test line for large file sampling") |
| 148 | + // Should only contain 2 lines |
| 149 | + const lineArray = lines.split("\n").filter((line) => line.length > 0) |
| 150 | + expect(lineArray).toHaveLength(2) |
| 151 | + }) |
| 152 | + }) |
| 153 | + |
| 154 | + it("should handle files smaller than 64KB sampling correctly", async () => { |
| 155 | + const smallContent = "Line 1\nLine 2\nLine 3\n" |
| 156 | + |
| 157 | + await withTempFile("small-file.txt", smallContent, async (filepath) => { |
| 158 | + // For small files, the function should still attempt to read 64KB for encoding detection |
| 159 | + // We'll just verify the function works correctly with small files |
| 160 | + const lines = await readLines(filepath, 0) // Read first line (0) |
| 161 | + |
| 162 | + // Verify that the content is read correctly |
| 163 | + expect(lines).toContain("Line 1") |
| 164 | + expect(lines).not.toContain("Line 2") // Should only read first line |
| 165 | + }) |
| 166 | + }) |
| 167 | + |
| 168 | + it("should handle UTF-8 BOM in the 64KB sample correctly", async () => { |
| 169 | + // Create content with UTF-8 BOM at the beginning |
| 170 | + const bomBytes = Buffer.from([0xef, 0xbb, 0xbf]) |
| 171 | + const textContent = "Line 1 with UTF-8 content\nLine 2\nLine 3\n" |
| 172 | + const contentWithBOM = Buffer.concat([bomBytes, Buffer.from(textContent, "utf8")]) |
| 173 | + |
| 174 | + await withTempFile("bom-file.txt", contentWithBOM.toString(), async (filepath) => { |
| 175 | + // Write the actual binary content with BOM |
| 176 | + await fs.writeFile(filepath, contentWithBOM) |
| 177 | + |
| 178 | + const lines = await readLines(filepath, 0) // Read first line (0) |
| 179 | + |
| 180 | + // Should successfully read the content, BOM should be handled by encoding detection |
| 181 | + expect(lines).toContain("Line 1 with UTF-8 content") |
| 182 | + }) |
| 183 | + }) |
| 184 | + |
| 185 | + it("should handle UTF-16 LE BOM in the 64KB sample correctly", async () => { |
| 186 | + // Create content with UTF-16 LE BOM |
| 187 | + const bomBytes = Buffer.from([0xff, 0xfe]) |
| 188 | + const textContent = "Line 1\nLine 2\n" |
| 189 | + const utf16Content = Buffer.from(textContent, "utf16le") |
| 190 | + const contentWithBOM = Buffer.concat([bomBytes, utf16Content]) |
| 191 | + |
| 192 | + await withTempFile("utf16le-bom-file.txt", "", async (filepath) => { |
| 193 | + // Write the actual binary content with BOM |
| 194 | + await fs.writeFile(filepath, contentWithBOM) |
| 195 | + |
| 196 | + const lines = await readLines(filepath, 1) |
| 197 | + |
| 198 | + // Should successfully read the content, BOM should be handled by encoding detection |
| 199 | + expect(lines).toContain("Line 1") |
| 200 | + }) |
| 201 | + }) |
| 202 | + |
| 203 | + it("should handle partial multi-byte characters at 64KB boundary", async () => { |
| 204 | + // Create content where a multi-byte UTF-8 character might be split at 64KB boundary |
| 205 | + const lineContent = "Line with content: 你好世界\n" |
| 206 | + const linesNeeded = Math.ceil(65536 / lineContent.length) + 5 // Ensure > 64KB |
| 207 | + const fullContent = lineContent.repeat(linesNeeded) + "Final line after boundary\n" |
| 208 | + |
| 209 | + await withTempFile("multibyte-boundary.txt", fullContent, async (filepath) => { |
| 210 | + // Read the last few lines to check the content after the boundary |
| 211 | + const lines = await readLines(filepath, linesNeeded + 1, linesNeeded - 1) // Read last 3 lines |
| 212 | + expect(lines).toContain("Final line after boundary") |
| 213 | + // The multi-byte characters should be preserved |
| 214 | + expect(lines).toContain("你好世界") |
| 215 | + }) |
| 216 | + }) |
| 217 | + |
| 218 | + it("should handle encoding detection failure gracefully with 64KB sampling", async () => { |
| 219 | + // Create binary-like content that might confuse encoding detection |
| 220 | + const binaryLikeContent = Buffer.alloc(70000) // Larger than 64KB |
| 221 | + // Fill with values that might be detected as binary |
| 222 | + for (let i = 0; i < binaryLikeContent.length; i++) { |
| 223 | + binaryLikeContent[i] = i % 256 |
| 224 | + } |
| 225 | + // Add some text at the end |
| 226 | + const textPortion = Buffer.from("\nSome text at the end\n", "utf8") |
| 227 | + const mixedContent = Buffer.concat([binaryLikeContent, textPortion]) |
| 228 | + |
| 229 | + await withTempFile("mixed-content.txt", "", async (filepath) => { |
| 230 | + await fs.writeFile(filepath, mixedContent) |
| 231 | + |
| 232 | + // Should either succeed with fallback encoding or handle gracefully |
| 233 | + try { |
| 234 | + const lines = await readLines(filepath, 0, 0) |
| 235 | + // If it succeeds, it should contain the text portion |
| 236 | + expect(typeof lines).toBe("string") |
| 237 | + } catch (error) { |
| 238 | + // If it fails, it should be a meaningful error about binary content |
| 239 | + expect(error).toBeInstanceOf(Error) |
| 240 | + } |
| 241 | + }) |
| 242 | + }) |
| 243 | + }) |
| 244 | + |
| 245 | + describe("BOM preservation integration tests", () => { |
| 246 | + it("should preserve UTF-8 BOM when reading lines from file", async () => { |
| 247 | + // Create content with UTF-8 BOM |
| 248 | + const bomBytes = Buffer.from([0xef, 0xbb, 0xbf]) |
| 249 | + const textContent = "First line with UTF-8 content\nSecond line\nThird line\n" |
| 250 | + const contentWithBOM = Buffer.concat([bomBytes, Buffer.from(textContent, "utf8")]) |
| 251 | + |
| 252 | + await withTempFile("utf8-bom-integration.txt", "", async (filepath) => { |
| 253 | + // Write the actual binary content with BOM |
| 254 | + await fs.writeFile(filepath, contentWithBOM) |
| 255 | + |
| 256 | + // Read first line |
| 257 | + const firstLine = await readLines(filepath, 1) |
| 258 | + expect(firstLine).toContain("First line with UTF-8 content") |
| 259 | + |
| 260 | + // Read multiple lines |
| 261 | + const multipleLines = await readLines(filepath, 2) |
| 262 | + expect(multipleLines).toContain("First line with UTF-8 content") |
| 263 | + expect(multipleLines).toContain("Second line") |
| 264 | + |
| 265 | + // Read from specific line |
| 266 | + const fromSecondLine = await readLines(filepath, 1, 1) |
| 267 | + expect(fromSecondLine).toContain("Second line") |
| 268 | + }) |
| 269 | + }) |
| 270 | + |
| 271 | + it("should preserve UTF-16 LE BOM when reading lines from file", async () => { |
| 272 | + // Create content with UTF-16 LE BOM |
| 273 | + const bomBytes = Buffer.from([0xff, 0xfe]) |
| 274 | + const textContent = "UTF-16 LE first line\nUTF-16 LE second line\n" |
| 275 | + const utf16Content = Buffer.from(textContent, "utf16le") |
| 276 | + const contentWithBOM = Buffer.concat([bomBytes, utf16Content]) |
| 277 | + |
| 278 | + await withTempFile("utf16le-bom-integration.txt", "", async (filepath) => { |
| 279 | + // Write the actual binary content with BOM |
| 280 | + await fs.writeFile(filepath, contentWithBOM) |
| 281 | + |
| 282 | + // Read first line |
| 283 | + const firstLine = await readLines(filepath, 0) // Read first line (0) |
| 284 | + expect(firstLine).toContain("UTF-16 LE first line") |
| 285 | + |
| 286 | + // Read multiple lines |
| 287 | + const multipleLines = await readLines(filepath, 1) // Read first 2 lines (0-1) |
| 288 | + expect(multipleLines).toContain("UTF-16 LE first line") |
| 289 | + expect(multipleLines).toContain("UTF-16 LE second line") |
| 290 | + }) |
| 291 | + }) |
| 292 | + |
| 293 | + it("should preserve UTF-16 BE BOM when reading lines from file", async () => { |
| 294 | + // Create content with UTF-16 BE BOM |
| 295 | + const bomBytes = Buffer.from([0xfe, 0xff]) |
| 296 | + const textContent = "UTF-16 BE first line\nUTF-16 BE second line\n" |
| 297 | + // Manually create UTF-16 BE content |
| 298 | + const utf16beBytes = [] |
| 299 | + for (let i = 0; i < textContent.length; i++) { |
| 300 | + const charCode = textContent.charCodeAt(i) |
| 301 | + utf16beBytes.push((charCode >> 8) & 0xff) // High byte first |
| 302 | + utf16beBytes.push(charCode & 0xff) // Low byte second |
| 303 | + } |
| 304 | + const utf16Content = Buffer.from(utf16beBytes) |
| 305 | + const contentWithBOM = Buffer.concat([bomBytes, utf16Content]) |
| 306 | + |
| 307 | + await withTempFile("utf16be-bom-integration.txt", "", async (filepath) => { |
| 308 | + // Write the actual binary content with BOM |
| 309 | + await fs.writeFile(filepath, contentWithBOM) |
| 310 | + |
| 311 | + // Read first line |
| 312 | + const firstLine = await readLines(filepath, 0) // Read first line (0) |
| 313 | + expect(firstLine).toContain("UTF-16 BE first line") |
| 314 | + |
| 315 | + // Read multiple lines |
| 316 | + const multipleLines = await readLines(filepath, 1) // Read first 2 lines (0-1) |
| 317 | + expect(multipleLines).toContain("UTF-16 BE first line") |
| 318 | + expect(multipleLines).toContain("UTF-16 BE second line") |
| 319 | + }) |
| 320 | + }) |
| 321 | + |
| 322 | + it("should handle BOM preservation with large files that exceed 64KB sampling", async () => { |
| 323 | + // Create a large file with UTF-8 BOM that exceeds 64KB |
| 324 | + const bomBytes = Buffer.from([0xef, 0xbb, 0xbf]) |
| 325 | + const lineContent = "This is a test line with UTF-8 content and BOM: 你好世界 🌍\n" |
| 326 | + const linesNeeded = Math.ceil(65536 / lineContent.length) + 100 // Ensure > 64KB |
| 327 | + const largeTextContent = lineContent.repeat(linesNeeded) |
| 328 | + const contentWithBOM = Buffer.concat([bomBytes, Buffer.from(largeTextContent, "utf8")]) |
| 329 | + |
| 330 | + await withTempFile("large-utf8-bom-integration.txt", "", async (filepath) => { |
| 331 | + // Write the actual binary content with BOM |
| 332 | + await fs.writeFile(filepath, contentWithBOM) |
| 333 | + |
| 334 | + // Read first few lines |
| 335 | + const firstLines = await readLines(filepath, 3) |
| 336 | + expect(firstLines).toContain("This is a test line with UTF-8 content and BOM: 你好世界 🌍") |
| 337 | + |
| 338 | + // Read from middle of file (read 2 lines starting from line 50) |
| 339 | + const middleLines = await readLines(filepath, 51, 49) // Read lines 49-51 (0-based) |
| 340 | + expect(middleLines).toContain("This is a test line with UTF-8 content and BOM: 你好世界 🌍") |
| 341 | + |
| 342 | + // Verify the content is properly decoded despite BOM |
| 343 | + const lines = firstLines.split("\n") |
| 344 | + expect(lines[0]).not.toMatch(/^\uFEFF/) // BOM should not appear in decoded text |
| 345 | + }) |
| 346 | + }) |
| 347 | + |
| 348 | + it("should handle mixed BOM and non-BOM files correctly", async () => { |
| 349 | + // Test reading from a UTF-8 BOM file and then a regular UTF-8 file |
| 350 | + const bomBytes = Buffer.from([0xef, 0xbb, 0xbf]) |
| 351 | + const bomContent = Buffer.concat([bomBytes, Buffer.from("BOM file content\nSecond line\n", "utf8")]) |
| 352 | + const regularContent = "Regular UTF-8 content\nAnother line\n" |
| 353 | + |
| 354 | + await withTempFile("bom-file-mixed.txt", "", async (bomFilepath) => { |
| 355 | + await fs.writeFile(bomFilepath, bomContent) |
| 356 | + |
| 357 | + await withTempFile("regular-file-mixed.txt", regularContent, async (regularFilepath) => { |
| 358 | + // Read from BOM file |
| 359 | + const bomLines = await readLines(bomFilepath, 0) // Read first line (0) |
| 360 | + expect(bomLines).toContain("BOM file content") |
| 361 | + |
| 362 | + // Read from regular file |
| 363 | + const regularLines = await readLines(regularFilepath, 0) // Read first line (0) |
| 364 | + expect(regularLines).toContain("Regular UTF-8 content") |
| 365 | + |
| 366 | + // Both should work correctly without interference |
| 367 | + expect(bomLines).not.toContain("Regular UTF-8 content") |
| 368 | + expect(regularLines).not.toContain("BOM file content") |
| 369 | + }) |
| 370 | + }) |
| 371 | + }) |
| 372 | + }) |
132 | 373 | }) |
0 commit comments