Skip to content

Commit d81f79d

Browse files
committed
Fixes #5146: Apply maxReadFileLine limit to Excel files
- Modified extractTextFromXLSX to accept maxRows option - Updated extractTextFromFile to pass row limit to Excel extraction - Modified readFileTool to apply maxReadFileLine setting to Excel files - Added comprehensive tests for row limiting functionality - Excel files now respect the same line limits as regular text files - Prevents 'input length too long' errors when reading large Excel files
1 parent 3a8ba27 commit d81f79d

File tree

4 files changed

+140
-10
lines changed

4 files changed

+140
-10
lines changed

src/core/tools/readFileTool.ts

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -517,11 +517,28 @@ export async function readFileTool(
517517
}
518518

519519
// Handle normal file read
520-
const content = await extractTextFromFile(fullPath)
521-
const lineRangeAttr = ` lines="1-${totalLines}"`
522-
let xmlInfo = totalLines > 0 ? `<content${lineRangeAttr}>\n${content}</content>\n` : `<content/>`
520+
const fileExtension = path.extname(relPath).toLowerCase()
521+
let content: string
522+
let actualLines = totalLines
523+
524+
// For Excel files, apply maxReadFileLine as row limit
525+
if (fileExtension === ".xlsx" && maxReadFileLine > 0) {
526+
content = await extractTextFromFile(fullPath, { maxRows: maxReadFileLine })
527+
// Count actual lines in the extracted content to get accurate line count
528+
actualLines = content.split("\n").length
529+
} else {
530+
content = await extractTextFromFile(fullPath)
531+
}
532+
533+
const lineRangeAttr = ` lines="1-${actualLines}"`
534+
let xmlInfo = actualLines > 0 ? `<content${lineRangeAttr}>\n${content}</content>\n` : `<content/>`
535+
536+
// Add truncation notice for Excel files if they were limited
537+
if (fileExtension === ".xlsx" && maxReadFileLine > 0 && content.includes("[... truncated at")) {
538+
xmlInfo += `<notice>Excel file truncated to ${maxReadFileLine} rows. Use line_range if you need to read more data</notice>\n`
539+
}
523540

524-
if (totalLines === 0) {
541+
if (actualLines === 0) {
525542
xmlInfo += `<notice>File is empty</notice>\n`
526543
}
527544

src/integrations/misc/__tests__/extract-text-from-xlsx.test.ts

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,4 +218,98 @@ describe("extractTextFromXLSX", () => {
218218
await expect(extractTextFromXLSX("/non/existent/file.xlsx")).rejects.toThrow()
219219
})
220220
})
221+
222+
describe("row limiting", () => {
223+
it("should respect maxRows option", async () => {
224+
const workbook = new ExcelJS.Workbook()
225+
const worksheet = workbook.addWorksheet("Sheet1")
226+
227+
// Add 10 rows of data
228+
for (let i = 1; i <= 10; i++) {
229+
worksheet.getCell(`A${i}`).value = `Row ${i}`
230+
worksheet.getCell(`B${i}`).value = `Data ${i}`
231+
}
232+
233+
const result = await extractTextFromXLSX(workbook, { maxRows: 5 })
234+
235+
expect(result).toContain("Row 1")
236+
expect(result).toContain("Row 5")
237+
expect(result).not.toContain("Row 6")
238+
expect(result).toContain("[... truncated at 5 total rows across all sheets ...]")
239+
})
240+
241+
it("should not truncate when maxRows is not exceeded", async () => {
242+
const workbook = new ExcelJS.Workbook()
243+
const worksheet = workbook.addWorksheet("Sheet1")
244+
245+
// Add 3 rows of data
246+
for (let i = 1; i <= 3; i++) {
247+
worksheet.getCell(`A${i}`).value = `Row ${i}`
248+
}
249+
250+
const result = await extractTextFromXLSX(workbook, { maxRows: 5 })
251+
252+
expect(result).toContain("Row 1")
253+
expect(result).toContain("Row 3")
254+
expect(result).not.toContain("truncated")
255+
})
256+
257+
it("should handle maxRows across multiple sheets", async () => {
258+
const workbook = new ExcelJS.Workbook()
259+
260+
const sheet1 = workbook.addWorksheet("Sheet1")
261+
for (let i = 1; i <= 3; i++) {
262+
sheet1.getCell(`A${i}`).value = `Sheet1 Row ${i}`
263+
}
264+
265+
const sheet2 = workbook.addWorksheet("Sheet2")
266+
for (let i = 1; i <= 3; i++) {
267+
sheet2.getCell(`A${i}`).value = `Sheet2 Row ${i}`
268+
}
269+
270+
const result = await extractTextFromXLSX(workbook, { maxRows: 4 })
271+
272+
expect(result).toContain("Sheet1 Row 1")
273+
expect(result).toContain("Sheet1 Row 3")
274+
expect(result).toContain("Sheet2 Row 1")
275+
expect(result).not.toContain("Sheet2 Row 2")
276+
expect(result).toContain("[... truncated at 4 total rows across all sheets ...]")
277+
})
278+
279+
it("should use default limit when no maxRows specified", async () => {
280+
const workbook = new ExcelJS.Workbook()
281+
const worksheet = workbook.addWorksheet("Sheet1")
282+
283+
// Add a few rows
284+
for (let i = 1; i <= 5; i++) {
285+
worksheet.getCell(`A${i}`).value = `Row ${i}`
286+
}
287+
288+
const result = await extractTextFromXLSX(workbook)
289+
290+
expect(result).toContain("Row 1")
291+
expect(result).toContain("Row 5")
292+
expect(result).not.toContain("truncated")
293+
})
294+
295+
it("should handle empty rows correctly when counting towards limit", async () => {
296+
const workbook = new ExcelJS.Workbook()
297+
const worksheet = workbook.addWorksheet("Sheet1")
298+
299+
// Add rows with some empty ones
300+
worksheet.getCell("A1").value = "Row 1"
301+
// Row 2 is empty
302+
worksheet.getCell("A3").value = "Row 3"
303+
worksheet.getCell("A4").value = "Row 4"
304+
worksheet.getCell("A5").value = "Row 5"
305+
306+
const result = await extractTextFromXLSX(workbook, { maxRows: 3 })
307+
308+
expect(result).toContain("Row 1")
309+
expect(result).toContain("Row 3")
310+
expect(result).toContain("Row 4")
311+
expect(result).not.toContain("Row 5")
312+
expect(result).toContain("[... truncated at 3 total rows across all sheets ...]")
313+
})
314+
})
221315
})

src/integrations/misc/extract-text-from-xlsx.ts

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import ExcelJS from "exceljs"
22

3-
const ROW_LIMIT = 50000
3+
const DEFAULT_ROW_LIMIT = 50000
44

55
function formatCellValue(cell: ExcelJS.Cell): string {
66
const value = cell.value
@@ -40,9 +40,13 @@ function formatCellValue(cell: ExcelJS.Cell): string {
4040
return value.toString()
4141
}
4242

43-
export async function extractTextFromXLSX(filePathOrWorkbook: string | ExcelJS.Workbook): Promise<string> {
43+
export async function extractTextFromXLSX(
44+
filePathOrWorkbook: string | ExcelJS.Workbook,
45+
options?: { maxRows?: number },
46+
): Promise<string> {
4447
let workbook: ExcelJS.Workbook
4548
let excelText = ""
49+
const maxRows = options?.maxRows ?? DEFAULT_ROW_LIMIT
4650

4751
if (typeof filePathOrWorkbook === "string") {
4852
workbook = new ExcelJS.Workbook()
@@ -51,16 +55,24 @@ export async function extractTextFromXLSX(filePathOrWorkbook: string | ExcelJS.W
5155
workbook = filePathOrWorkbook
5256
}
5357

58+
let totalRowsProcessed = 0
59+
let truncated = false
60+
5461
workbook.eachSheet((worksheet, sheetId) => {
5562
if (worksheet.state === "hidden" || worksheet.state === "veryHidden") {
5663
return
5764
}
5865

66+
if (truncated) {
67+
return false // Stop processing sheets if we've already truncated
68+
}
69+
5970
excelText += `--- Sheet: ${worksheet.name} ---\n`
6071

6172
worksheet.eachRow({ includeEmpty: false }, (row, rowNumber) => {
62-
if (rowNumber > ROW_LIMIT) {
63-
excelText += `[... truncated at row ${rowNumber} ...]\n`
73+
if (totalRowsProcessed >= maxRows) {
74+
excelText += `[... truncated at ${totalRowsProcessed} total rows across all sheets ...]\n`
75+
truncated = true
6476
return false
6577
}
6678

@@ -77,12 +89,15 @@ export async function extractTextFromXLSX(filePathOrWorkbook: string | ExcelJS.W
7789

7890
if (hasContent) {
7991
excelText += rowTexts.join("\t") + "\n"
92+
totalRowsProcessed++
8093
}
8194

8295
return true
8396
})
8497

85-
excelText += "\n"
98+
if (!truncated) {
99+
excelText += "\n"
100+
}
86101
})
87102

88103
return excelText.trim()

src/integrations/misc/extract-text.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ export function getSupportedBinaryFormats(): string[] {
4848
return Object.keys(SUPPORTED_BINARY_FORMATS)
4949
}
5050

51-
export async function extractTextFromFile(filePath: string): Promise<string> {
51+
export async function extractTextFromFile(filePath: string, options?: { maxRows?: number }): Promise<string> {
5252
try {
5353
await fs.access(filePath)
5454
} catch (error) {
@@ -60,6 +60,10 @@ export async function extractTextFromFile(filePath: string): Promise<string> {
6060
// Check if we have a specific extractor for this format
6161
const extractor = SUPPORTED_BINARY_FORMATS[fileExtension as keyof typeof SUPPORTED_BINARY_FORMATS]
6262
if (extractor) {
63+
// Pass options to Excel extractor, ignore for others
64+
if (fileExtension === ".xlsx") {
65+
return extractTextFromXLSX(filePath, options)
66+
}
6367
return extractor(filePath)
6468
}
6569

0 commit comments

Comments
 (0)