diff --git a/src/core/tools/readFileTool.ts b/src/core/tools/readFileTool.ts
index 1459838fe0d..0ec23417d3a 100644
--- a/src/core/tools/readFileTool.ts
+++ b/src/core/tools/readFileTool.ts
@@ -517,11 +517,28 @@ export async function readFileTool(
}
// Handle normal file read
- const content = await extractTextFromFile(fullPath)
- const lineRangeAttr = ` lines="1-${totalLines}"`
- let xmlInfo = totalLines > 0 ? `\n${content}\n` : ``
+ const fileExtension = path.extname(relPath).toLowerCase()
+ let content: string
+ let actualLines = totalLines
+
+ // For Excel files, apply maxReadFileLine as row limit
+ if (fileExtension === ".xlsx" && maxReadFileLine > 0) {
+ content = await extractTextFromFile(fullPath, { maxRows: maxReadFileLine })
+ // Count actual lines in the extracted content to get accurate line count
+ actualLines = content.split("\n").length
+ } else {
+ content = await extractTextFromFile(fullPath)
+ }
+
+ const lineRangeAttr = ` lines="1-${actualLines}"`
+ let xmlInfo = actualLines > 0 ? `\n${content}\n` : ``
+
+ // Add truncation notice for Excel files if they were limited
+ if (fileExtension === ".xlsx" && maxReadFileLine > 0 && content.includes("[... truncated at")) {
+ xmlInfo += `Excel file truncated to ${maxReadFileLine} rows. Use line_range if you need to read more data\n`
+ }
- if (totalLines === 0) {
+ if (actualLines === 0) {
xmlInfo += `File is empty\n`
}
diff --git a/src/integrations/misc/__tests__/extract-text-from-xlsx.test.ts b/src/integrations/misc/__tests__/extract-text-from-xlsx.test.ts
index a3c46e30b77..1b26920386c 100644
--- a/src/integrations/misc/__tests__/extract-text-from-xlsx.test.ts
+++ b/src/integrations/misc/__tests__/extract-text-from-xlsx.test.ts
@@ -218,4 +218,98 @@ describe("extractTextFromXLSX", () => {
await expect(extractTextFromXLSX("/non/existent/file.xlsx")).rejects.toThrow()
})
})
+
+ describe("row limiting", () => { // Exercises the optional { maxRows } argument of extractTextFromXLSX
+ it("should respect maxRows option", async () => {
+ const workbook = new ExcelJS.Workbook()
+ const worksheet = workbook.addWorksheet("Sheet1")
+
+ // Populate 10 rows (columns A and B) so the sheet holds more rows than the maxRows of 5 used below
+ for (let i = 1; i <= 10; i++) {
+ worksheet.getCell(`A${i}`).value = `Row ${i}`
+ worksheet.getCell(`B${i}`).value = `Data ${i}`
+ }
+
+ const result = await extractTextFromXLSX(workbook, { maxRows: 5 })
+
+ expect(result).toContain("Row 1")
+ expect(result).toContain("Row 5")
+ expect(result).not.toContain("Row 6")
+ expect(result).toContain("[... truncated at 5 total rows across all sheets ...]")
+ })
+
+ it("should not truncate when maxRows is not exceeded", async () => {
+ const workbook = new ExcelJS.Workbook()
+ const worksheet = workbook.addWorksheet("Sheet1")
+
+ // Populate 3 rows, fewer than the maxRows of 5 used below
+ for (let i = 1; i <= 3; i++) {
+ worksheet.getCell(`A${i}`).value = `Row ${i}`
+ }
+
+ const result = await extractTextFromXLSX(workbook, { maxRows: 5 })
+
+ expect(result).toContain("Row 1")
+ expect(result).toContain("Row 3")
+ expect(result).not.toContain("truncated")
+ })
+
+ it("should handle maxRows across multiple sheets", async () => {
+ const workbook = new ExcelJS.Workbook()
+
+ const sheet1 = workbook.addWorksheet("Sheet1")
+ for (let i = 1; i <= 3; i++) {
+ sheet1.getCell(`A${i}`).value = `Sheet1 Row ${i}`
+ }
+
+ const sheet2 = workbook.addWorksheet("Sheet2")
+ for (let i = 1; i <= 3; i++) {
+ sheet2.getCell(`A${i}`).value = `Sheet2 Row ${i}`
+ }
+
+ const result = await extractTextFromXLSX(workbook, { maxRows: 4 }) // expected: 3 rows from Sheet1 + 1 row from Sheet2
+
+ expect(result).toContain("Sheet1 Row 1")
+ expect(result).toContain("Sheet1 Row 3")
+ expect(result).toContain("Sheet2 Row 1")
+ expect(result).not.toContain("Sheet2 Row 2")
+ expect(result).toContain("[... truncated at 4 total rows across all sheets ...]")
+ })
+
+ it("should use default limit when no maxRows specified", async () => {
+ const workbook = new ExcelJS.Workbook()
+ const worksheet = workbook.addWorksheet("Sheet1")
+
+ // Populate 5 rows; the extractor is called without options, and the output must not be truncated
+ for (let i = 1; i <= 5; i++) {
+ worksheet.getCell(`A${i}`).value = `Row ${i}`
+ }
+
+ const result = await extractTextFromXLSX(workbook)
+
+ expect(result).toContain("Row 1")
+ expect(result).toContain("Row 5")
+ expect(result).not.toContain("truncated")
+ })
+
+ it("should handle empty rows correctly when counting towards limit", async () => {
+ const workbook = new ExcelJS.Workbook()
+ const worksheet = workbook.addWorksheet("Sheet1")
+
+ // Populate rows 1, 3, 4 and 5, leaving row 2 without any cells
+ worksheet.getCell("A1").value = "Row 1"
+ // Row 2 is empty; the assertions below expect it not to count towards maxRows
+ worksheet.getCell("A3").value = "Row 3"
+ worksheet.getCell("A4").value = "Row 4"
+ worksheet.getCell("A5").value = "Row 5"
+
+ const result = await extractTextFromXLSX(workbook, { maxRows: 3 })
+
+ expect(result).toContain("Row 1")
+ expect(result).toContain("Row 3")
+ expect(result).toContain("Row 4")
+ expect(result).not.toContain("Row 5")
+ expect(result).toContain("[... truncated at 3 total rows across all sheets ...]")
+ })
+ })
})
diff --git a/src/integrations/misc/extract-text-from-xlsx.ts b/src/integrations/misc/extract-text-from-xlsx.ts
index 82e1d4db9a3..1b04712b64a 100644
--- a/src/integrations/misc/extract-text-from-xlsx.ts
+++ b/src/integrations/misc/extract-text-from-xlsx.ts
@@ -1,6 +1,6 @@
import ExcelJS from "exceljs"
-const ROW_LIMIT = 50000
+const DEFAULT_ROW_LIMIT = 50000
function formatCellValue(cell: ExcelJS.Cell): string {
const value = cell.value
@@ -40,9 +40,13 @@ function formatCellValue(cell: ExcelJS.Cell): string {
return value.toString()
}
-export async function extractTextFromXLSX(filePathOrWorkbook: string | ExcelJS.Workbook): Promise {
+export async function extractTextFromXLSX(
+ filePathOrWorkbook: string | ExcelJS.Workbook,
+ options?: { maxRows?: number },
+): Promise {
let workbook: ExcelJS.Workbook
let excelText = ""
+ const maxRows = options?.maxRows ?? DEFAULT_ROW_LIMIT
if (typeof filePathOrWorkbook === "string") {
workbook = new ExcelJS.Workbook()
@@ -51,16 +55,24 @@ export async function extractTextFromXLSX(filePathOrWorkbook: string | ExcelJS.W
workbook = filePathOrWorkbook
}
+ let totalRowsProcessed = 0
+ let truncated = false
+
workbook.eachSheet((worksheet, sheetId) => {
if (worksheet.state === "hidden" || worksheet.state === "veryHidden") {
return
}
+ if (truncated) {
+ return false // Stop processing sheets if we've already truncated
+ }
+
excelText += `--- Sheet: ${worksheet.name} ---\n`
worksheet.eachRow({ includeEmpty: false }, (row, rowNumber) => {
- if (rowNumber > ROW_LIMIT) {
- excelText += `[... truncated at row ${rowNumber} ...]\n`
+ if (totalRowsProcessed >= maxRows) {
+ excelText += `[... truncated at ${totalRowsProcessed} total rows across all sheets ...]\n`
+ truncated = true
return false
}
@@ -77,12 +89,17 @@ export async function extractTextFromXLSX(filePathOrWorkbook: string | ExcelJS.W
if (hasContent) {
excelText += rowTexts.join("\t") + "\n"
+ totalRowsProcessed++
}
return true
})
- excelText += "\n"
+ if (!truncated) {
+ excelText += "\n"
+ }
+
+ return true
})
return excelText.trim()
diff --git a/src/integrations/misc/extract-text.ts b/src/integrations/misc/extract-text.ts
index 8c7e7408a68..9a1048f76b9 100644
--- a/src/integrations/misc/extract-text.ts
+++ b/src/integrations/misc/extract-text.ts
@@ -48,7 +48,7 @@ export function getSupportedBinaryFormats(): string[] {
return Object.keys(SUPPORTED_BINARY_FORMATS)
}
-export async function extractTextFromFile(filePath: string): Promise {
+export async function extractTextFromFile(filePath: string, options?: { maxRows?: number }): Promise {
try {
await fs.access(filePath)
} catch (error) {
@@ -60,6 +60,10 @@ export async function extractTextFromFile(filePath: string): Promise {
// Check if we have a specific extractor for this format
const extractor = SUPPORTED_BINARY_FORMATS[fileExtension as keyof typeof SUPPORTED_BINARY_FORMATS]
if (extractor) {
+ // Pass options to Excel extractor, ignore for others
+ if (fileExtension === ".xlsx") {
+ return extractTextFromXLSX(filePath, options)
+ }
return extractor(filePath)
}