diff --git a/src/core/tools/readFileTool.ts b/src/core/tools/readFileTool.ts index 1459838fe0d..0ec23417d3a 100644 --- a/src/core/tools/readFileTool.ts +++ b/src/core/tools/readFileTool.ts @@ -517,11 +517,28 @@ export async function readFileTool( } // Handle normal file read - const content = await extractTextFromFile(fullPath) - const lineRangeAttr = ` lines="1-${totalLines}"` - let xmlInfo = totalLines > 0 ? `\n${content}\n` : `` + const fileExtension = path.extname(relPath).toLowerCase() + let content: string + let actualLines = totalLines + + // For Excel files, apply maxReadFileLine as row limit + if (fileExtension === ".xlsx" && maxReadFileLine > 0) { + content = await extractTextFromFile(fullPath, { maxRows: maxReadFileLine }) + // Count actual lines in the extracted content to get accurate line count + actualLines = content.split("\n").length + } else { + content = await extractTextFromFile(fullPath) + } + + const lineRangeAttr = ` lines="1-${actualLines}"` + let xmlInfo = actualLines > 0 ? `\n${content}\n` : `` + + // Add truncation notice for Excel files if they were limited + if (fileExtension === ".xlsx" && maxReadFileLine > 0 && content.includes("[... truncated at")) { + xmlInfo += `Excel file truncated to ${maxReadFileLine} rows. Use line_range if you need to read more data\n` + } - if (totalLines === 0) { + if (actualLines === 0) { xmlInfo += `File is empty\n` } diff --git a/src/integrations/misc/__tests__/extract-text-from-xlsx.test.ts b/src/integrations/misc/__tests__/extract-text-from-xlsx.test.ts index a3c46e30b77..1b26920386c 100644 --- a/src/integrations/misc/__tests__/extract-text-from-xlsx.test.ts +++ b/src/integrations/misc/__tests__/extract-text-from-xlsx.test.ts @@ -218,4 +218,98 @@ describe("extractTextFromXLSX", () => { await expect(extractTextFromXLSX("/non/existent/file.xlsx")).rejects.toThrow() }) }) + + describe("row limiting", () => { + it("should respect maxRows option", async () => { + const workbook = new ExcelJS.Workbook() + const worksheet = workbook.addWorksheet("Sheet1") + + // Add 10 rows of data + for (let i = 1; i <= 10; i++) { + worksheet.getCell(`A${i}`).value = `Row ${i}` + worksheet.getCell(`B${i}`).value = `Data ${i}` + } + + const result = await extractTextFromXLSX(workbook, { maxRows: 5 }) + + expect(result).toContain("Row 1") + expect(result).toContain("Row 5") + expect(result).not.toContain("Row 6") + expect(result).toContain("[... truncated at 5 total rows across all sheets ...]") + }) + + it("should not truncate when maxRows is not exceeded", async () => { + const workbook = new ExcelJS.Workbook() + const worksheet = workbook.addWorksheet("Sheet1") + + // Add 3 rows of data + for (let i = 1; i <= 3; i++) { + worksheet.getCell(`A${i}`).value = `Row ${i}` + } + + const result = await extractTextFromXLSX(workbook, { maxRows: 5 }) + + expect(result).toContain("Row 1") + expect(result).toContain("Row 3") + expect(result).not.toContain("truncated") + }) + + it("should handle maxRows across multiple sheets", async () => { + const workbook = new ExcelJS.Workbook() + + const sheet1 = workbook.addWorksheet("Sheet1") + for (let i = 1; i <= 3; i++) { + sheet1.getCell(`A${i}`).value = `Sheet1 Row ${i}` + } + + const sheet2 = workbook.addWorksheet("Sheet2") + for (let i = 1; i <= 3; i++) { + sheet2.getCell(`A${i}`).value = `Sheet2 Row ${i}` + } + + const result = await extractTextFromXLSX(workbook, { maxRows: 4 }) + + expect(result).toContain("Sheet1 Row 1") + expect(result).toContain("Sheet1 Row 3") + expect(result).toContain("Sheet2 Row 1") + expect(result).not.toContain("Sheet2 Row 2") + expect(result).toContain("[... truncated at 4 total rows across all sheets ...]") + }) + + it("should use default limit when no maxRows specified", async () => { + const workbook = new ExcelJS.Workbook() + const worksheet = workbook.addWorksheet("Sheet1") + + // Add a few rows + for (let i = 1; i <= 5; i++) { + worksheet.getCell(`A${i}`).value = `Row ${i}` + } + + const result = await extractTextFromXLSX(workbook) + + expect(result).toContain("Row 1") + expect(result).toContain("Row 5") + expect(result).not.toContain("truncated") + }) + + it("should handle empty rows correctly when counting towards limit", async () => { + const workbook = new ExcelJS.Workbook() + const worksheet = workbook.addWorksheet("Sheet1") + + // Add rows with some empty ones + worksheet.getCell("A1").value = "Row 1" + // Row 2 is empty + worksheet.getCell("A3").value = "Row 3" + worksheet.getCell("A4").value = "Row 4" + worksheet.getCell("A5").value = "Row 5" + + const result = await extractTextFromXLSX(workbook, { maxRows: 3 }) + + expect(result).toContain("Row 1") + expect(result).toContain("Row 3") + expect(result).toContain("Row 4") + expect(result).not.toContain("Row 5") + expect(result).toContain("[... truncated at 3 total rows across all sheets ...]") + }) + }) }) diff --git a/src/integrations/misc/extract-text-from-xlsx.ts b/src/integrations/misc/extract-text-from-xlsx.ts index 82e1d4db9a3..1b04712b64a 100644 --- a/src/integrations/misc/extract-text-from-xlsx.ts +++ b/src/integrations/misc/extract-text-from-xlsx.ts @@ -1,6 +1,6 @@ import ExcelJS from "exceljs" -const ROW_LIMIT = 50000 +const DEFAULT_ROW_LIMIT = 50000 function formatCellValue(cell: ExcelJS.Cell): string { const value = cell.value @@ -40,9 +40,13 @@ function formatCellValue(cell: ExcelJS.Cell): string { return value.toString() } -export async function extractTextFromXLSX(filePathOrWorkbook: string | ExcelJS.Workbook): Promise { +export async function extractTextFromXLSX( + filePathOrWorkbook: string | ExcelJS.Workbook, + options?: { maxRows?: number }, +): Promise { let workbook: ExcelJS.Workbook let excelText = "" + const maxRows = options?.maxRows ?? DEFAULT_ROW_LIMIT if (typeof filePathOrWorkbook === "string") { workbook = new ExcelJS.Workbook() @@ -51,16 +55,24 @@ export async function extractTextFromXLSX(filePathOrWorkbook: string | ExcelJS.W workbook = filePathOrWorkbook } + let totalRowsProcessed = 0 + let truncated = false + workbook.eachSheet((worksheet, sheetId) => { if (worksheet.state === "hidden" || worksheet.state === "veryHidden") { return } + if (truncated) { + return false // Stop processing sheets if we've already truncated + } + excelText += `--- Sheet: ${worksheet.name} ---\n` worksheet.eachRow({ includeEmpty: false }, (row, rowNumber) => { - if (rowNumber > ROW_LIMIT) { - excelText += `[... truncated at row ${rowNumber} ...]\n` + if (totalRowsProcessed >= maxRows) { + excelText += `[... truncated at ${totalRowsProcessed} total rows across all sheets ...]\n` + truncated = true return false } @@ -77,12 +89,17 @@ export async function extractTextFromXLSX(filePathOrWorkbook: string | ExcelJS.W if (hasContent) { excelText += rowTexts.join("\t") + "\n" + totalRowsProcessed++ } return true }) - excelText += "\n" + if (!truncated) { + excelText += "\n" + } + + return true }) return excelText.trim() diff --git a/src/integrations/misc/extract-text.ts b/src/integrations/misc/extract-text.ts index 8c7e7408a68..9a1048f76b9 100644 --- a/src/integrations/misc/extract-text.ts +++ b/src/integrations/misc/extract-text.ts @@ -48,7 +48,7 @@ export function getSupportedBinaryFormats(): string[] { return Object.keys(SUPPORTED_BINARY_FORMATS) } -export async function extractTextFromFile(filePath: string): Promise { +export async function extractTextFromFile(filePath: string, options?: { maxRows?: number }): Promise { try { await fs.access(filePath) } catch (error) { @@ -60,6 +60,10 @@ export async function extractTextFromFile(filePath: string): Promise { // Check if we have a specific extractor for this format const extractor = SUPPORTED_BINARY_FORMATS[fileExtension as keyof typeof SUPPORTED_BINARY_FORMATS] if (extractor) { + // Pass options to Excel extractor, ignore for others + if (fileExtension === ".xlsx") { + return extractTextFromXLSX(filePath, options) + } return extractor(filePath) }