Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 21 additions & 4 deletions src/core/tools/readFileTool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -517,11 +517,28 @@ export async function readFileTool(
}

// Handle normal file read
const content = await extractTextFromFile(fullPath)
const lineRangeAttr = ` lines="1-${totalLines}"`
let xmlInfo = totalLines > 0 ? `<content${lineRangeAttr}>\n${content}</content>\n` : `<content/>`
const fileExtension = path.extname(relPath).toLowerCase()
let content: string
let actualLines = totalLines

// For Excel files, apply maxReadFileLine as row limit
if (fileExtension === ".xlsx" && maxReadFileLine > 0) {
content = await extractTextFromFile(fullPath, { maxRows: maxReadFileLine })
// Count actual lines in the extracted content to get accurate line count
actualLines = content.split("\n").length
} else {
content = await extractTextFromFile(fullPath)
}

const lineRangeAttr = ` lines="1-${actualLines}"`
let xmlInfo = actualLines > 0 ? `<content${lineRangeAttr}>\n${content}</content>\n` : `<content/>`

// Add truncation notice for Excel files if they were limited
if (fileExtension === ".xlsx" && maxReadFileLine > 0 && content.includes("[... truncated at")) {
xmlInfo += `<notice>Excel file truncated to ${maxReadFileLine} rows. Use line_range if you need to read more data</notice>\n`
}

if (totalLines === 0) {
if (actualLines === 0) {
xmlInfo += `<notice>File is empty</notice>\n`
}

Expand Down
94 changes: 94 additions & 0 deletions src/integrations/misc/__tests__/extract-text-from-xlsx.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -218,4 +218,98 @@ describe("extractTextFromXLSX", () => {
await expect(extractTextFromXLSX("/non/existent/file.xlsx")).rejects.toThrow()
})
})

describe("row limiting", () => {
	// Produces the 1-based row numbers 1..count, in ascending order.
	const rowNumbers = (count: number): number[] => Array.from({ length: count }, (_, index) => index + 1)

	// A workbook with more rows than the limit is cut off after exactly `maxRows` rows.
	it("should respect maxRows option", async () => {
		const book = new ExcelJS.Workbook()
		const sheet = book.addWorksheet("Sheet1")

		// Ten populated rows, two columns each
		for (const n of rowNumbers(10)) {
			sheet.getCell(`A${n}`).value = `Row ${n}`
			sheet.getCell(`B${n}`).value = `Data ${n}`
		}

		const text = await extractTextFromXLSX(book, { maxRows: 5 })

		for (const kept of ["Row 1", "Row 5"]) {
			expect(text).toContain(kept)
		}
		expect(text).not.toContain("Row 6")
		expect(text).toContain("[... truncated at 5 total rows across all sheets ...]")
	})

	// A workbook that fits under the limit is returned whole, with no truncation marker.
	it("should not truncate when maxRows is not exceeded", async () => {
		const book = new ExcelJS.Workbook()
		const sheet = book.addWorksheet("Sheet1")

		// Three populated rows, below the limit of five
		for (const n of rowNumbers(3)) {
			sheet.getCell(`A${n}`).value = `Row ${n}`
		}

		const text = await extractTextFromXLSX(book, { maxRows: 5 })

		for (const kept of ["Row 1", "Row 3"]) {
			expect(text).toContain(kept)
		}
		expect(text).not.toContain("truncated")
	})

	// The row budget is shared by all sheets: 3 rows from Sheet1 + 1 row from Sheet2 = 4.
	it("should handle maxRows across multiple sheets", async () => {
		const book = new ExcelJS.Workbook()

		// Each sheet is created and filled before the next one is added
		for (const sheetName of ["Sheet1", "Sheet2"]) {
			const sheet = book.addWorksheet(sheetName)
			for (const n of rowNumbers(3)) {
				sheet.getCell(`A${n}`).value = `${sheetName} Row ${n}`
			}
		}

		const text = await extractTextFromXLSX(book, { maxRows: 4 })

		for (const kept of ["Sheet1 Row 1", "Sheet1 Row 3", "Sheet2 Row 1"]) {
			expect(text).toContain(kept)
		}
		expect(text).not.toContain("Sheet2 Row 2")
		expect(text).toContain("[... truncated at 4 total rows across all sheets ...]")
	})

	// Omitting the options argument must not truncate a small workbook.
	it("should use default limit when no maxRows specified", async () => {
		const book = new ExcelJS.Workbook()
		const sheet = book.addWorksheet("Sheet1")

		// Five populated rows
		for (const n of rowNumbers(5)) {
			sheet.getCell(`A${n}`).value = `Row ${n}`
		}

		const text = await extractTextFromXLSX(book)

		for (const kept of ["Row 1", "Row 5"]) {
			expect(text).toContain(kept)
		}
		expect(text).not.toContain("truncated")
	})

	// A blank row does not use up the budget: rows 1, 3 and 4 fill a limit of three.
	it("should handle empty rows correctly when counting towards limit", async () => {
		const book = new ExcelJS.Workbook()
		const sheet = book.addWorksheet("Sheet1")

		// Row 2 is deliberately left unpopulated
		for (const n of [1, 3, 4, 5]) {
			sheet.getCell(`A${n}`).value = `Row ${n}`
		}

		const text = await extractTextFromXLSX(book, { maxRows: 3 })

		for (const kept of ["Row 1", "Row 3", "Row 4"]) {
			expect(text).toContain(kept)
		}
		expect(text).not.toContain("Row 5")
		expect(text).toContain("[... truncated at 3 total rows across all sheets ...]")
	})
})
})
27 changes: 22 additions & 5 deletions src/integrations/misc/extract-text-from-xlsx.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import ExcelJS from "exceljs"

const ROW_LIMIT = 50000
const DEFAULT_ROW_LIMIT = 50000

function formatCellValue(cell: ExcelJS.Cell): string {
const value = cell.value
Expand Down Expand Up @@ -40,9 +40,13 @@ function formatCellValue(cell: ExcelJS.Cell): string {
return value.toString()
}

export async function extractTextFromXLSX(filePathOrWorkbook: string | ExcelJS.Workbook): Promise<string> {
export async function extractTextFromXLSX(
filePathOrWorkbook: string | ExcelJS.Workbook,
options?: { maxRows?: number },
): Promise<string> {
let workbook: ExcelJS.Workbook
let excelText = ""
const maxRows = options?.maxRows ?? DEFAULT_ROW_LIMIT

if (typeof filePathOrWorkbook === "string") {
workbook = new ExcelJS.Workbook()
Expand All @@ -51,16 +55,24 @@ export async function extractTextFromXLSX(filePathOrWorkbook: string | ExcelJS.W
workbook = filePathOrWorkbook
}

let totalRowsProcessed = 0
let truncated = false

workbook.eachSheet((worksheet, sheetId) => {
if (worksheet.state === "hidden" || worksheet.state === "veryHidden") {
return
}

if (truncated) {
return false // Stop processing sheets if we've already truncated
}

excelText += `--- Sheet: ${worksheet.name} ---\n`

worksheet.eachRow({ includeEmpty: false }, (row, rowNumber) => {
if (rowNumber > ROW_LIMIT) {
excelText += `[... truncated at row ${rowNumber} ...]\n`
if (totalRowsProcessed >= maxRows) {
excelText += `[... truncated at ${totalRowsProcessed} total rows across all sheets ...]\n`
truncated = true
return false
}

Expand All @@ -77,12 +89,17 @@ export async function extractTextFromXLSX(filePathOrWorkbook: string | ExcelJS.W

if (hasContent) {
excelText += rowTexts.join("\t") + "\n"
totalRowsProcessed++
}

return true
})

excelText += "\n"
if (!truncated) {
excelText += "\n"
}

return true
})

return excelText.trim()
Expand Down
6 changes: 5 additions & 1 deletion src/integrations/misc/extract-text.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ export function getSupportedBinaryFormats(): string[] {
return Object.keys(SUPPORTED_BINARY_FORMATS)
}

export async function extractTextFromFile(filePath: string): Promise<string> {
export async function extractTextFromFile(filePath: string, options?: { maxRows?: number }): Promise<string> {
try {
await fs.access(filePath)
} catch (error) {
Expand All @@ -60,6 +60,10 @@ export async function extractTextFromFile(filePath: string): Promise<string> {
// Check if we have a specific extractor for this format
const extractor = SUPPORTED_BINARY_FORMATS[fileExtension as keyof typeof SUPPORTED_BINARY_FORMATS]
if (extractor) {
// Pass options to Excel extractor, ignore for others
if (fileExtension === ".xlsx") {
return extractTextFromXLSX(filePath, options)
}
return extractor(filePath)
}

Expand Down