Skip to content

Commit 2505ac6

Browse files
committed
mostly fixed
1 parent 8fc176d commit 2505ac6

File tree

3 files changed

+230
-169
lines changed

3 files changed

+230
-169
lines changed

src/core/tools/contextValidator.ts

Lines changed: 111 additions & 160 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { Task } from "../task/Task"
22
import { readLines } from "../../integrations/misc/read-lines"
3+
import { readPartialSingleLineContent } from "../../integrations/misc/read-partial-content"
34
import { getModelMaxOutputTokens, getFormatForProvider } from "../../shared/api"
45
import * as fs from "fs/promises"
56

@@ -9,14 +10,8 @@ import * as fs from "fs/promises"
910
* when reading files without affecting other context window calculations.
1011
*/
1112
const FILE_READ_BUFFER_PERCENTAGE = 0.25 // 25% buffer for file reads
12-
13-
/**
14-
* Constants for the 2-phase validation approach
15-
*/
1613
const CHARS_PER_TOKEN_ESTIMATE = 3
17-
const CUTBACK_PERCENTAGE = 0.2 // 20% reduction when over limit
1814
const READ_BATCH_SIZE = 50 // Read 50 lines at a time for efficiency
19-
const MAX_API_CALLS = 5 // Safety limit to prevent infinite loops
2015
const MIN_USEFUL_LINES = 50 // Minimum lines to consider useful
2116

2217
/**
@@ -27,7 +22,7 @@ const SMALL_FILE_SIZE = 100 * 1024 // 100KB - safe if context is mostly empty
2722

2823
/**
 * Outcome of checking whether a file read fits in the remaining context.
 *
 * - `shouldLimit`: true when the caller must restrict how much of the file is read.
 * - `safeMaxLines`: the allowed amount. In the code visible in this file it is
 *   returned as -1 when an entire single-line file fits, as 0 when the file cannot
 *   be processed because of a memory error, as a character count for a truncated
 *   single-line file, and as a line count for multi-line files.
 * - `reason`: optional human-readable explanation, set when a limit is applied.
 */
export interface ContextValidationResult {
2924
shouldLimit: boolean
30-
safeMaxLines: number
25+
safeMaxLines: number // For single-line files, this represents character count; for multi-line files, it's line count
3126
reason?: string
3227
}
3328

@@ -79,7 +74,6 @@ async function shouldSkipValidation(filePath: string, totalLines: number, cline:
7974
// Get file size
8075
const stats = await fs.stat(filePath)
8176
const fileSizeBytes = stats.size
82-
const fileSizeMB = fileSizeBytes / (1024 * 1024)
8377

8478
// Very small files by size are definitely safe to skip validation
8579
if (fileSizeBytes < TINY_FILE_SIZE) {
@@ -99,65 +93,100 @@ async function shouldSkipValidation(filePath: string, totalLines: number, cline:
9993
// we can skip validation as there's plenty of room
10094
if (contextUsagePercent < 0.5 && fileSizeBytes < SMALL_FILE_SIZE) {
10195
console.log(
102-
`[validateFileSizeForContext] Skipping validation for ${filePath} - context mostly empty (${Math.round(contextUsagePercent * 100)}% used) and file is moderate size (${fileSizeMB.toFixed(2)}MB)`,
96+
`[shouldSkipValidation] Skipping validation for ${filePath} - context mostly empty (${Math.round(contextUsagePercent * 100)}% used) and file is moderate size`,
10397
)
10498
return true
10599
}
106100
} catch (error) {
107101
// If we can't check file size or context state, don't skip validation
108-
console.warn(`[validateFileSizeForContext] Could not check file size or context state: ${error}`)
102+
console.warn(`[shouldSkipValidation] Could not check file size or context state: ${error}`)
109103
}
110104

111105
return false
112106
}
113107

108+
/**
109+
* Detects whether a file should be treated as a single-line file.
* Returns true when totalLines is exactly 1, or when totalLines is 2-5, the file is
* at least 100 KiB, and every line after the first is empty or whitespace-only.
110+
* This handles cases where minified files might have a few empty lines but are essentially single-line.
* The first line itself is never inspected.
*
* @param filePath - Path of the file to stat and read.
* @param totalLines - Line count of the file as determined by the caller.
* @returns false when totalLines is outside 1-5, when a multi-line file is smaller
* than 100 KiB, when any of lines 2-5 has non-whitespace content, or when the
* stat/read fails (the error is logged with console.warn, not rethrown).
111+
*/
112+
async function isEffectivelySingleLine(filePath: string, totalLines: number): Promise<boolean> {
113+
// Only check files with 1-5 lines
114+
if (totalLines < 1 || totalLines > 5) {
115+
return false
116+
}
117+
118+
// Single line files are always effectively single line (no file I/O is needed for this case)
119+
if (totalLines === 1) {
120+
return true
121+
}
122+
123+
try {
124+
// Check if file is big (>= 100KB) and lines 2-5 are empty
125+
const stats = await fs.stat(filePath)
126+
const fileSizeBytes = stats.size
127+
128+
// Only apply this logic to big files: a small file with 2-5 lines is handled as a normal multi-line file
129+
if (fileSizeBytes < 100 * 1024) {
130+
// Less than 100KB
131+
return false
132+
}
133+
134+
// Read all lines to check if lines 2-5 are empty
// presumably readLines takes (path, endLine, startLine) with 0-based inclusive indices - TODO confirm against read-lines
// NOTE(review): this appears to pull the large first line into memory as well - confirm that is acceptable for very large files
135+
const content = await readLines(filePath, totalLines - 1, 0)
136+
const lines = content.split("\n")
137+
138+
// Check if lines 2-5 (indices 1-4) are empty; trim() makes whitespace-only lines count as empty
139+
let hasEmptyLines2to5 = true
140+
for (let i = 1; i < Math.min(lines.length, 5); i++) {
141+
if (lines[i].trim().length > 0) {
142+
hasEmptyLines2to5 = false
143+
break
144+
}
145+
}
146+
147+
console.log(
148+
`[isEffectivelySingleLine] File ${filePath}: totalLines=${totalLines}, fileSize=${(fileSizeBytes / 1024).toFixed(1)}KB, hasEmptyLines2to5=${hasEmptyLines2to5}`,
149+
)
150+
151+
return hasEmptyLines2to5
152+
} catch (error) {
// Any stat/read failure is treated as "not single-line" so the caller continues with its regular path
153+
console.warn(`[isEffectivelySingleLine] Error checking file ${filePath}: ${error}`)
154+
return false
155+
}
156+
}
157+
114158
/**
115159
* Validates a single-line file (likely minified) to see if it fits in context
116-
* Uses the same heuristic and backoff strategy as multi-line files
160+
* Uses only heuristic estimation without actual token counting
117161
*/
118162
async function validateSingleLineFile(
119163
filePath: string,
120164
cline: Task,
121165
contextInfo: ContextInfo,
122166
): Promise<ContextValidationResult | null> {
123-
console.log(`[validateFileSizeForContext] Single-line file detected: ${filePath} - checking if it fits in context`)
124-
125167
try {
126-
// Phase 1: Use char/3 heuristic to estimate safe content size
168+
// Use char heuristic to estimate safe content size with additional safety margin
127169
const estimatedSafeChars = contextInfo.targetTokenLimit * CHARS_PER_TOKEN_ESTIMATE
128170

129-
// Read the single line
130-
const fullContent = await readLines(filePath, 0, 0)
171+
// Read only up to the limited chars to avoid loading huge files into memory
172+
const partialContent = await readPartialSingleLineContent(filePath, estimatedSafeChars)
131173

132-
// If the full content fits within our estimated safe chars, try it
133-
let contentToValidate = fullContent
134-
if (fullContent.length > estimatedSafeChars) {
135-
// Content is too large, start with estimated safe portion
136-
contentToValidate = fullContent.substring(0, estimatedSafeChars)
137-
console.log(
138-
`[validateFileSizeForContext] Single-line file exceeds estimated safe chars (${fullContent.length} > ${estimatedSafeChars}), starting with truncated content`,
139-
)
140-
}
141-
142-
// Phase 2: Use shared validation function with cutback
143-
const { finalContent, actualTokens } = await validateAndCutbackContent(
144-
contentToValidate,
145-
contextInfo.targetTokenLimit,
146-
cline,
147-
true,
148-
)
174+
// Get the full file size to determine if we read the entire file
175+
const stats = await fs.stat(filePath)
176+
const fullFileSize = stats.size
177+
const isPartialRead = partialContent.length < fullFileSize
149178

150-
// Determine the result based on what we could read
151-
if (finalContent.length === fullContent.length) {
179+
if (!isPartialRead) {
152180
// The entire single line fits
153181
return { shouldLimit: false, safeMaxLines: -1 }
154-
} else if (finalContent.length > 0) {
182+
} else if (partialContent.length > 0) {
155183
// Only a portion of the line fits
156-
const percentageRead = Math.round((finalContent.length / fullContent.length) * 100)
184+
const percentageRead = Math.round((partialContent.length / fullFileSize) * 100)
185+
157186
return {
158187
shouldLimit: true,
159-
safeMaxLines: 1, // Still technically 1 line, but truncated
160-
reason: `Large single-line file (likely minified) exceeds available context space. Only the first ${percentageRead}% (${finalContent.length} of ${fullContent.length} characters) can be loaded. The file contains ${actualTokens} tokens of the available ${contextInfo.targetTokenLimit} tokens. Context: ${contextInfo.currentlyUsed}/${contextInfo.contextWindow} tokens used (${Math.round((contextInfo.currentlyUsed / contextInfo.contextWindow) * 100)}%). This is a hard limit - no additional content from this file can be accessed.`,
188+
safeMaxLines: partialContent.length, // Return actual character count for single-line files
189+
reason: `Large single-line file (likely minified) exceeds available context space. Only the first ${percentageRead}% (${partialContent.length} of ${fullFileSize} characters) can be loaded. Context: ${contextInfo.currentlyUsed}/${contextInfo.contextWindow} tokens used (${Math.round((contextInfo.currentlyUsed / contextInfo.contextWindow) * 100)}%). This is a hard limit - no additional content from this file can be accessed.`,
161190
}
162191
} else {
163192
// Can't fit any content
@@ -168,8 +197,24 @@ async function validateSingleLineFile(
168197
}
169198
}
170199
} catch (error) {
171-
console.warn(`[validateFileSizeForContext] Error processing single-line file: ${error}`)
172-
return null // Fall through to regular validation
200+
// Check for specific error types that indicate memory issues
201+
if (error instanceof Error) {
202+
const errorMessage = error.message.toLowerCase()
203+
if (
204+
errorMessage.includes("heap") ||
205+
errorMessage.includes("memory") ||
206+
errorMessage.includes("allocation")
207+
) {
208+
// Return a safe fallback instead of crashing
209+
return {
210+
shouldLimit: true,
211+
safeMaxLines: 0,
212+
reason: `File is too large to process due to memory constraints. Error: ${error.message}. This file cannot be accessed.`,
213+
}
214+
}
215+
}
216+
217+
return null // Fall through to regular validation for other errors
173218
}
174219
}
175220

@@ -216,97 +261,6 @@ async function readFileInBatches(
216261
return { content: accumulatedContent, lineCount: currentLine, lineToCharMap }
217262
}
218263

219-
/**
220-
* Shared function to validate content with actual API and apply cutback if needed
221-
* Works for both single-line and multi-line content
222-
*/
223-
async function validateAndCutbackContent(
224-
content: string,
225-
targetTokenLimit: number,
226-
cline: Task,
227-
isSingleLine: boolean = false,
228-
): Promise<{ finalContent: string; actualTokens: number; didCutback: boolean }> {
229-
let finalContent = content
230-
let apiCallCount = 0
231-
let actualTokens = 0
232-
let didCutback = false
233-
234-
while (apiCallCount < MAX_API_CALLS) {
235-
apiCallCount++
236-
237-
// Make the actual API call to count tokens
238-
actualTokens = await cline.api.countTokens([{ type: "text", text: finalContent }])
239-
240-
console.log(
241-
`[validateFileSizeForContext] API call ${apiCallCount}: ${actualTokens} tokens for ${finalContent.length} chars${isSingleLine ? " (single-line)" : ""}`,
242-
)
243-
244-
if (actualTokens <= targetTokenLimit) {
245-
// We're under the limit, we're done!
246-
break
247-
}
248-
249-
// We're over the limit - cut back by CUTBACK_PERCENTAGE
250-
const targetLength = Math.floor(finalContent.length * (1 - CUTBACK_PERCENTAGE))
251-
252-
// Safety check
253-
if (targetLength === 0 || targetLength === finalContent.length) {
254-
break
255-
}
256-
257-
finalContent = finalContent.substring(0, targetLength)
258-
didCutback = true
259-
}
260-
261-
return { finalContent, actualTokens, didCutback }
262-
}
263-
264-
/**
265-
* Validates content with actual API and cuts back if needed (for multi-line files)
266-
*/
267-
async function validateAndAdjustContent(
268-
accumulatedContent: string,
269-
initialLineCount: number,
270-
lineToCharMap: Map<number, number>,
271-
targetTokenLimit: number,
272-
totalLines: number,
273-
cline: Task,
274-
): Promise<{ finalContent: string; finalLineCount: number }> {
275-
// Use the shared validation function
276-
const { finalContent, didCutback } = await validateAndCutbackContent(
277-
accumulatedContent,
278-
targetTokenLimit,
279-
cline,
280-
false,
281-
)
282-
283-
// If no cutback was needed, return original line count
284-
if (!didCutback) {
285-
return { finalContent, finalLineCount: initialLineCount }
286-
}
287-
288-
// Find the line that corresponds to the cut content length
289-
let cutoffLine = 0
290-
for (const [lineNum, charPos] of lineToCharMap.entries()) {
291-
if (charPos > finalContent.length) {
292-
break
293-
}
294-
cutoffLine = lineNum
295-
}
296-
297-
// Ensure we don't cut back too far
298-
if (cutoffLine < 10) {
299-
console.warn(`[validateFileSizeForContext] Cutback resulted in too few lines (${cutoffLine}), using minimum`)
300-
cutoffLine = Math.min(MIN_USEFUL_LINES, totalLines)
301-
}
302-
303-
// Get the character position for the cutoff line
304-
const cutoffCharPos = lineToCharMap.get(cutoffLine) || 0
305-
const adjustedContent = accumulatedContent.substring(0, cutoffCharPos)
306-
307-
return { finalContent: adjustedContent, finalLineCount: cutoffLine }
308-
}
309-
310264
/**
311265
* Handles error cases with conservative fallback
312266
*/
@@ -316,8 +270,6 @@ async function handleValidationError(
316270
currentMaxReadFileLine: number,
317271
error: unknown,
318272
): Promise<ContextValidationResult> {
319-
console.warn(`[validateFileSizeForContext] Error accessing runtime state: ${error}`)
320-
321273
// In error cases, we can't check context state, so use simple file size heuristics
322274
try {
323275
const stats = await fs.stat(filePath)
@@ -329,7 +281,6 @@ async function handleValidationError(
329281
}
330282
} catch (statError) {
331283
// If we can't even stat the file, proceed with conservative defaults
332-
console.warn(`[validateFileSizeForContext] Could not stat file: ${statError}`)
333284
}
334285

335286
if (totalLines > 10000) {
@@ -362,54 +313,54 @@ export async function validateFileSizeForContext(
362313
// Get context information
363314
const contextInfo = await getContextInfo(cline)
364315

365-
// Special handling for single-line files (likely minified)
366-
if (totalLines === 1) {
316+
// Special handling for single-line files (likely minified) or effectively single-line files
317+
const isEffSingleLine = await isEffectivelySingleLine(filePath, totalLines)
318+
if (isEffSingleLine) {
367319
const singleLineResult = await validateSingleLineFile(filePath, cline, contextInfo)
368320
if (singleLineResult) {
369321
return singleLineResult
370322
}
371323
// Fall through to regular validation if single-line validation failed
372324
}
373325

374-
// Phase 1: Read content up to estimated safe character limit
326+
// Read content up to estimated safe character limit
375327
const estimatedSafeChars = contextInfo.targetTokenLimit * CHARS_PER_TOKEN_ESTIMATE
376-
const { content, lineCount, lineToCharMap } = await readFileInBatches(filePath, totalLines, estimatedSafeChars)
377-
378-
// Phase 2: Validate with actual API and cutback if needed
379-
const { finalContent, finalLineCount } = await validateAndAdjustContent(
380-
content,
381-
lineCount,
382-
lineToCharMap,
383-
contextInfo.targetTokenLimit,
384-
totalLines,
385-
cline,
386-
)
328+
console.log(`[validateFileSizeForContext] Estimated safe chars for ${filePath}: ${estimatedSafeChars}`)
387329

388-
// Log final statistics
389-
console.log(`[validateFileSizeForContext] Final: ${finalLineCount} lines, ${finalContent.length} chars`)
330+
const { content, lineCount } = await readFileInBatches(filePath, totalLines, estimatedSafeChars)
331+
console.log(`[validateFileSizeForContext] Read ${lineCount} lines (${content.length} chars) from ${filePath}`)
390332

391-
// Ensure we provide at least a minimum useful amount
392-
const finalSafeMaxLines = Math.max(MIN_USEFUL_LINES, finalLineCount)
393-
394-
// If we read the entire file without exceeding the limit, no limitation needed
395-
if (finalLineCount >= totalLines) {
333+
// If we read the entire file without hitting the character limit, no limitation needed
334+
if (lineCount >= totalLines) {
335+
console.log(`[validateFileSizeForContext] Read entire file ${filePath} without hitting limit`)
396336
return { shouldLimit: false, safeMaxLines: currentMaxReadFileLine }
397337
}
398338

339+
// We hit the character limit before reading all lines
340+
// Ensure we provide at least a minimum useful amount
341+
const finalSafeMaxLines = Math.max(MIN_USEFUL_LINES, lineCount)
342+
console.log(
343+
`[validateFileSizeForContext] Hit character limit for ${filePath}: lineCount=${lineCount}, finalSafeMaxLines=${finalSafeMaxLines}`,
344+
)
345+
399346
// If we couldn't read even the minimum useful lines
400-
if (finalLineCount < MIN_USEFUL_LINES) {
401-
return {
347+
if (lineCount < MIN_USEFUL_LINES) {
348+
const result = {
402349
shouldLimit: true,
403350
safeMaxLines: finalSafeMaxLines,
404-
reason: `Very limited context space. Could only safely read ${finalLineCount} lines before exceeding token limit. Context: ${contextInfo.currentlyUsed}/${contextInfo.contextWindow} tokens used (${Math.round((contextInfo.currentlyUsed / contextInfo.contextWindow) * 100)}%). Limited to ${finalSafeMaxLines} lines. Consider using search_files or line_range for specific sections.`,
351+
reason: `Very limited context space. Could only safely read ${lineCount} lines before exceeding token limit. Context: ${contextInfo.currentlyUsed}/${contextInfo.contextWindow} tokens used (${Math.round((contextInfo.currentlyUsed / contextInfo.contextWindow) * 100)}%). Limited to ${finalSafeMaxLines} lines. Consider using search_files or line_range for specific sections.`,
405352
}
353+
console.log(`[validateFileSizeForContext] Returning very limited context result for ${filePath}:`, result)
354+
return result
406355
}
407356

408-
return {
357+
const result = {
409358
shouldLimit: true,
410359
safeMaxLines: finalSafeMaxLines,
411360
reason: `File exceeds available context space. Safely read ${finalSafeMaxLines} lines out of ${totalLines} total lines. Context usage: ${contextInfo.currentlyUsed}/${contextInfo.contextWindow} tokens (${Math.round((contextInfo.currentlyUsed / contextInfo.contextWindow) * 100)}%). Use line_range to read specific sections.`,
412361
}
362+
console.log(`[validateFileSizeForContext] Returning limited context result for ${filePath}:`, result)
363+
return result
413364
} catch (error) {
414365
return handleValidationError(filePath, totalLines, currentMaxReadFileLine, error)
415366
}

0 commit comments

Comments
 (0)