From a1aae5db3c4957828edf9abf04154e0a33d3d1ef Mon Sep 17 00:00:00 2001
From: Hannes Rudolph
Date: Wed, 7 Jan 2026 08:27:45 -0700
Subject: [PATCH] fix(read_file): use VSCode decoding for native reads

---
 src/core/tools/ReadFileTool.ts                |  60 ++++++--
 src/core/tools/__tests__/readFileTool.spec.ts | 111 +++++++++++++++
 .../misc/read-text-with-budget.ts             | 128 ++++++++++++++++++
 3 files changed, 290 insertions(+), 9 deletions(-)
 create mode 100644 src/integrations/misc/read-text-with-budget.ts

diff --git a/src/core/tools/ReadFileTool.ts b/src/core/tools/ReadFileTool.ts
index 483d4f00252..f66d5229d6b 100644
--- a/src/core/tools/ReadFileTool.ts
+++ b/src/core/tools/ReadFileTool.ts
@@ -15,6 +15,7 @@ import { getReadablePath } from "../../utils/path"
 import { countFileLines } from "../../integrations/misc/line-counter"
 import { readLines } from "../../integrations/misc/read-lines"
 import { extractTextFromFile, addLineNumbers, getSupportedBinaryFormats } from "../../integrations/misc/extract-text"
+import { readTextWithTokenBudget } from "../../integrations/misc/read-text-with-budget"
 import { parseSourceCodeDefinitionsForFile } from "../../services/tree-sitter"
 import { parseXml } from "../../utils/xml"
 import { resolveToolProtocol } from "../../utils/resolveToolProtocol"
@@ -45,6 +46,28 @@ interface FileResult {
 	feedbackImages?: any[]
 }
 
+function sliceTextLines(text: string, startLine0: number, endLine0: number): string {
+	const lines = text.split(/\r?\n/)
+	// Mirror other readers: if text ends with newline, drop the synthetic last empty line
+	if (lines.length > 0 && lines[lines.length - 1] === "") {
+		lines.pop()
+	}
+	return lines.slice(startLine0, endLine0 + 1).join("\n")
+}
+
+async function tryReadTextViaVscode(fullPath: string): Promise<string | undefined> {
+	try {
+		const vscode = await import("vscode")
+		const uri = vscode.Uri.file(fullPath)
+		const doc = await vscode.workspace.openTextDocument(uri)
+		return doc.getText()
+	} catch {
+		return undefined
+	}
+}
+
+const MAX_VSCODE_TEXT_READ_BYTES = 2 * 1024 * 1024 // avoid loading very large files into memory just to detect encoding
+
 export class ReadFileTool extends BaseTool<"read_file"> {
 	readonly name = "read_file" as const
 
@@ -365,6 +388,16 @@ export class ReadFileTool extends BaseTool<"read_file"> {
 				continue
 			}
 
+			const fileSizeBytes = typeof stats.size === "number" ? stats.size : 0
+			let vscodeText: string | undefined
+			const getVscodeText = async (): Promise<string | undefined> => {
+				if (!useNative) return undefined
+				if (fileSizeBytes > MAX_VSCODE_TEXT_READ_BYTES) return undefined
+				if (vscodeText !== undefined) return vscodeText
+				vscodeText = await tryReadTextViaVscode(fullPath)
+				return vscodeText
+			}
+
 			const [totalLines, isBinary] = await Promise.all([countFileLines(fullPath), isBinaryFile(fullPath)])
 
 			if (isBinary) {
@@ -460,12 +493,14 @@ export class ReadFileTool extends BaseTool<"read_file"> {
 				if (fileResult.lineRanges && fileResult.lineRanges.length > 0) {
 					const rangeResults: string[] = []
 					const nativeRangeResults: string[] = []
+					const maybeText = await getVscodeText()
 
 					for (const range of fileResult.lineRanges) {
-						const content = addLineNumbers(
-							await readLines(fullPath, range.end - 1, range.start - 1),
-							range.start,
-						)
+						const rawRangeText =
+							useNative && maybeText !== undefined
+								? sliceTextLines(maybeText, range.start - 1, range.end - 1)
+								: await readLines(fullPath, range.end - 1, range.start - 1)
+						const content = addLineNumbers(rawRangeText, range.start)
 						const lineRangeAttr = ` lines="${range.start}-${range.end}"`
 						rangeResults.push(`<content${lineRangeAttr}>\n${content}</content>`)
 						nativeRangeResults.push(`Lines ${range.start}-${range.end}:\n${content}`)
@@ -504,7 +539,12 @@ export class ReadFileTool extends BaseTool<"read_file"> {
 			}
 
 			if (maxReadFileLine > 0 && totalLines > maxReadFileLine) {
-				const content = addLineNumbers(await readLines(fullPath, maxReadFileLine - 1, 0))
+				const maybeText = await getVscodeText()
+				const rawText =
+					useNative && maybeText !== undefined
+						? sliceTextLines(maybeText, 0, maxReadFileLine - 1)
+						: await readLines(fullPath, maxReadFileLine - 1, 0)
+				const content = addLineNumbers(rawText)
 				const lineRangeAttr = ` lines="1-${maxReadFileLine}"`
 				let xmlInfo = `<content${lineRangeAttr}>\n${content}\n</content>\n`
 				let nativeInfo = `Lines 1-${maxReadFileLine}:\n${content}\n`
@@ -566,10 +606,12 @@ export class ReadFileTool extends BaseTool<"read_file"> {
 					xmlInfo = `<notice>\n${notice}\n</notice>\n`
 					nativeInfo = `Note: ${notice}`
 				} else {
-					// Read file with incremental token counting
-					const result = await readFileWithTokenBudget(fullPath, {
-						budgetTokens: safeReadBudget,
-					})
+					// Prefer VSCode decoding (encoding-aware) for native tool protocol.
+					const maybeText = await getVscodeText()
+					const result =
+						useNative && maybeText !== undefined
+							? await readTextWithTokenBudget(maybeText, { budgetTokens: safeReadBudget })
+							: await readFileWithTokenBudget(fullPath, { budgetTokens: safeReadBudget })
 
 					content = addLineNumbers(result.content)
diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts
index f178e38026c..ce5cc167dca 100644
--- a/src/core/tools/__tests__/readFileTool.spec.ts
+++ b/src/core/tools/__tests__/readFileTool.spec.ts
@@ -2,6 +2,8 @@
 
 import * as path from "path"
 
+import * as vscode from "vscode"
+
 import { countFileLines } from "../../../integrations/misc/line-counter"
 import { readLines } from "../../../integrations/misc/read-lines"
 import { extractTextFromFile } from "../../../integrations/misc/extract-text"
@@ -23,6 +25,21 @@ vi.mock("path", async () => {
 
 vi.mock("isbinaryfile")
 
+vi.mock("vscode", () => ({
+	Uri: {
+		file: vi.fn((fsPath: string) => ({ fsPath })),
+	},
+	workspace: {
+		// Default: behave like VSCode isn't available in this test environment.
+		openTextDocument: vi.fn().mockRejectedValue(new Error("vscode not available")),
+	},
+}))
+
+// Avoid spawning tokenizer workers from `read-text-with-budget` in unit tests.
+vi.mock("../../../utils/countTokens", () => ({
+	countTokens: vi.fn().mockResolvedValue(1),
+}))
+
 vi.mock("../../../integrations/misc/line-counter")
 vi.mock("../../../integrations/misc/read-lines")
@@ -2011,3 +2028,97 @@ describe("read_file tool concurrent file reads limit", () => {
 		expect(toolResult).toContain("but the concurrent file reads limit is 5")
 	})
 })
+
+describe("read_file tool native protocol - VSCode decoding path", () => {
+	const testFilePath = "test/encoded.txt"
+	const absoluteFilePath = "/test/encoded.txt"
+
+	const mockedCountFileLines = vi.mocked(countFileLines)
+	const mockedReadLines = vi.mocked(readLines)
+	const mockedIsBinaryFile = vi.mocked(isBinaryFile)
+	const mockedPathResolve = vi.mocked(path.resolve)
+	const mockedOpenTextDocument = vi.mocked(vscode.workspace.openTextDocument)
+
+	let mockCline: any
+	let mockProvider: any
+	let toolResult: ToolResponse | undefined
+
+	beforeEach(() => {
+		mockedCountFileLines.mockClear()
+		mockedReadLines.mockClear()
+		mockedIsBinaryFile.mockClear()
+		mockedPathResolve.mockClear()
+		mockedOpenTextDocument.mockClear()
+		mockReadFileWithTokenBudget.mockClear()
+
+		const mocks = createMockCline()
+		mockCline = mocks.mockCline
+		mockProvider = mocks.mockProvider
+		setImageSupport(mockCline, false)
+
+		mockedPathResolve.mockReturnValue(absoluteFilePath)
+		mockedIsBinaryFile.mockResolvedValue(false)
+		mockedCountFileLines.mockResolvedValue(1)
+
+		fsPromises.stat.mockResolvedValue({
+			isDirectory: () => false,
+			isFile: () => true,
+			isSymbolicLink: () => false,
+		} as any)
+
+		mockProvider.getState.mockResolvedValue({
+			maxReadFileLine: -1,
+			maxImageFileSize: 20,
+			maxTotalImageSize: 20,
+		})
+
+		toolResult = undefined
+	})
+
+	async function executeReadFile(args: string): Promise<ToolResponse | undefined> {
+		const toolUse: ReadFileToolUse = {
+			type: "tool_use",
+			name: "read_file",
+			params: { args },
+			partial: false,
+		}
+
+		await readFileTool.handle(mockCline, toolUse, {
+			askApproval: mockCline.ask,
+			handleError: vi.fn(),
+			pushToolResult: (result: ToolResponse) => {
+				toolResult = result
+			},
+			removeClosingTag: (_: ToolParamName, content?: string) => content ?? "",
+			toolProtocol: "xml",
+		})
+
+		return toolResult
+	}
+
+	it("should prefer vscode.workspace.openTextDocument() when available (full read)", async () => {
+		mockedOpenTextDocument.mockResolvedValue({ getText: () => "caf\u00e9" } as any)
+		mockReadFileWithTokenBudget.mockRejectedValue(new Error("should not be called"))
+
+		const result = await executeReadFile(`<file><path>${testFilePath}</path></file>`)
+
+		expect(mockedOpenTextDocument).toHaveBeenCalledTimes(1)
+		expect(result).toContain(`File: ${testFilePath}`)
+		expect(result).toContain("caf\u00e9")
+		expect(mockReadFileWithTokenBudget).not.toHaveBeenCalled()
+	})
+
+	it("should use vscode-decoded text for line_range reads", async () => {
+		mockedCountFileLines.mockResolvedValue(3)
+		mockedOpenTextDocument.mockResolvedValue({ getText: () => "L1\nL2\nL3" } as any)
+		mockedReadLines.mockRejectedValue(new Error("should not be called"))
+
+		const result = await executeReadFile(`<file><path>${testFilePath}</path><line_range>2-3</line_range></file>`)
+
+		expect(mockedOpenTextDocument).toHaveBeenCalledTimes(1)
+		expect(mockedReadLines).not.toHaveBeenCalled()
+		expect(result).toContain(`File: ${testFilePath}`)
+		expect(result).toContain("2 | L2")
+		expect(result).toContain("3 | L3")
+	})
+})
diff --git a/src/integrations/misc/read-text-with-budget.ts b/src/integrations/misc/read-text-with-budget.ts
new file mode 100644
index 00000000000..e9f3acad790
--- /dev/null
+++ b/src/integrations/misc/read-text-with-budget.ts
@@ -0,0 +1,128 @@
+import { Anthropic } from "@anthropic-ai/sdk"
+
+import { countTokens } from "../../utils/countTokens"
+
+export interface ReadTextWithBudgetResult {
+	/** The content read up to the token budget */
+	content: string
+	/** Actual token count of returned content */
+	tokenCount: number
+	/** Total lines in the returned content */
+	lineCount: number
+	/** Whether the entire text was read (false if truncated) */
+	complete: boolean
+}
+
+export interface ReadTextWithBudgetOptions {
+	/** Maximum tokens allowed. Required. */
+	budgetTokens: number
+	/** Number of lines to buffer before token counting (default: 256) */
+	chunkLines?: number
+}
+
+function normalizeTextToLines(text: string): string[] {
+	// Normalize line endings and mirror `readFileWithTokenBudget()` behavior:
+	// - split on line boundaries
+	// - do not include a trailing empty line caused solely by a trailing newline
+	const lines = text.split(/\r?\n/)
+	if (lines.length > 0 && lines[lines.length - 1] === "") {
+		lines.pop()
+	}
+	return lines
+}
+
+async function countTextTokens(text: string): Promise<number> {
+	try {
+		const contentBlocks: Anthropic.Messages.ContentBlockParam[] = [{ type: "text", text }]
+		return await countTokens(contentBlocks)
+	} catch {
+		// Fallback: conservative estimate (2 chars per token)
+		return Math.ceil(text.length / 2)
+	}
+}
+
+/**
+ * Reads text while incrementally counting tokens, stopping when budget is reached.
+ *
+ * This is the in-memory analogue of [`readFileWithTokenBudget()`](src/integrations/misc/read-file-with-budget.ts:35).
+ */
+export async function readTextWithTokenBudget(
+	text: string,
+	options: ReadTextWithBudgetOptions,
+): Promise<ReadTextWithBudgetResult> {
+	const { budgetTokens, chunkLines = 256 } = options
+
+	const allLines = normalizeTextToLines(text)
+	if (allLines.length === 0) {
+		return { content: "", tokenCount: 0, lineCount: 0, complete: true }
+	}
+
+	let content = ""
+	let lineCount = 0
+	let tokenCount = 0
+	let complete = true
+	let lineBuffer: string[] = []
+
+	const processBuffer = async (): Promise<boolean> => {
+		if (lineBuffer.length === 0) return true
+
+		const bufferText = lineBuffer.join("\n")
+		const currentBuffer = [...lineBuffer]
+		lineBuffer = []
+
+		const chunkTokens = await countTextTokens(bufferText)
+
+		if (tokenCount + chunkTokens > budgetTokens) {
+			// Find cutoff within this chunk (binary search by line count)
+			let low = 0
+			let high = currentBuffer.length
+			let bestFit = 0
+			let bestTokens = 0
+
+			while (low < high) {
+				const mid = Math.floor((low + high + 1) / 2)
+				const testContent = currentBuffer.slice(0, mid).join("\n")
+				const testTokens = await countTextTokens(testContent)
+
+				if (tokenCount + testTokens <= budgetTokens) {
+					bestFit = mid
+					bestTokens = testTokens
+					low = mid
+				} else {
+					high = mid - 1
+				}
+			}
+
+			if (bestFit > 0) {
+				const fitContent = currentBuffer.slice(0, bestFit).join("\n")
+				content += (content.length > 0 ? "\n" : "") + fitContent
+				tokenCount += bestTokens
+				lineCount += bestFit
+			}
+
+			complete = false
+			return false
+		}
+
+		content += (content.length > 0 ? "\n" : "") + bufferText
+		tokenCount += chunkTokens
+		lineCount += currentBuffer.length
+		return true
+	}
+
+	for (const line of allLines) {
+		lineBuffer.push(line)
+		if (lineBuffer.length >= chunkLines) {
+			const continueReading = await processBuffer()
+			if (!continueReading) {
+				return { content, tokenCount, lineCount, complete }
+			}
+		}
+	}
+
+	if (lineBuffer.length > 0) {
+		await processBuffer()
+	}
+
+	return { content, tokenCount, lineCount, complete }
+}