From a1aae5db3c4957828edf9abf04154e0a33d3d1ef Mon Sep 17 00:00:00 2001
From: Hannes Rudolph
Date: Wed, 7 Jan 2026 08:27:45 -0700
Subject: [PATCH] fix(read_file): use VSCode decoding for native reads

---
 src/core/tools/ReadFileTool.ts                |  60 ++++++--
 src/core/tools/__tests__/readFileTool.spec.ts | 111 +++++++++++++++
 .../misc/read-text-with-budget.ts             | 128 ++++++++++++++++++
 3 files changed, 290 insertions(+), 9 deletions(-)
 create mode 100644 src/integrations/misc/read-text-with-budget.ts

diff --git a/src/core/tools/ReadFileTool.ts b/src/core/tools/ReadFileTool.ts
index 483d4f00252..f66d5229d6b 100644
--- a/src/core/tools/ReadFileTool.ts
+++ b/src/core/tools/ReadFileTool.ts
@@ -15,6 +15,7 @@ import { getReadablePath } from "../../utils/path"
 import { countFileLines } from "../../integrations/misc/line-counter"
 import { readLines } from "../../integrations/misc/read-lines"
 import { extractTextFromFile, addLineNumbers, getSupportedBinaryFormats } from "../../integrations/misc/extract-text"
+import { readTextWithTokenBudget } from "../../integrations/misc/read-text-with-budget"
 import { parseSourceCodeDefinitionsForFile } from "../../services/tree-sitter"
 import { parseXml } from "../../utils/xml"
 import { resolveToolProtocol } from "../../utils/resolveToolProtocol"
@@ -45,6 +46,28 @@ interface FileResult {
 	feedbackImages?: any[]
 }
 
+function sliceTextLines(text: string, startLine0: number, endLine0: number): string {
+	const lines = text.split(/\r?\n/)
+	// Mirror other readers: if text ends with newline, drop the synthetic last empty line
+	if (lines.length > 0 && lines[lines.length - 1] === "") {
+		lines.pop()
+	}
+	return lines.slice(startLine0, endLine0 + 1).join("\n")
+}
+
+async function tryReadTextViaVscode(fullPath: string): Promise<string | undefined> {
+	try {
+		const vscode = await import("vscode")
+		const uri = vscode.Uri.file(fullPath)
+		const doc = await vscode.workspace.openTextDocument(uri)
+		return doc.getText()
+	} catch {
+		return undefined
+	}
+}
+
+const MAX_VSCODE_TEXT_READ_BYTES = 2 * 1024 * 1024 // avoid loading very large files into memory just to detect encoding
+
 export class ReadFileTool extends BaseTool<"read_file"> {
 	readonly name = "read_file" as const
 
@@ -365,6 +388,16 @@ export class ReadFileTool extends BaseTool<"read_file"> {
 				continue
 			}
 
+			const fileSizeBytes = typeof stats.size === "number" ? stats.size : 0
+			let vscodeText: string | undefined
+			const getVscodeText = async (): Promise<string | undefined> => {
+				if (!useNative) return undefined
+				if (fileSizeBytes > MAX_VSCODE_TEXT_READ_BYTES) return undefined
+				if (vscodeText !== undefined) return vscodeText
+				vscodeText = await tryReadTextViaVscode(fullPath)
+				return vscodeText
+			}
+
 			const [totalLines, isBinary] = await Promise.all([countFileLines(fullPath), isBinaryFile(fullPath)])
 
 			if (isBinary) {
@@ -460,12 +493,14 @@ export class ReadFileTool extends BaseTool<"read_file"> {
 				if (fileResult.lineRanges && fileResult.lineRanges.length > 0) {
 					const rangeResults: string[] = []
 					const nativeRangeResults: string[] = []
+					const maybeText = await getVscodeText()
 
 					for (const range of fileResult.lineRanges) {
-						const content = addLineNumbers(
-							await readLines(fullPath, range.end - 1, range.start - 1),
-							range.start,
-						)
+						const rawRangeText =
+							useNative && maybeText !== undefined
+								? sliceTextLines(maybeText, range.start - 1, range.end - 1)
+								: await readLines(fullPath, range.end - 1, range.start - 1)
+						const content = addLineNumbers(rawRangeText, range.start)
 						const lineRangeAttr = ` lines="${range.start}-${range.end}"`
 						rangeResults.push(`<content${lineRangeAttr}>\n${content}</content>`)
 						nativeRangeResults.push(`Lines ${range.start}-${range.end}:\n${content}`)
@@ -504,7 +539,12 @@ export class ReadFileTool extends BaseTool<"read_file"> {
 			}
 
 			if (maxReadFileLine > 0 && totalLines > maxReadFileLine) {
-				const content = addLineNumbers(await readLines(fullPath, maxReadFileLine - 1, 0))
+				const maybeText = await getVscodeText()
+				const rawText =
+					useNative && maybeText !== undefined
+						? sliceTextLines(maybeText, 0, maxReadFileLine - 1)
+						: await readLines(fullPath, maxReadFileLine - 1, 0)
+				const content = addLineNumbers(rawText)
 				const lineRangeAttr = ` lines="1-${maxReadFileLine}"`
 				let xmlInfo = `<content${lineRangeAttr}>\n${content}\n</content>\n`
 				let nativeInfo = `Lines 1-${maxReadFileLine}:\n${content}\n`
@@ -566,10 +606,12 @@ export class ReadFileTool extends BaseTool<"read_file"> {
 					xmlInfo = `<notice>\n${notice}\n</notice>\n`
 					nativeInfo = `Note: ${notice}`
 				} else {
-					// Read file with incremental token counting
-					const result = await readFileWithTokenBudget(fullPath, {
-						budgetTokens: safeReadBudget,
-					})
+					// Prefer VSCode decoding (encoding-aware) for native tool protocol.
+					const maybeText = await getVscodeText()
+					const result =
+						useNative && maybeText !== undefined
+							? await readTextWithTokenBudget(maybeText, { budgetTokens: safeReadBudget })
+							: await readFileWithTokenBudget(fullPath, { budgetTokens: safeReadBudget })
 
 					content = addLineNumbers(result.content)
diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts
index f178e38026c..ce5cc167dca 100644
--- a/src/core/tools/__tests__/readFileTool.spec.ts
+++ b/src/core/tools/__tests__/readFileTool.spec.ts
@@ -2,6 +2,8 @@
 
 import * as path from "path"
 
+import * as vscode from "vscode"
+
 import { countFileLines } from "../../../integrations/misc/line-counter"
 import { readLines } from "../../../integrations/misc/read-lines"
 import { extractTextFromFile } from "../../../integrations/misc/extract-text"
@@ -23,6 +25,21 @@ vi.mock("path", async () => {
 
 vi.mock("isbinaryfile")
 
+vi.mock("vscode", () => ({
+	Uri: {
+		file: vi.fn((fsPath: string) => ({ fsPath })),
+	},
+	workspace: {
+		// Default: behave like VSCode isn't available in this test environment.
+		openTextDocument: vi.fn().mockRejectedValue(new Error("vscode not available")),
+	},
+}))
+
+// Avoid spawning tokenizer workers from `read-text-with-budget` in unit tests.
+vi.mock("../../../utils/countTokens", () => ({
+	countTokens: vi.fn().mockResolvedValue(1),
+}))
+
 vi.mock("../../../integrations/misc/line-counter")
 vi.mock("../../../integrations/misc/read-lines")
@@ -2011,3 +2028,97 @@ describe("read_file tool concurrent file reads limit", () => {
 		expect(toolResult).toContain("but the concurrent file reads limit is 5")
 	})
 })
+
+describe("read_file tool native protocol - VSCode decoding path", () => {
+	const testFilePath = "test/encoded.txt"
+	const absoluteFilePath = "/test/encoded.txt"
+
+	const mockedCountFileLines = vi.mocked(countFileLines)
+	const mockedReadLines = vi.mocked(readLines)
+	const mockedIsBinaryFile = vi.mocked(isBinaryFile)
+	const mockedPathResolve = vi.mocked(path.resolve)
+	const mockedOpenTextDocument = vi.mocked(vscode.workspace.openTextDocument)
+
+	let mockCline: any
+	let mockProvider: any
+	let toolResult: ToolResponse | undefined
+
+	beforeEach(() => {
+		mockedCountFileLines.mockClear()
+		mockedReadLines.mockClear()
+		mockedIsBinaryFile.mockClear()
+		mockedPathResolve.mockClear()
+		mockedOpenTextDocument.mockClear()
+		mockReadFileWithTokenBudget.mockClear()
+
+		const mocks = createMockCline()
+		mockCline = mocks.mockCline
+		mockProvider = mocks.mockProvider
+		setImageSupport(mockCline, false)
+
+		mockedPathResolve.mockReturnValue(absoluteFilePath)
+		mockedIsBinaryFile.mockResolvedValue(false)
+		mockedCountFileLines.mockResolvedValue(1)
+
+		fsPromises.stat.mockResolvedValue({
+			isDirectory: () => false,
+			isFile: () => true,
+			isSymbolicLink: () => false,
+		} as any)
+
+		mockProvider.getState.mockResolvedValue({
+			maxReadFileLine: -1,
+			maxImageFileSize: 20,
+			maxTotalImageSize: 20,
+		})
+
+		toolResult = undefined
+	})
+
+	async function executeReadFile(args: string): Promise<ToolResponse | undefined> {
+		const toolUse: ReadFileToolUse = {
+			type: "tool_use",
+			name: "read_file",
+			params: { args },
+			partial: false,
+		}
+
+		await readFileTool.handle(mockCline, toolUse, {
+			askApproval: mockCline.ask,
+			handleError: vi.fn(),
+			pushToolResult: (result: ToolResponse) => {
+				toolResult = result
+			},
+			removeClosingTag: (_: ToolParamName, content?: string) => content ?? "",
+			toolProtocol: "xml",
+		})
+
+		return toolResult
+	}
+
+	it("should prefer vscode.workspace.openTextDocument() when available (full read)", async () => {
+		mockedOpenTextDocument.mockResolvedValue({ getText: () => "caf\u00e9" } as any)
+		mockReadFileWithTokenBudget.mockRejectedValue(new Error("should not be called"))
+
+		const result = await executeReadFile(`<file><path>${testFilePath}</path></file>`)
+
+		expect(mockedOpenTextDocument).toHaveBeenCalledTimes(1)
+		expect(result).toContain(`File: ${testFilePath}`)
+		expect(result).toContain("caf\u00e9")
+		expect(mockReadFileWithTokenBudget).not.toHaveBeenCalled()
+	})
+
+	it("should use vscode-decoded text for line_range reads", async () => {
+		mockedCountFileLines.mockResolvedValue(3)
+		mockedOpenTextDocument.mockResolvedValue({ getText: () => "L1\nL2\nL3" } as any)
+		mockedReadLines.mockRejectedValue(new Error("should not be called"))
+
+		const result = await executeReadFile(`<file><path>${testFilePath}</path><line_range>2-3</line_range></file>`)
+
+		expect(mockedOpenTextDocument).toHaveBeenCalledTimes(1)
+		expect(mockedReadLines).not.toHaveBeenCalled()
+		expect(result).toContain(`File: ${testFilePath}`)
+		expect(result).toContain("2 | L2")
+		expect(result).toContain("3 | L3")
+	})
+})
diff --git a/src/integrations/misc/read-text-with-budget.ts b/src/integrations/misc/read-text-with-budget.ts
new file mode 100644
index 00000000000..e9f3acad790
--- /dev/null
+++ b/src/integrations/misc/read-text-with-budget.ts
@@ -0,0 +1,128 @@
+import { Anthropic } from "@anthropic-ai/sdk"
+
+import { countTokens } from "../../utils/countTokens"
+
+export interface ReadTextWithBudgetResult {
+	/** The content read up to the token budget */
+	content: string
+	/** Actual token count of returned content */
+	tokenCount: number
+	/** Total lines in the returned content */
+	lineCount: number
+	/** Whether the entire text was read (false if truncated) */
+	complete: boolean
+}
+
+export interface ReadTextWithBudgetOptions {
+	/** Maximum tokens allowed. Required. */
+	budgetTokens: number
+	/** Number of lines to buffer before token counting (default: 256) */
+	chunkLines?: number
+}
+
+function normalizeTextToLines(text: string): string[] {
+	// Normalize line endings and mirror `readFileWithTokenBudget()` behavior:
+	// - split on line boundaries
+	// - do not include a trailing empty line caused solely by a trailing newline
+	const lines = text.split(/\r?\n/)
+	if (lines.length > 0 && lines[lines.length - 1] === "") {
+		lines.pop()
+	}
+	return lines
+}
+
+async function countTextTokens(text: string): Promise<number> {
+	try {
+		const contentBlocks: Anthropic.Messages.ContentBlockParam[] = [{ type: "text", text }]
+		return await countTokens(contentBlocks)
+	} catch {
+		// Fallback: conservative estimate (2 chars per token)
+		return Math.ceil(text.length / 2)
+	}
+}
+
+/**
+ * Reads text while incrementally counting tokens, stopping when budget is reached.
+ *
+ * This is the in-memory analogue of [`readFileWithTokenBudget()`](src/integrations/misc/read-file-with-budget.ts:35).
+ */
+export async function readTextWithTokenBudget(
+	text: string,
+	options: ReadTextWithBudgetOptions,
+): Promise<ReadTextWithBudgetResult> {
+	const { budgetTokens, chunkLines = 256 } = options
+
+	const allLines = normalizeTextToLines(text)
+	if (allLines.length === 0) {
+		return { content: "", tokenCount: 0, lineCount: 0, complete: true }
+	}
+
+	let content = ""
+	let lineCount = 0
+	let tokenCount = 0
+	let complete = true
+	let lineBuffer: string[] = []
+
+	const processBuffer = async (): Promise<boolean> => {
+		if (lineBuffer.length === 0) return true
+
+		const bufferText = lineBuffer.join("\n")
+		const currentBuffer = [...lineBuffer]
+		lineBuffer = []
+
+		const chunkTokens = await countTextTokens(bufferText)
+
+		if (tokenCount + chunkTokens > budgetTokens) {
+			// Find cutoff within this chunk (binary search by line count)
+			let low = 0
+			let high = currentBuffer.length
+			let bestFit = 0
+			let bestTokens = 0
+
+			while (low < high) {
+				const mid = Math.floor((low + high + 1) / 2)
+				const testContent = currentBuffer.slice(0, mid).join("\n")
+				const testTokens = await countTextTokens(testContent)
+
+				if (tokenCount + testTokens <= budgetTokens) {
+					bestFit = mid
+					bestTokens = testTokens
+					low = mid
+				} else {
+					high = mid - 1
+				}
+			}
+
+			if (bestFit > 0) {
+				const fitContent = currentBuffer.slice(0, bestFit).join("\n")
+				content += (content.length > 0 ? "\n" : "") + fitContent
+				tokenCount += bestTokens
+				lineCount += bestFit
+			}
+
+			complete = false
+			return false
+		}
+
+		content += (content.length > 0 ? "\n" : "") + bufferText
+		tokenCount += chunkTokens
+		lineCount += currentBuffer.length
+		return true
+	}
+
+	for (const line of allLines) {
+		lineBuffer.push(line)
+		if (lineBuffer.length >= chunkLines) {
+			const continueReading = await processBuffer()
+			if (!continueReading) {
+				return { content, tokenCount, lineCount, complete }
+			}
+		}
+	}
+
+	if (lineBuffer.length > 0) {
+		await processBuffer()
+	}
+
+	return { content, tokenCount, lineCount, complete }
+}