vscode/extensions/copilot/script/analyzeEdits.ts

/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { promises as fs } from 'fs';
import * as path from 'path';
import * as readline from 'readline';

// Tool names whose calls are counted as edit operations by this script.
const EDIT_TOOL_NAMES: string[] = [
	'insert_edit_into_file',
	'replace_string_in_file',
	'multi_replace_string_in_file',
	'apply_patch',
];

// Tool names that, when seen between a failed edit and the next edit,
// cause that next edit to be treated as a retry of the failed one.
const CONTINUATION_TOOL_NAMES: string[] = [
	'read_file',
];

// A single text replacement recorded for a file.
interface ToolCallReplacement {
	replaceRange: { start: number; endExclusive: number };
	newText: string;
}

// The replacements a tool call recorded against one file path.
interface ToolCallFileEdit {
	path: string;
	edits: {
		replacements: ToolCallReplacement[];
	};
}

// One entry of a trajectory file: the tool that was invoked, optional token
// counts, the tool's response and, optionally, the file edits it recorded.
interface ToolCall {
	tool: string;
	response: string | string[];
	edits?: ToolCallFileEdit[];
	input_tokens?: number;
	cached_input_tokens?: number;
	output_tokens?: number;
}

// One edit-tool call extracted from a trajectory, together with its outcome.
interface EditOperation {
	// Zero-based position of this edit among the edit calls of its conversation.
	turnIndex: number;
	// Name of the edit tool that was invoked.
	toolName: string;
	// Path of the first file the tool call recorded edits for, when it recorded any.
	filePath?: string;
	// ISO timestamp taken when the record was built; the trajectory itself has none.
	timestamp: string;
	// Whether the tool's response reported a successful edit.
	success: boolean;
	// True on a FAILED edit that was followed by a continuation tool and then
	// another edit. It marks the failed attempt, not the retry itself.
	isRetry: boolean;
	// Outcome of that follow-up edit; left undefined when no retry was detected.
	retrySucceeded?: boolean;
}

// Edit statistics collected for one conversation directory.
interface ConversationAnalysis {
	// Directory the conversation was read from.
	conversationPath: string;
	// Model name, when one is known for this conversation.
	modelName?: string;
	// Every edit-tool call found in the conversation's trajectory, in order.
	edits: EditOperation[];

	// Raw counts over `edits`.
	totalEdits: number;
	successfulEdits: number;
	failedEdits: number;

	// Numerator and denominator of the retry-aware success rate.
	successfulEditsWithRetries: number;
	totalUniqueEdits: number;
}

// Aggregated edit statistics for one run (all of its conversations).
interface RunAnalysis {
	// Name of the run directory that was analyzed.
	runId: string;
	// First model name found among the conversations, if any has one.
	modelName?: string;
	// Conversations of the run that contained at least one edit.
	conversations: ConversationAnalysis[];

	// Sum of the per-conversation edit counts.
	totalEdits: number;
	// Sum of the per-conversation retry-aware edit counts.
	totalUniqueEdits: number;

	// Successful edits divided by total edits, as a fraction (0 when there are no edits).
	successRate: number;
	// Retry-aware successes divided by `totalUniqueEdits`, as a fraction (0 when that is 0).
	successRateWithRetries: number;
}

// Lists the run IDs found directly under `amlOutPath`.
// A run is any sub-directory whose name consists only of digits; the result
// is ordered by numeric value, largest (newest) first.
async function listRuns(amlOutPath: string): Promise<string[]> {
	const numericName = /^\d+$/;
	const runIds: string[] = [];

	for (const dirent of await fs.readdir(amlOutPath, { withFileTypes: true })) {
		if (dirent.isDirectory() && numericName.test(dirent.name)) {
			runIds.push(dirent.name);
		}
	}

	// Descending numeric order so the newest run comes first.
	runIds.sort((left, right) => parseInt(right) - parseInt(left));
	return runIds;
}

// Prints up to the ten first entries of `runs`, asks on stdin which one to use
// and resolves with the chosen run ID.
// An empty answer selects the first run; an answer that does not parse to a
// valid 1-based index prints a notice and also falls back to the first run.
async function promptUserForRun(runs: string[]): Promise<string> {
	console.log('\nAvailable test runs (newest first):');
	const shown = runs.slice(0, 10);
	for (let n = 0; n < shown.length; n++) {
		console.log(`  ${n + 1}. ${shown[n]}`);
	}
	if (runs.length > 10) {
		console.log(`  ... and ${runs.length - 10} more`);
	}

	const prompt = readline.createInterface({
		input: process.stdin,
		output: process.stdout
	});

	const answer = await new Promise<string>(resolve => {
		prompt.question('\nEnter run number (or press Enter for the most recent): ', reply => {
			// Release stdin as soon as the single answer has been received.
			prompt.close();
			resolve(reply);
		});
	});

	const choice = answer.trim();
	if (choice === '') {
		return runs[0];
	}

	// The menu is 1-based; the array is 0-based.
	const index = parseInt(choice) - 1;
	if (index >= 0 && index < runs.length) {
		return runs[index];
	}

	console.log('Invalid selection, using most recent run.');
	return runs[0];
}

// Substring whose presence in a tool response is treated as a successful edit.
const EDIT_SUCCESS_MARKER = 'successfully edited';

// Number of tool calls after a failed edit that are scanned when looking for a retry.
const RETRY_LOOKAHEAD_LIMIT = 10;

// Result reported for a conversation whose trajectory could not be used.
function emptyConversationAnalysis(conversationPath: string): ConversationAnalysis {
	return {
		conversationPath,
		edits: [],
		totalEdits: 0,
		successfulEdits: 0,
		failedEdits: 0,
		successfulEditsWithRetries: 0,
		totalUniqueEdits: 0
	};
}

// True when the (first) response of the tool call contains the success marker.
function isSuccessfulEditResponse(call: ToolCall): boolean {
	const first = Array.isArray(call.response) ? call.response[0] : call.response;
	return typeof first === 'string' && first.includes(EDIT_SUCCESS_MARKER);
}

// Looks for the pattern "failed edit -> continuation tool -> another edit"
// starting right after `failedIndex`.
// Returns whether the follow-up edit succeeded, or undefined when the next
// edit was not preceded by a continuation tool or no edit was found within
// the lookahead window.
function findRetryOutcome(toolCalls: ToolCall[], failedIndex: number): boolean | undefined {
	const lastIndex = Math.min(toolCalls.length - 1, failedIndex + RETRY_LOOKAHEAD_LIMIT);
	let sawContinuationTool = false;

	for (let j = failedIndex + 1; j <= lastIndex; j++) {
		const name = toolCalls[j].tool;
		if (CONTINUATION_TOOL_NAMES.includes(name)) {
			sawContinuationTool = true;
		} else if (EDIT_TOOL_NAMES.includes(name)) {
			// The first edit after the failure decides: it is a retry only when
			// a continuation tool ran in between.
			return sawContinuationTool ? isSuccessfulEditResponse(toolCalls[j]) : undefined;
		}
	}
	return undefined;
}

// Reads `<conversationPath>/trajectories/trajectory.json`, extracts every
// edit-tool call and computes per-conversation success statistics.
//
// A missing, unreadable or malformed trajectory is not an error: a warning is
// logged and an analysis with zero edits is returned.
//
// Retry accounting: a failed edit that is followed by a retry is considered
// superseded by that retry. `totalUniqueEdits` therefore excludes superseded
// failures, and `successfulEditsWithRetries` counts each successful edit once
// (the retry itself is already part of `edits`).
async function analyzeConversation(conversationPath: string): Promise<ConversationAnalysis> {
	const trajectoryPath = path.join(conversationPath, 'trajectories', 'trajectory.json');

	let parsed: unknown;
	try {
		parsed = JSON.parse(await fs.readFile(trajectoryPath, 'utf-8'));
	} catch (error) {
		console.warn(`Could not read trajectory file: ${trajectoryPath}`);
		return emptyConversationAnalysis(conversationPath);
	}

	// JSON.parse can legally yield null, an object or a primitive; only an
	// array of tool calls is usable here.
	if (!Array.isArray(parsed)) {
		console.warn(`Trajectory file does not contain a list of tool calls: ${trajectoryPath}`);
		return emptyConversationAnalysis(conversationPath);
	}

	// Drop entries that cannot be tool calls so that later property access is safe.
	const toolCalls = parsed.filter((entry): entry is ToolCall =>
		typeof entry === 'object' && entry !== null && typeof entry.tool === 'string'
	);

	const edits: EditOperation[] = [];

	for (let i = 0; i < toolCalls.length; i++) {
		const toolCall = toolCalls[i];
		if (!EDIT_TOOL_NAMES.includes(toolCall.tool)) {
			continue;
		}

		const success = isSuccessfulEditResponse(toolCall);

		// Only the first edited file is reported.
		const filePath = toolCall.edits && toolCall.edits.length > 0
			? toolCall.edits[0].path
			: undefined;

		// Retries are only looked for after a failure.
		const retrySucceeded = success ? undefined : findRetryOutcome(toolCalls, i);

		edits.push({
			toolName: toolCall.tool,
			timestamp: new Date().toISOString(), // Trajectory doesn't have timestamps, use current time
			success,
			filePath,
			turnIndex: edits.length,
			isRetry: retrySucceeded !== undefined,
			retrySucceeded
		});
	}

	const successfulEdits = edits.filter(e => e.success).length;

	// Failed attempts that were superseded by a later retry do not count as
	// separate edits when only the final outcome matters.
	const supersededFailures = edits.filter(e => !e.success && e.isRetry).length;
	const totalUniqueEdits = edits.length - supersededFailures;

	// Every successful retry is itself an entry of `edits`, so it is already
	// included in `successfulEdits` and must not be added a second time.
	const successfulEditsWithRetries = successfulEdits;

	return {
		conversationPath,
		edits,
		totalEdits: edits.length,
		successfulEdits,
		failedEdits: edits.length - successfulEdits,
		successfulEditsWithRetries,
		totalUniqueEdits,
		// The trajectory entries read here carry no model name.
		modelName: undefined
	};
}

// Analyzes every sub-directory of `<basePath>/<runId>` as a conversation and
// aggregates the results.
// Conversations without any edit are left out. If the run directory cannot
// be processed the error is logged and whatever was collected so far is used.
async function analyzeRun(runId: string, basePath: string): Promise<RunAnalysis> {
	const runPath = path.join(basePath, runId);
	const conversations: ConversationAnalysis[] = [];

	try {
		const dirents = await fs.readdir(runPath, { withFileTypes: true });
		for (const dirent of dirents) {
			if (!dirent.isDirectory()) {
				continue;
			}
			const analysis = await analyzeConversation(path.join(runPath, dirent.name));
			if (analysis.totalEdits > 0) {
				conversations.push(analysis);
			}
		}
	} catch (error) {
		console.error(`Error reading run directory: ${error}`);
	}

	// Accumulate all totals in a single pass over the conversations.
	let totalEdits = 0;
	let totalSuccessful = 0;
	let totalSuccessfulWithRetries = 0;
	let totalUniqueEdits = 0;
	let modelName: string | undefined;

	for (const conversation of conversations) {
		totalEdits += conversation.totalEdits;
		totalSuccessful += conversation.successfulEdits;
		totalSuccessfulWithRetries += conversation.successfulEditsWithRetries;
		totalUniqueEdits += conversation.totalUniqueEdits;

		// Keep the model name of the first conversation that has one.
		if (modelName === undefined && conversation.modelName) {
			modelName = conversation.modelName;
		}
	}

	// Rates are fractions; an empty denominator yields 0 instead of NaN.
	const successRate = totalEdits > 0 ? totalSuccessful / totalEdits : 0;
	const successRateWithRetries = totalUniqueEdits > 0 ? totalSuccessfulWithRetries / totalUniqueEdits : 0;

	return {
		runId,
		conversations,
		totalEdits,
		successRate,
		successRateWithRetries,
		totalUniqueEdits,
		modelName
	};
}

// Escapes the characters that are significant in HTML text and attribute values.
function escapeHtml(value: unknown): string {
	return String(value)
		.replace(/&/g, '&amp;')
		.replace(/</g, '&lt;')
		.replace(/>/g, '&gt;')
		.replace(/"/g, '&quot;')
		.replace(/'/g, '&#39;');
}

// Serializes a value as JSON that can be inlined inside a <script> element.
// Characters that could end the element or confuse the HTML parser are
// emitted as \uXXXX escapes, which JavaScript decodes back to the original text.
function serializeForInlineScript(value: unknown): string {
	return JSON.stringify(value)
		.replace(/</g, '\\u003c')
		.replace(/>/g, '\\u003e')
		.replace(/&/g, '\\u0026')
		.replace(/\u2028/g, '\\u2028')
		.replace(/\u2029/g, '\\u2029');
}

// Renders the analysis of one run as a self-contained HTML page: summary
// cards, a Sankey diagram (drawn in the browser with d3 loaded from unpkg)
// and a table with one row per edit.
//
// analysis       - aggregated run data to render.
// outputPath     - not used by this function; kept for the existing call signature.
// includeRetries - initial state of the "Include retries" checkbox and diagram.
//
// Returns the HTML document as a string. All values taken from the analysis
// (run id, model name, directory names, tool names, file paths) are escaped
// before they are placed into markup or inline script.
function generateHTML(analysis: RunAnalysis, outputPath: string, includeRetries: boolean = false): string {
	// Build Sankey data
	const sankeyNodes: string[] = [];
	const sankeyLinks: Array<{ source: number; target: number; value: number }> = [];

	const nodeMap = new Map<string, number>();

	const getNodeIndex = (name: string): number => {
		if (!nodeMap.has(name)) {
			nodeMap.set(name, sankeyNodes.length);
			sankeyNodes.push(name);
		}
		return nodeMap.get(name)!;
	};

	// Track flows
	const flows = new Map<string, number>();

	for (const conv of analysis.conversations) {
		for (const edit of conv.edits) {
			const toolNode = edit.toolName;

			// Check if this is a failed edit with a retry
			if (includeRetries && !edit.success && edit.isRetry && edit.retrySucceeded !== undefined) {
				// Show full retry flow: Tool -> Failed -> read_file -> Retry Edit -> Final Result
				const failedNode = 'Failed (will retry)';
				const readFileNode = 'read_file';
				const retryEditNode = `${toolNode} (retry)`;
				const finalResult = edit.retrySucceeded ? 'Success' : 'Failed';

				flows.set(`${toolNode}->${failedNode}`, (flows.get(`${toolNode}->${failedNode}`) || 0) + 1);
				flows.set(`${failedNode}->${readFileNode}`, (flows.get(`${failedNode}->${readFileNode}`) || 0) + 1);
				flows.set(`${readFileNode}->${retryEditNode}`, (flows.get(`${readFileNode}->${retryEditNode}`) || 0) + 1);
				flows.set(`${retryEditNode}->${finalResult}`, (flows.get(`${retryEditNode}->${finalResult}`) || 0) + 1);
				continue;
			}

			// Tool -> Success/Fail
			const resultNode = edit.success ? 'Success' : 'Failed';
			const flowKey = `${toolNode}->${resultNode}`;
			flows.set(flowKey, (flows.get(flowKey) || 0) + 1);
		}
	}

	// Convert flows to Sankey links
	for (const [flowKey, count] of flows.entries()) {
		const [source, target] = flowKey.split('->');
		sankeyLinks.push({
			source: getNodeIndex(source),
			target: getNodeIndex(target),
			value: count
		});
	}

	// Build table rows
	const tableRows = analysis.conversations.flatMap(conv =>
		conv.edits.map(edit => ({
			conversation: path.basename(conv.conversationPath),
			toolName: edit.toolName,
			timestamp: edit.timestamp,
			success: edit.success,
			turnIndex: edit.turnIndex,
			isRetry: edit.isRetry,
			retrySucceeded: edit.retrySucceeded,
			filePath: edit.filePath
		}))
	);

	// Shared by <title> and <h1>; escaped once here.
	const heading = escapeHtml(`Run ${analysis.runId}${analysis.modelName ? ' - ' + analysis.modelName : ''}`);

	// Note: `sankeyData` below is embedded in the page but the page script
	// rebuilds nodes and links from the table rows inside drawSankey().
	const html = `<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>${heading}</title>
	<script src="https://unpkg.com/d3@7/dist/d3.min.js"></script>
	<script src="https://unpkg.com/d3-sankey@0.12.3/dist/d3-sankey.min.js"></script>
	<style>
		* {
			box-sizing: border-box;
		}

		body {
			font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
			margin: 0;
			padding: 20px;
			background: #f5f5f5;
			color: #333;
		}

		.container {
			max-width: 1400px;
			margin: 0 auto;
			background: white;
			padding: 30px;
			border-radius: 8px;
			box-shadow: 0 2px 8px rgba(0,0,0,0.1);
		}

		h1 {
			margin: 0 0 10px 0;
			color: #1a1a1a;
		}

		.stats {
			display: grid;
			grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
			gap: 15px;
			margin: 20px 0;
		}

		.stat-card {
			background: #f8f9fa;
			padding: 15px;
			border-radius: 6px;
			border-left: 4px solid #0969da;
		}

		.stat-label {
			font-size: 12px;
			text-transform: uppercase;
			color: #666;
			margin-bottom: 5px;
		}

		.stat-value {
			font-size: 24px;
			font-weight: 600;
			color: #1a1a1a;
		}

		.controls {
			margin: 20px 0;
			padding: 15px;
			background: #f8f9fa;
			border-radius: 6px;
		}

		.controls label {
			display: inline-flex;
			align-items: center;
			cursor: pointer;
			font-size: 14px;
		}

		.controls input[type="checkbox"] {
			margin-right: 8px;
			width: 18px;
			height: 18px;
			cursor: pointer;
		}

		#sankey-diagram {
			margin: 30px 0;
			overflow-x: auto;
		}

		.table-container {
			margin-top: 30px;
			overflow-x: auto;
		}

		table {
			width: 100%;
			border-collapse: collapse;
			font-size: 14px;
		}

		thead {
			background: #f8f9fa;
		}

		th {
			text-align: left;
			padding: 12px;
			font-weight: 600;
			color: #1a1a1a;
			border-bottom: 2px solid #dee2e6;
		}

		td {
			padding: 10px 12px;
			border-bottom: 1px solid #dee2e6;
		}

		tbody tr:hover {
			background: #f8f9fa;
		}

		.badge {
			display: inline-block;
			padding: 3px 8px;
			border-radius: 12px;
			font-size: 12px;
			font-weight: 500;
		}

		.badge-success {
			background: #d1f4e0;
			color: #0f6d31;
		}

		.badge-failed {
			background: #ffd8d8;
			color: #d1242f;
		}

		.sankey-node rect {
			cursor: pointer;
			fill-opacity: 0.9;
		}

		.sankey-node rect:hover {
			fill-opacity: 1;
		}

		.sankey-link {
			fill: none;
			stroke-opacity: 0.3;
		}

		.sankey-link:hover {
			stroke-opacity: 0.5;
		}

		.sankey-node text {
			pointer-events: none;
			font-size: 12px;
			fill: #1a1a1a;
		}
	</style>
</head>
<body>
	<div class="container">
		<h1>🔧 ${heading}</h1>
		<p style="color: #666; margin: 5px 0 0 0;">Analysis of edit tool operations and success rates</p>

		<div class="stats">
			<div class="stat-card">
				<div class="stat-label">Total Edits</div>
				<div class="stat-value">${analysis.totalEdits}</div>
			</div>
			<div class="stat-card" style="border-left-color: #2da44e;">
				<div class="stat-label">Success Rate</div>
				<div class="stat-value" id="success-rate-value">${(analysis.successRate * 100).toFixed(1)}%</div>
			</div>
			<div class="stat-card" style="border-left-color: #8250df;">
				<div class="stat-label">Conversations</div>
				<div class="stat-value">${analysis.conversations.length}</div>
			</div>
		</div>

		<div class="controls">
			<label>
				<input type="checkbox" id="includeRetries" ${includeRetries ? 'checked' : ''}>
				Include retries (show re-evaluate → retry flows)
			</label>
		</div>

		<div id="sankey-diagram"></div>

		<h2 style="margin-top: 40px;">Edit Operations</h2>
		<div class="table-container">
			<table>
				<thead>
					<tr>
						<th>Conversation</th>
						<th>Tool</th>
						<th>Turn</th>
						<th>File</th>
						<th>Status</th>
						<th>Retry</th>
					</tr>
				</thead>
				<tbody>
					${tableRows.map(row => `
						<tr>
							<td>${escapeHtml(row.conversation)}</td>
							<td><code style="background: #f6f8fa; padding: 2px 6px; border-radius: 3px; font-size: 12px;">${escapeHtml(row.toolName)}</code></td>
							<td>${row.turnIndex}</td>
							<td style="color: #666; font-size: 12px; max-width: 300px; overflow: hidden; text-overflow: ellipsis;">${escapeHtml(row.filePath || '-')}</td>
							<td><span class="badge ${row.success ? 'badge-success' : 'badge-failed'}">${row.success ? '✓ Success' : '✗ Failed'}</span></td>
							<td>${row.isRetry ? (row.retrySucceeded === true ? '<span class="badge badge-success">✓ Retry Success</span>' : row.retrySucceeded === false ? '<span class="badge badge-failed">✗ Retry Failed</span>' : '<span class="badge" style="background: #e3e3e3; color: #666;">Retry Pending</span>') : '-'}</td>
						</tr>
					`).join('')}
				</tbody>
			</table>
		</div>
	</div>

	<script>
		const sankeyData = {
			nodes: ${serializeForInlineScript(sankeyNodes.map(name => ({ name })))},
			links: ${serializeForInlineScript(sankeyLinks)}
		};
		const analysisData = {
			successRate: ${analysis.successRate},
			successRateWithRetries: ${analysis.successRateWithRetries},
			totalEdits: ${analysis.totalEdits},
			totalUniqueEdits: ${analysis.totalUniqueEdits}
		};

		function drawSankey(includeRetries) {
			// Clear previous diagram
			d3.select('#sankey-diagram').html('');

			// Rebuild data based on includeRetries flag
			const allEdits = ${serializeForInlineScript(tableRows)};
			const nodes = [];
			const links = [];
			const nodeMap = new Map();

			const getNodeIndex = (name) => {
				if (!nodeMap.has(name)) {
					nodeMap.set(name, nodes.length);
					nodes.push({ name });
				}
				return nodeMap.get(name);
			};

			const flows = new Map();

			for (const edit of allEdits) {
				const toolNode = edit.toolName;

				// Check if this is a failed edit with a retry
				if (includeRetries && !edit.success && edit.isRetry && edit.retrySucceeded !== undefined) {
					// Show full retry flow
					const failedNode = 'Failed (will retry)';
					const readFileNode = 'read_file';
					const retryEditNode = toolNode + ' (retry)';
					const finalResult = edit.retrySucceeded ? 'Success' : 'Failed';

					flows.set(toolNode + '->' + failedNode, (flows.get(toolNode + '->' + failedNode) || 0) + 1);
					flows.set(failedNode + '->' + readFileNode, (flows.get(failedNode + '->' + readFileNode) || 0) + 1);
					flows.set(readFileNode + '->' + retryEditNode, (flows.get(readFileNode + '->' + retryEditNode) || 0) + 1);
					flows.set(retryEditNode + '->' + finalResult, (flows.get(retryEditNode + '->' + finalResult) || 0) + 1);
					continue;
				}

				const resultNode = edit.success ? 'Success' : 'Failed';
				const flowKey = toolNode + '->' + resultNode;
				flows.set(flowKey, (flows.get(flowKey) || 0) + 1);
			}

			for (const [flowKey, count] of flows.entries()) {
				const [source, target] = flowKey.split('->');
				links.push({
					source: getNodeIndex(source),
					target: getNodeIndex(target),
					value: count
				});
			}

			const width = Math.max(800, document.getElementById('sankey-diagram').offsetWidth);
			const height = 500;

			const svg = d3.select('#sankey-diagram')
				.append('svg')
				.attr('width', width)
				.attr('height', height);

			const sankey = d3.sankey()
				.nodeWidth(15)
				.nodePadding(10)
				.extent([[1, 1], [width - 1, height - 5]]);

			const graph = sankey({
				nodes: nodes.map(d => Object.assign({}, d)),
				links: links.map(d => Object.assign({}, d))
			});

			const colorScale = d3.scaleOrdinal()
				.domain(['insert_edit_into_file', 'replace_string_in_file', 'multi_replace_string_in_file', 'apply_patch', 'read_file', 'Failed (will retry)', 'Success', 'Failed'])
				.range(['#bf8700', '#0969da', '#8250df', '#1b7c83', '#a855f7', '#ff9800', '#2da44e', '#d1242f']);

			// Links
			svg.append('g')
				.attr('class', 'links')
				.selectAll('path')
				.data(graph.links)
				.enter()
				.append('path')
				.attr('class', 'sankey-link')
				.attr('d', d3.sankeyLinkHorizontal())
				.attr('stroke', d => colorScale(d.source.name))
				.attr('stroke-width', d => Math.max(1, d.width));

			// Nodes
			const node = svg.append('g')
				.attr('class', 'nodes')
				.selectAll('g')
				.data(graph.nodes)
				.enter()
				.append('g')
				.attr('class', 'sankey-node');

			node.append('rect')
				.attr('x', d => d.x0)
				.attr('y', d => d.y0)
				.attr('height', d => d.y1 - d.y0)
				.attr('width', d => d.x1 - d.x0)
				.attr('fill', d => colorScale(d.name))
				.append('title')
				.text(d => d.name + '\\n' + d.value + ' edits');

			node.append('text')
				.attr('x', d => d.x0 < width / 2 ? d.x1 + 6 : d.x0 - 6)
				.attr('y', d => (d.y1 + d.y0) / 2)
				.attr('dy', '0.35em')
				.attr('text-anchor', d => d.x0 < width / 2 ? 'start' : 'end')
				.text(d => d.name + ' (' + d.value + ')');
		}

		// Initial draw
		drawSankey(${includeRetries});

		// Update success rate display
		function updateSuccessRate(includeRetries) {
			const rate = includeRetries ? analysisData.successRateWithRetries : analysisData.successRate;
			document.getElementById('success-rate-value').textContent = (rate * 100).toFixed(1) + '%';
		}

		// Handle checkbox change
		document.getElementById('includeRetries').addEventListener('change', (e) => {
			drawSankey(e.target.checked);
			updateSuccessRate(e.target.checked);
		});

		// Redraw on window resize
		let resizeTimer;
		window.addEventListener('resize', () => {
			clearTimeout(resizeTimer);
			resizeTimer = setTimeout(() => {
				const includeRetries = document.getElementById('includeRetries').checked;
				drawSankey(includeRetries);
			}, 250);
		});
	</script>
</body>
</html>`;

	return html;
}

// Directory that holds one sub-directory per benchmark run when no
// `--basePath=` argument is given.
const DEFAULT_RUNS_BASE_PATH = '/Users/connor/Github/vscode-copilot-evaluation/.msbenchRun';

// Returns the text following `--<name>=` in the first matching argument, or
// undefined when no such argument is present. Everything after the first '='
// is kept, so values may themselves contain '='.
function readCliOption(args: string[], name: string): string | undefined {
	const prefix = `--${name}=`;
	const match = args.find(arg => arg.startsWith(prefix));
	return match === undefined ? undefined : match.slice(prefix.length);
}

// Script entry point.
//
// Command line:
//   --runId=<id>      run to analyze; when omitted the user is prompted.
//   --basePath=<dir>  directory containing the run directories; defaults to
//                     DEFAULT_RUNS_BASE_PATH.
//
// Analyzes the selected run, prints a short summary and writes
// `edit-analysis.html` into the run directory. Exits with status 1 when no
// run can be selected.
async function main(): Promise<void> {
	const args = process.argv.slice(2);

	// An empty --basePath= falls back to the default rather than the current directory.
	const basePath = path.resolve(readCliOption(args, 'basePath') || DEFAULT_RUNS_BASE_PATH);
	const runIdArg = readCliOption(args, 'runId');

	let runId: string;

	if (runIdArg !== undefined) {
		if (runIdArg === '') {
			// An empty id would make the run path equal to the base path.
			console.error('--runId requires a value, e.g. --runId=12345');
			process.exit(1);
		}
		runId = runIdArg;
		console.log(`Using run ID: ${runId}`);
	} else {
		const runs = await listRuns(basePath);
		if (runs.length === 0) {
			console.error('No test runs found in', basePath);
			process.exit(1);
		}
		runId = await promptUserForRun(runs);
		console.log(`Selected run: ${runId}`);
	}

	console.log('\nAnalyzing run...');
	const analysis = await analyzeRun(runId, basePath);

	console.log(`\nFound ${analysis.conversations.length} conversations with edits`);
	console.log(`Total edits: ${analysis.totalEdits}`);
	console.log(`Success rate: ${(analysis.successRate * 100).toFixed(1)}%`);

	const outputPath = path.join(basePath, runId, 'edit-analysis.html');
	const html = generateHTML(analysis, outputPath);

	await fs.writeFile(outputPath, html, 'utf-8');
	console.log(`\n✓ Analysis complete! Generated: ${outputPath}`);
}

// Run the script. A failure is logged and reflected in the exit status so
// that shells and CI can detect it; exitCode (rather than exit()) lets
// pending output flush before the process ends.
main().catch(error => {
	console.error(error);
	process.exitCode = 1;
});