Files
Federico Brancasi c33e376aa0 Allow invoking simulationMain with alternative action input (#4304)
* Allow invoking simulationMain with alternative action input

* Address review comments: rename CLI opts, extract pipeline, fix correctness issues

- Rename CLI options with --train- prefix (--train-input, --train-strategy,
  --train-out, --train-row-offset, --train-worker) and document all options
- Extract runInputPipeline/runInputPipelineParallel to test/pipeline/trainPipeline.ts
- Preserve original row index through parse/replay/prompt pipeline to fix
  sample numbering drift when rows are filtered out
- Fix parseSuggestedEdit: use JSON.parse for escaped text, handle missing delimiter
- Fix line number regex to accept optional space after | (WithoutSpace format)
- Clamp concurrency to >= 1, type samples as ISample[], wrap dispose in try/finally
- Gate verbose logging in loadAndParseInput behind verbose flag
- Use splitLines from existing utility instead of local duplicate

* move nes-datagen to a subcommand

* more code reuse around setting promptStrategy and model config

* Address review: use ResponseFormat, Limiter, assertNever, and raw messages

* minor refactor runPipeline

* finalize

* use POT instead of custom code

* move files from script/ to test/pipeline/

---------

Co-authored-by: ulugbekna <ulugbekna@gmail.com>
2026-03-30 15:44:46 +00:00

166 lines
5.6 KiB
TypeScript

/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
import csvParse from 'csv-parse';
import * as fs from 'fs/promises';
import minimist from 'minimist';
import { IAlternativeAction } from '../../src/extension/inlineEdits/node/nextEditProviderTelemetry';
import { coalesce } from '../../src/util/vs/base/common/arrays';
import { Processor } from '../../test/pipeline/alternativeAction/processor';
import { IData, Scoring } from '../../test/pipeline/alternativeAction/types';
import { Either, log } from '../../test/pipeline/alternativeAction/util';
/**
 * Parses CSV text whose rows carry a JSON document in a `Data` column and builds a scoring per row.
 *
 * @param csvContents Raw CSV text; the first row is used as the header row.
 * @returns One entry per parsed row, in row order. An entry is `undefined` when the row's
 * alternative action is missing or has no recording; otherwise it is the result of
 * `Processor.createScoringForAlternativeAction` for that row (always with `isAccepted = false`).
 * @throws Rejects with the CSV parser's error, with `Invalid CSV format` when a row lacks a string
 * `Data` field, or with the `JSON.parse` error when a `Data` value is not valid JSON.
 */
async function extractFromCsv(csvContents: string): Promise<(Scoring.t | undefined)[]> {
	type CsvRecord = { Data: string };

	const options = {
		columns: true as const, // first row supplies the column names
		delimiter: ',', // comma-separated fields
		quote: '"', // fields are quoted with double quotes
		escape: '"', // a doubled quote escapes a quote
		skip_empty_lines: true, // ignore blank rows
		trim: true, // strip whitespace around fields
		relax_quotes: true, // tolerate quotes inside fields
		bom: true, // tolerate a leading UTF-8 BOM
		cast: false // leave every value as a string
	} as const;

	// A usable row is an object exposing a string `Data` field.
	const isCsvRecord = (row: any) => typeof row === 'object' && 'Data' in row && typeof row['Data'] === 'string';

	const records = await new Promise<CsvRecord[]>((resolve, reject) => {
		csvParse.parse<CsvRecord>(csvContents, options, (err, rows) => {
			if (err) {
				reject(err);
				return;
			}
			if (!rows.every(isCsvRecord)) {
				reject(new Error('Invalid CSV format'));
				return;
			}
			resolve(rows);
		});
	});

	// Decode every row first so that a malformed `Data` payload fails before any scoring is attempted.
	const payloads = records.map(record => JSON.parse(record.Data) as IData);

	return payloads.map((payload: IData) => {
		const altAction: IAlternativeAction = payload.altAction;
		if (altAction && altAction.recording) {
			const edits = coalesce([parseSuggestedEdit(payload.postProcessingOutcome.suggestedEdit)]);
			return Processor.createScoringForAlternativeAction(altAction, edits, false);
		}
		return undefined;
	});
}
/**
 * Starts writing the two pretty-printed JSON artifacts for a scoring.
 *
 * @param basename Path prefix; `.scoredEdits.w.json` and `.recording.w.json` are appended to it.
 * @param scoring The scoring to serialize; its `scoringContext.recording` goes into the second file.
 * @returns The two pending write promises (scored edits first, recording second). They are not awaited here.
 */
function writeFiles(basename: string, scoring: Scoring.t) {
	const scoredEditsWrite = fs.writeFile(`${basename}.scoredEdits.w.json`, JSON.stringify(scoring, null, 2));
	const recordingWrite = fs.writeFile(`${basename}.recording.w.json`, JSON.stringify(scoring.scoringContext.recording, null, 2));
	return [scoredEditsWrite, recordingWrite];
}
/**
 * Reads a CSV file, extracts a scoring per row and writes the output files for every row that produced one.
 *
 * Output files are named after the row's position in the extracted list (`0`, `1`, ...), so skipped
 * rows leave gaps in the numbering. Errors raised while writing are logged and not rethrown.
 *
 * @param inputFilePath Path of the CSV file to read.
 */
async function handleCsv(inputFilePath: string) {
	log('Handling CSV file:', inputFilePath);
	const csvContents = await fs.readFile(inputFilePath, 'utf8');
	log('CSV contents read, length:', csvContents.length);

	const scorings = await extractFromCsv(csvContents);
	let scoredCount = 0;
	for (const scoring of scorings) {
		if (scoring) {
			scoredCount++;
		}
	}
	log('Extraction complete, number of scored edits:', scoredCount);

	try {
		const pendingWrites: Promise<void>[] = [];
		scorings.forEach((scoring: Scoring.t | undefined, rowIdx: number) => {
			if (scoring) {
				pendingWrites.push(...writeFiles(rowIdx.toString(), scoring));
			}
		});
		await Promise.all(pendingWrites);
		log('All files written successfully');
	} catch (e) {
		log('Error writing files:', e);
	}
}
/**
 * Parses the contents of an input JSON file and classifies the payload.
 *
 * @param fileContents Raw text of the file.
 * @returns `Either.left` when the parsed object has a `prompt` property (treated as `IData`),
 * `Either.right` for any other JSON object (treated as a bare `IAlternativeAction`), or
 * `undefined` when the text is not valid JSON or the parsed value is not a plain JSON object.
 * The object's shape is not validated beyond the `prompt` check.
 */
function parseFile(fileContents: string): Either<IData, IAlternativeAction> | undefined {
	let parsedObj: unknown;
	try {
		parsedObj = JSON.parse(fileContents);
	} catch (e) {
		console.error('Failed to parse JSON:', e);
		return undefined;
	}
	// `null`, primitives and arrays are valid JSON but can never be an IData / IAlternativeAction;
	// reject them here instead of casting them and failing later when a property is read.
	if (typeof parsedObj !== 'object' || parsedObj === null || Array.isArray(parsedObj)) {
		console.error('Parsed JSON is not an object');
		return undefined;
	}
	if ('prompt' in parsedObj) {
		return Either.left(parsedObj as IData);
	}
	return Either.right(parsedObj as IAlternativeAction);
}
/**
 * Reads a single JSON file (either a full `IData` record or a bare alternative action), builds a
 * scoring from it and writes the output files next to the input.
 *
 * For an `IData` record the suggested edit (when it parses) is passed along and acceptance is taken
 * from `suggestionStatus === 'accepted'`; for a bare alternative action no edits are passed and
 * `isAccepted` is `false`.
 *
 * @param inputFilePath Path of the JSON file to read. A trailing `.json` is stripped to form the
 * prefix of the output file names.
 */
async function handleAlternativeActionJson(inputFilePath: string) {
	log('Handling alternative action JSON file:', inputFilePath);
	const fileContents = await fs.readFile(inputFilePath, 'utf8');
	log('File contents read, length:', fileContents.length);

	const obj = parseFile(fileContents);
	if (!obj) {
		console.error('Failed to parse alternative action JSON file');
		return;
	}

	const altAction = obj.isLeft() ? obj.value.altAction : obj.value;
	const edits: [start: number, endEx: number, text: string][] = [];
	let isAccepted = false;
	if (obj.isLeft()) {
		const data = obj.value;
		const parsedEdit = parseSuggestedEdit(data.postProcessingOutcome.suggestedEdit);
		if (parsedEdit) {
			edits.push(parsedEdit);
		}
		isAccepted = data.suggestionStatus === 'accepted';
	}

	const scoring = Processor.createScoringForAlternativeAction(altAction, edits, isAccepted);
	if (!scoring) {
		console.error('Failed to create scoring from alternative action');
		return;
	}

	const basename = inputFilePath.replace(/\.json$/, '');
	await Promise.all(writeFiles(basename, scoring));
	// Report the paths that were actually written; the suffixes must stay in sync with writeFiles.
	log('Scoring written to:', `${basename}.scoredEdits.w.json`);
	log('Recording written to:', `${basename}.recording.w.json`);
}
/**
 * Parses a stringified edit of the form `[start, endEx) -> "text"`.
 *
 * Only the first ` -> ` is treated as the separator, so replacement text that itself contains
 * ` -> ` is preserved in full.
 *
 * @param suggestedEditStr The stringified edit.
 * @returns `[start, endEx, text]`, or `null` when the separator is missing or the range part does
 * not match `[<digits>, <digits>)`.
 */
function parseSuggestedEdit(suggestedEditStr: string): [number, number, string] | null {
	const separator = ' -> ';
	const separatorIdx = suggestedEditStr.indexOf(separator);
	if (separatorIdx === -1) {
		return null;
	}

	const stringifiedRange = suggestedEditStr.slice(0, separatorIdx);
	const quotedText = suggestedEditStr.slice(separatorIdx + separator.length);

	const match = /^\[(\d+), (\d+)\)$/.exec(stringifiedRange);
	if (!match) {
		return null;
	}

	const start = parseInt(match[1], 10);
	const endEx = parseInt(match[2], 10);
	// Drops the first and last character, i.e. the surrounding quotes. Escape sequences inside the
	// text are left as-is. NOTE(review): confirm whether the producer JSON-escapes this text.
	const text = quotedText.slice(1, -1);
	return [start, endEx, text];
}
/**
 * CLI entry point.
 *
 * Options:
 * - `--path` / `-p` (string, required): input file. When absent, an error is printed and the
 *   process exits with code 1.
 * - `--csv` / `-c` (boolean): treat the input as a CSV file; otherwise it is handled as a single
 *   alternative action JSON file.
 * - `--single` / `-s` (boolean): parsed but not read by this function.
 */
async function main() {
	const cliArgs = minimist(process.argv.slice(2), {
		alias: {
			p: 'path',
			s: 'single',
			c: 'csv'
		},
		boolean: ['single', 'csv'],
		string: ['path']
	});

	const inputFilePath = cliArgs.path;
	if (!inputFilePath) {
		console.error('Please provide a path to an alternative action JSON file using --path or -p');
		process.exit(1);
	}

	const handleInput = cliArgs.csv ? handleCsv : handleAlternativeActionJson;
	await handleInput(inputFilePath);
}
// Run the CLI. Any rejection from main() (e.g. an unreadable input file) is reported here and
// turned into a failing exit code rather than being left on a floating promise.
main().catch((err: unknown) => {
	console.error(err);
	process.exitCode = 1;
});