/**
 * Download RepoEval benchmark data
 *
 * Downloads:
 * 1. Task datasets (queries, ground truth) from Microsoft CodeT repo
 * 2. Function-level Python repositories for chunking
 */

import { existsSync } from 'node:fs'
import { mkdir, unlink, writeFile } from 'node:fs/promises'
import { join } from 'node:path'

const DATA_DIR = join(import.meta.dir, 'data', 'repoeval')
const DATASETS_DIR = join(DATA_DIR, 'datasets')
const REPOS_DIR = join(DATA_DIR, 'repositories', 'function_level')

// Function-level repositories from RepoEval
const REPOS_FUNCTION = [
  'amazon-science_patchcore-inspection',
  'deepmind_tracr',
  'facebookresearch_omnivore',
  'google_lightweight_mmm',
  'lucidrains_imagen-pytorch',
  'maxhumber_redframes',
]

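/**
 * Fetch a zip archive from `url` and extract it into `destDir` with the
 * system `unzip` command (the archive is written to a temporary file first).
 */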
async function downloadAndExtractZip(
  url: string,
  destDir: string,
): Promise<void> {
  console.log(`Downloading from ${url}...`)

  const response = await fetch(url)
  if (!response.ok) {
    throw new Error(`Failed to download ${url}: ${response.status} ${response.statusText}`)
  }

  const arrayBuffer = await response.arrayBuffer()
  const tempZipPath = join(destDir, '_temp.zip')

  await mkdir(destDir, { recursive: true })
  await writeFile(tempZipPath, new Uint8Array(arrayBuffer))

  // Extract with the system unzip command (-o: overwrite, -q: quiet)
  const proc = Bun.spawn(['unzip', '-o', '-q', tempZipPath, '-d', destDir])
  const exitCode = await proc.exited
  if (exitCode !== 0) {
    throw new Error(`unzip exited with code ${exitCode} for ${tempZipPath}`)
  }

  // Clean up the temporary archive
  await unlink(tempZipPath)

  console.log(`Extracted to ${destDir}`)
}

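/**
 * Download the RepoCoder task datasets (JSONL files with queries and ground
 * truth) from the Microsoft CodeT repository, unless already present.
 */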
async function downloadDatasets(): Promise<void> {
  if (existsSync(DATASETS_DIR)) {
    console.log('Datasets already downloaded, skipping...')
    return
  }

  const datasetsUrl =
    'https://github.com/microsoft/CodeT/raw/main/RepoCoder/datasets/datasets.zip'
  await downloadAndExtractZip(datasetsUrl, DATASETS_DIR)
}

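/**
 * Download the function-level Python repositories used for chunking, unless
 * already present.
 */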
async function downloadRepositories(): Promise<void> {
  if (existsSync(REPOS_DIR)) {
    console.log('Repositories already downloaded, skipping...')
    return
  }

  // Using the cleaned version from Veronicium's fork
  const reposUrl =
    'https://github.com/Veronicium/repoeval_debug/raw/main/function_level.zip'
  await downloadAndExtractZip(reposUrl, REPOS_DIR)
}

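/** A single function-level completion task from the RepoEval JSONL datasets. */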
export interface RepoEvalTask {
  prompt: string
  metadata: {
    task_id: string
    ground_truth: string
    fpath_tuple: string[]
    /** Normalized copy of `lineno`, set by loadTasks(). */
    line_no: number
    /** Line number as stored in the raw JSONL. */
    lineno: number
    context_start_lineno: number
  }
}

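/**
 * Load function-level completion tasks for the given prompt context length,
 * keeping only tasks from REPOS_FUNCTION and normalizing their task_ids.
 */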
export async function loadTasks(
  contextLength: '1k' | '2k' | '4k' = '2k',
): Promise<RepoEvalTask[]> {
  const fileName = `function_level_completion_${contextLength}_context_codex.test.jsonl`
  const filePath = join(DATASETS_DIR, fileName)

  const content = await Bun.file(filePath).text()
  const lines = content.trim().split('\n')

  const tasks: RepoEvalTask[] = []
  const repo2idx: Record<string, number> = {}

  for (const line of lines) {
    const task = JSON.parse(line) as RepoEvalTask

    // The raw task_id starts with "<owner>--<repo>"; normalize the separator
    // so the repo name matches the entries in REPOS_FUNCTION
    const repo = task.metadata.task_id.replace('--', '_').split('/')[0]
    if (!REPOS_FUNCTION.includes(repo)) continue

    if (!(repo in repo2idx)) {
      repo2idx[repo] = 0
    }

    // Normalize the task_id and replace its "idx" placeholder with a
    // running per-repo index
    task.metadata.task_id = task.metadata.task_id
      .replace('--', '_')
      .replace('idx', String(repo2idx[repo]))
    task.metadata.line_no = task.metadata.lineno
    repo2idx[repo]++

    tasks.push(task)
  }

  return tasks
}

/** Directory containing the extracted function-level repositories. */
export function getReposDir(): string {
  return REPOS_DIR
}

/** Names of the function-level repositories used in the benchmark. */
export function getRepos(): string[] {
  return REPOS_FUNCTION
}

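/** Download both the task datasets and the repositories into DATA_DIR. */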
export async function download(): Promise<void> {
  console.log('Downloading RepoEval benchmark data...\n')

  await mkdir(DATA_DIR, { recursive: true })

  await downloadDatasets()
  await downloadRepositories()

  console.log('\nDownload complete!')
  console.log(`Data stored in: ${DATA_DIR}`)
}

// Run if executed directly
if (import.meta.main) {
  await download()
}
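
// Example usage (the file name below is an assumption; adjust it to this
// script's actual path):
//
//   bun run download-repoeval.ts
//
// or from another module:
//
//   import { download, loadTasks } from './download-repoeval'
//   await download()
//   const tasks = await loadTasks('2k')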