Skip to content

Commit 9023524

Browse files
committed
feat(github): add GitHubRepoIndexer for RAG codebase chunking
Background indexer that walks GitHub repo trees and produces structured IndexedChunk arrays for vector-store ingestion. Extracts four chunk types per repo: metadata, filtered directory tree, heading-split documentation files, and package.json summaries. Includes indexEcosystem() for batch indexing of 6 default ecosystem repos with per-repo error isolation.
1 parent 0680c76 commit 9023524

File tree

2 files changed

+606
-0
lines changed

2 files changed

+606
-0
lines changed
Lines changed: 313 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
/**
2+
* @fileoverview Background codebase indexer that produces chunks from GitHub repos
3+
* for RAG embedding. Walks repo trees, extracts documentation files, splits them
4+
* by markdown headings, and returns structured IndexedChunk arrays suitable for
5+
* vector-store ingestion.
6+
*/
7+
8+
import type { GitHubService } from './GitHubService.js';
9+
10+
/* ------------------------------------------------------------------ */
11+
/* Types */
12+
/* ------------------------------------------------------------------ */
13+
14+
/** A single indexable text chunk extracted from a GitHub repository. */
15+
export interface IndexedChunk {
16+
/** Human-readable heading identifying this chunk (e.g. `github:owner/repo:metadata`). */
17+
heading: string;
18+
/** The chunk text content, ready for embedding. */
19+
content: string;
20+
/** Source path within the repo (or a virtual path like `:metadata`). */
21+
sourcePath: string;
22+
}
23+
24+
/** Summary result returned after indexing a single repository. */
25+
export interface IndexResult {
26+
/** Full `owner/repo` slug. */
27+
repo: string;
28+
/** All chunks extracted from the repository. */
29+
chunks: IndexedChunk[];
30+
/** Number of individual files whose content was fetched. */
31+
filesScanned: number;
32+
/** Total number of entries in the repo tree (before filtering). */
33+
treeSize: number;
34+
/** Wall-clock duration of the indexing run in milliseconds. */
35+
durationMs: number;
36+
}
37+
38+
/* ------------------------------------------------------------------ */
39+
/* Constants */
40+
/* ------------------------------------------------------------------ */
41+
42+
/**
43+
* Default ecosystem repos to index when `indexEcosystem()` is called
44+
* without arguments.
45+
*/
46+
export const ECOSYSTEM_REPOS: readonly { owner: string; repo: string }[] = [
47+
{ owner: 'framersai', repo: 'agentos' },
48+
{ owner: 'jddunn', repo: 'wunderland' },
49+
{ owner: 'framersai', repo: 'agentos-live-docs' },
50+
{ owner: 'jddunn', repo: 'wunderland-live-docs' },
51+
{ owner: 'framersai', repo: 'agentos-skills-registry' },
52+
{ owner: 'framersai', repo: 'agentos-extensions' },
53+
] as const;
54+
55+
/**
56+
* Directories to skip when walking the repo tree.
57+
* These are typically build artifacts, caches, or vendored dependencies.
58+
*/
59+
export const SKIP_DIRS: ReadonlySet<string> = new Set([
60+
'node_modules',
61+
'dist',
62+
'build',
63+
'.git',
64+
'.next',
65+
'coverage',
66+
'__pycache__',
67+
'.turbo',
68+
]);
69+
70+
/**
71+
* Regex matching file paths that should be treated as indexable documentation.
72+
* Matches README.md, CONTRIBUTING.md, CHANGELOG.md, any .md file under docs/,
73+
* and any nested README.md.
74+
*/
75+
export const DOC_PATTERN: RegExp =
76+
/(?:^|\/)(?:README\.md|CONTRIBUTING\.md|CHANGELOG\.md)$|(?:^|\/)docs\/[^/]+\.md$|(?:^\/?)[\w.-]+\/README\.md$/i;
77+
78+
/* ------------------------------------------------------------------ */
79+
/* Limits */
80+
/* ------------------------------------------------------------------ */
81+
82+
/** Maximum number of doc files to fetch per repo — caps GitHub API calls on doc-heavy repos. */
const MAX_DOC_FILES = 20;

/** Maximum number of heading-delimited sections to keep per doc file — bounds chunk count for long READMEs. */
const MAX_SECTIONS_PER_FILE = 5;

/** Maximum character length for any single chunk's content — oversized text is truncated, keeping chunks within typical embedding-input limits. */
const MAX_CHUNK_CHARS = 6_000;
90+
91+
/* ------------------------------------------------------------------ */
92+
/* GitHubRepoIndexer */
93+
/* ------------------------------------------------------------------ */
94+
95+
/**
96+
* Indexes GitHub repositories into structured text chunks for RAG embedding.
97+
*
98+
* The indexer produces four kinds of chunks per repo:
99+
* 1. **Metadata** — repo description, stars, language, topics, etc.
100+
* 2. **Directory tree** — filtered file listing (skipping SKIP_DIRS).
101+
* 3. **Documentation chunks** — README, CONTRIBUTING, CHANGELOG, docs/*.md
102+
* split by h1-h3 headings.
103+
* 4. **package.json** — name, version, description, dependencies, scripts.
104+
*/
105+
export class GitHubRepoIndexer {
106+
private readonly service: GitHubService;
107+
108+
constructor(service: GitHubService) {
109+
this.service = service;
110+
}
111+
112+
/* ---------------------------------------------------------------- */
113+
/* Public API */
114+
/* ---------------------------------------------------------------- */
115+
116+
/**
117+
* Index all six default ecosystem repos.
118+
* Errors on individual repos are caught so one failure doesn't abort the batch.
119+
*
120+
* @returns An IndexResult for every repo (failed repos have zero chunks).
121+
*/
122+
async indexEcosystem(): Promise<IndexResult[]> {
123+
const results: IndexResult[] = [];
124+
for (const { owner, repo } of ECOSYSTEM_REPOS) {
125+
try {
126+
const result = await this.indexRepo(owner, repo);
127+
results.push(result);
128+
} catch (err) {
129+
results.push({
130+
repo: `${owner}/${repo}`,
131+
chunks: [],
132+
filesScanned: 0,
133+
treeSize: 0,
134+
durationMs: 0,
135+
});
136+
}
137+
}
138+
return results;
139+
}
140+
141+
/**
142+
* Index a single GitHub repository.
143+
*
144+
* @param owner - Repository owner (user or org).
145+
* @param repo - Repository name.
146+
* @param branch - Optional branch/tag/SHA (defaults to the repo's default branch).
147+
* @returns An IndexResult containing all extracted chunks.
148+
*/
149+
async indexRepo(owner: string, repo: string, branch?: string): Promise<IndexResult> {
150+
const t0 = Date.now();
151+
const slug = `${owner}/${repo}`;
152+
const chunks: IndexedChunk[] = [];
153+
let filesScanned = 0;
154+
155+
const octokit = this.service.getOctokit();
156+
157+
/* 1 — Metadata chunk ------------------------------------------- */
158+
const { data: meta } = await octokit.rest.repos.get({ owner, repo });
159+
const metaLines = [
160+
`Repository: ${meta.full_name}`,
161+
meta.description ? `Description: ${meta.description}` : '',
162+
`Language: ${meta.language ?? 'N/A'}`,
163+
`Stars: ${meta.stargazers_count} Forks: ${meta.forks_count}`,
164+
`Default branch: ${meta.default_branch}`,
165+
`License: ${meta.license?.spdx_id ?? 'N/A'}`,
166+
meta.topics && meta.topics.length > 0 ? `Topics: ${meta.topics.join(', ')}` : '',
167+
`URL: ${meta.html_url}`,
168+
].filter(Boolean);
169+
170+
chunks.push({
171+
heading: `github:${slug}:metadata`,
172+
content: metaLines.join('\n'),
173+
sourcePath: ':metadata',
174+
});
175+
176+
/* Determine ref to use for tree/file fetches -------------------- */
177+
const ref = branch ?? meta.default_branch;
178+
179+
/* 2 — Directory tree chunk -------------------------------------- */
180+
const { data: treeData } = await octokit.rest.git.getTree({
181+
owner,
182+
repo,
183+
tree_sha: ref,
184+
recursive: 'true',
185+
});
186+
const treeEntries = treeData.tree;
187+
const treeSize = treeEntries.length;
188+
189+
const filteredPaths = treeEntries
190+
.filter((entry) => {
191+
const parts = (entry.path ?? '').split('/');
192+
return !parts.some((part) => SKIP_DIRS.has(part));
193+
})
194+
.map((entry) => entry.path ?? '');
195+
196+
const treeListing = filteredPaths.join('\n').slice(0, MAX_CHUNK_CHARS);
197+
chunks.push({
198+
heading: `github:${slug}:tree`,
199+
content: treeListing,
200+
sourcePath: ':tree',
201+
});
202+
203+
/* 3 — Documentation file chunks --------------------------------- */
204+
const docPaths = filteredPaths
205+
.filter((p) => DOC_PATTERN.test(p))
206+
.slice(0, MAX_DOC_FILES);
207+
208+
for (const docPath of docPaths) {
209+
try {
210+
const { data: fileData } = await octokit.rest.repos.getContent({
211+
owner,
212+
repo,
213+
path: docPath,
214+
ref,
215+
});
216+
217+
if (Array.isArray(fileData) || fileData.type !== 'file' || !('content' in fileData)) {
218+
continue;
219+
}
220+
221+
const content = Buffer.from(fileData.content, 'base64').toString('utf8');
222+
filesScanned++;
223+
224+
const sections = this.splitByHeadings(content).slice(0, MAX_SECTIONS_PER_FILE);
225+
for (const section of sections) {
226+
chunks.push({
227+
heading: `github:${slug}:${docPath}`,
228+
content: section.slice(0, MAX_CHUNK_CHARS),
229+
sourcePath: docPath,
230+
});
231+
}
232+
} catch {
233+
// File may have been deleted or is inaccessible — skip silently.
234+
}
235+
}
236+
237+
/* 4 — package.json chunk ---------------------------------------- */
238+
try {
239+
const { data: pkgData } = await octokit.rest.repos.getContent({
240+
owner,
241+
repo,
242+
path: 'package.json',
243+
ref,
244+
});
245+
246+
if (!Array.isArray(pkgData) && pkgData.type === 'file' && 'content' in pkgData) {
247+
const raw = Buffer.from(pkgData.content, 'base64').toString('utf8');
248+
filesScanned++;
249+
const pkg = JSON.parse(raw);
250+
const pkgLines = [
251+
pkg.name ? `name: ${pkg.name}` : '',
252+
pkg.version ? `version: ${pkg.version}` : '',
253+
pkg.description ? `description: ${pkg.description}` : '',
254+
pkg.dependencies
255+
? `dependencies: ${Object.keys(pkg.dependencies).join(', ')}`
256+
: '',
257+
pkg.scripts
258+
? `scripts: ${Object.keys(pkg.scripts).join(', ')}`
259+
: '',
260+
].filter(Boolean);
261+
262+
chunks.push({
263+
heading: `github:${slug}:package.json`,
264+
content: pkgLines.join('\n').slice(0, MAX_CHUNK_CHARS),
265+
sourcePath: 'package.json',
266+
});
267+
}
268+
} catch {
269+
// No package.json in repo — that's fine.
270+
}
271+
272+
return {
273+
repo: slug,
274+
chunks,
275+
filesScanned,
276+
treeSize,
277+
durationMs: Date.now() - t0,
278+
};
279+
}
280+
281+
/* ---------------------------------------------------------------- */
282+
/* Private helpers */
283+
/* ---------------------------------------------------------------- */
284+
285+
/**
286+
* Split markdown content by h1-h3 headings (`#`, `##`, `###`).
287+
* Each returned string includes the heading line that starts it.
288+
* Content before the first heading (if any) is returned as the first element.
289+
*
290+
* @param content - Raw markdown text.
291+
* @returns Array of sections, each starting at a heading boundary.
292+
*/
293+
splitByHeadings(content: string): string[] {
294+
const headingRe = /^#{1,3}\s/m;
295+
const lines = content.split('\n');
296+
const sections: string[] = [];
297+
let current: string[] = [];
298+
299+
for (const line of lines) {
300+
if (headingRe.test(line) && current.length > 0) {
301+
sections.push(current.join('\n'));
302+
current = [];
303+
}
304+
current.push(line);
305+
}
306+
307+
if (current.length > 0) {
308+
sections.push(current.join('\n'));
309+
}
310+
311+
return sections;
312+
}
313+
}

0 commit comments

Comments
 (0)