
Commit 93d1756

Author: Calvinn Ng

update files dependent on updated interfaces

Parent: 9016e66
39 files changed: +1714 −570 lines

core/commands/slash/edit.ts

Lines changed: 1 addition & 1 deletion
@@ -480,7 +480,7 @@ const EditSlashCommand: SlashCommand = {
     lineStream = filterEnglishLinesAtStart(lineStream);

     lineStream = filterEnglishLinesAtEnd(filterCodeBlockLines(lineStream));
-    lineStream = stopAtLines(lineStream);
+    lineStream = stopAtLines(lineStream, () => {});

     generator = streamWithNewLines(
       fixCodeLlamaFirstLineIndentation(lineStream),
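
Note: this change only adapts the call site to the updated `stopAtLines` interface, which now takes a callback as its second argument; the edit command passes a no-op. A minimal sketch of what such a generator could look like (the stop-word list and the callback's exact role are assumptions, not shown in this commit):

    // Hypothetical sketch of the updated stopAtLines interface.
    // STOP_LINES and the callback semantics are assumed for illustration.
    const STOP_LINES = ["```", "[/CODE]"];

    async function* stopAtLines(
      lines: AsyncGenerator<string>,
      fn: () => void, // assumed: invoked when a stop line cuts the stream short
    ): AsyncGenerator<string> {
      for await (const line of lines) {
        if (STOP_LINES.some((stop) => line.trim() === stop)) {
          fn(); // notify the caller before terminating
          return;
        }
        yield line;
      }
    }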

core/context/rerankers/index.ts

Lines changed: 6 additions & 4 deletions
@@ -1,9 +1,11 @@
-import { RerankerName } from "../..";
-import { FreeTrialReranker } from "./freeTrial";
-import { LLMReranker } from "./llm";
-import { VoyageReranker } from "./voyage";
+import { RerankerName } from "../../index.js";
+import { CohereReranker } from "./cohere.js";
+import { FreeTrialReranker } from "./freeTrial.js";
+import { LLMReranker } from "./llm.js";
+import { VoyageReranker } from "./voyage.js";

 export const AllRerankers: { [key in RerankerName]: any } = {
+  cohere: CohereReranker,
   llm: LLMReranker,
   voyage: VoyageReranker,
   "free-trial": FreeTrialReranker,

core/indexing/CodeSnippetsIndex.ts

Lines changed: 4 additions & 3 deletions
@@ -8,7 +8,7 @@ import {
   IndexTag,
   IndexingProgressUpdate,
 } from "..";
-import { getBasename } from "../util";
+import { getBasename, getLastNPathParts } from "../util/index.js";
 import {
   getLanguageForFile,
   getParserForFile,
@@ -132,6 +132,7 @@ export class CodeSnippetsCodebaseIndex implements CodebaseIndex {
       yield {
         desc: `Indexing ${compute.path}`,
         progress: i / results.compute.length,
+        status: "indexing",
       };
       markComplete([compute], IndexResultType.Compute);
     }
@@ -187,7 +188,7 @@ export class CodeSnippetsCodebaseIndex implements CodebaseIndex {

     return {
       name: row.title,
-      description: getBasename(row.path, 2),
+      description: getLastNPathParts(row.path, 2),
       content: `\`\`\`${getBasename(row.path)}\n${row.content}\n\`\`\``,
     };
   }
@@ -207,7 +208,7 @@ export class CodeSnippetsCodebaseIndex implements CodebaseIndex {

     return rows.map((row) => ({
       title: row.title,
-      description: getBasename(row.path, 2),
+      description: getLastNPathParts(row.path, 2),
       id: row.id.toString(),
     }));
   } catch (e) {
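
Note: the old code passed a second argument to `getBasename`, which returns only the final path segment; `getLastNPathParts` names the actual intent of keeping the last two segments. A plausible implementation of the helper (assumed; the commit only shows the call sites):

    // Assumed sketch: keep the last n segments of a file path.
    function getLastNPathParts(filepath: string, n: number): string {
      return filepath.split(/[\\/]/).slice(-n).join("/");
    }

    // getLastNPathParts("core/indexing/CodeSnippetsIndex.ts", 2)
    //   -> "indexing/CodeSnippetsIndex.ts"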

core/indexing/FullTextSearch.ts

Lines changed: 1 addition & 0 deletions
@@ -61,6 +61,7 @@ export class FullTextSearchCodebaseIndex implements CodebaseIndex {
       yield {
         progress: i / results.compute.length,
         desc: `Indexing ${item.path}`,
+        status: "indexing",
       };
       markComplete([item], IndexResultType.Compute);
     }

core/indexing/LanceDbIndex.ts

Lines changed: 6 additions & 2 deletions
@@ -247,7 +247,7 @@ export class LanceDbIndex implements CodebaseIndex {
           data.contents,
         );

-        yield { progress, desc };
+        yield { progress, desc, status: "indexing" };
       } else {
         await addComputedLanceDbRows(update, computedRows);
         computedRows = [];
@@ -301,7 +301,11 @@ export class LanceDbIndex implements CodebaseIndex {
     }

     markComplete(results.del, IndexResultType.Delete);
-    yield { progress: 1, desc: "Completed Calculating Embeddings" };
+    yield {
+      progress: 1,
+      desc: "Completed Calculating Embeddings",
+      status: "done",
+    };
   }

   private async _retrieveForTag(

core/indexing/chunk/ChunkCodebaseIndex.ts

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ export class ChunkCodebaseIndex implements CodebaseIndex {
       yield {
         progress: i / results.compute.length,
         desc: `Chunking ${getBasename(item.path)}`,
+        status: "indexing",
       };
       markComplete([item], IndexResultType.Compute);
     }
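
Note: the four indexing classes above all attach a `status` field to the progress updates they yield, which implies the `IndexingProgressUpdate` interface gained a status union. A shape inferred from the values this commit actually uses ("indexing" and "done"); any further states would be assumptions:

    // Inferred from this commit's yield sites; not the verbatim declaration.
    interface IndexingProgressUpdate {
      progress: number; // fraction of items processed, 0..1
      desc: string; // human-readable description of the current step
      status: "indexing" | "done";
    }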

core/indexing/docs/crawl.ts

Lines changed: 78 additions & 44 deletions
@@ -1,7 +1,7 @@
 import { Octokit } from "@octokit/rest";
 import cheerio from "cheerio";
 import fetch from "node-fetch";
-import { URL } from "url";
+import { URL } from "node:url";

 const IGNORE_PATHS_ENDING_IN = [
   "favicon.ico",
@@ -18,24 +18,33 @@ const IGNORE_PATHS_ENDING_IN = [

 const GITHUB_PATHS_TO_TRAVERSE = ["/blob/", "/tree/"];

+async function getDefaultBranch(owner: string, repo: string): Promise<string> {
+  const octokit = new Octokit({ auth: undefined });
+
+  const repoInfo = await octokit.repos.get({
+    owner,
+    repo,
+  });
+
+  return repoInfo.data.default_branch;
+}
+
 async function crawlGithubRepo(baseUrl: URL) {
   const octokit = new Octokit({
     auth: undefined,
   });

   const [_, owner, repo] = baseUrl.pathname.split("/");

-  let dirContentsConfig = {
-    owner: owner,
-    repo: repo,
-  };
+  const branch = await getDefaultBranch(owner, repo);
+  console.log("Github repo detected. Crawling", branch, "branch");

   const tree = await octokit.request(
     "GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
     {
       owner,
       repo,
-      tree_sha: "main",
+      tree_sha: branch,
       headers: {
         "X-GitHub-Api-Version": "2022-11-28",
       },
@@ -44,8 +53,8 @@ async function crawlGithubRepo(baseUrl: URL) {
   );

   const paths = tree.data.tree
-    .filter((file) => file.type === "blob" && file.path?.endsWith(".md"))
-    .map((file) => baseUrl.pathname + "/tree/main/" + file.path);
+    .filter((file: any) => file.type === "blob" && file.path?.endsWith(".md"))
+    .map((file: any) => baseUrl.pathname + "/tree/main/" + file.path);

   return paths;
 }
@@ -54,6 +63,7 @@ async function getLinksFromUrl(url: string, path: string) {
   const baseUrl = new URL(url);
   const location = new URL(path, url);
   let response;
+
   try {
     response = await fetch(location.toString());
   } catch (error: unknown) {
@@ -63,13 +73,12 @@ async function getLinksFromUrl(url: string, path: string) {
         html: "",
         links: [],
       };
-    } else {
-      console.error(error);
-      return {
-        html: "",
-        links: [],
-      };
     }
+    console.error(error);
+    return {
+      html: "",
+      links: [],
+    };
   }

   const html = await response.text();
@@ -113,7 +122,9 @@ async function getLinksFromUrl(url: string, path: string) {
 }

 function splitUrl(url: URL) {
-  const baseUrl = `${url.protocol}//${url.hostname}`;
+  const baseUrl = `${url.protocol}//${url.hostname}${
+    url.port ? ":" + url.port : ""
+  }`;
   const basePath = url.pathname;
   return {
     baseUrl,
@@ -127,46 +138,69 @@ export type PageData = {
   html: string;
 };

-export async function* crawlPage(url: URL): AsyncGenerator<PageData> {
+export async function* crawlPage(
+  url: URL,
+  maxDepth: number = 3,
+): AsyncGenerator<PageData> {
+  console.log("Starting crawl from: ", url, " - Max Depth: ", maxDepth);
   const { baseUrl, basePath } = splitUrl(url);
-  let paths: string[] = [basePath];
+  let paths: { path: string; depth: number }[] = [{ path: basePath, depth: 0 }];

   if (url.hostname === "github.com") {
     const githubLinks = await crawlGithubRepo(url);
-    paths = [...paths, ...githubLinks];
+    const githubLinkObjects = githubLinks.map((link) => ({
+      path: link,
+      depth: 0,
+    }));
+    paths = [...paths, ...githubLinkObjects];
   }

   let index = 0;
-
   while (index < paths.length) {
-    const promises = paths
-      .slice(index, index + 50)
-      .map((path) => getLinksFromUrl(baseUrl, path));
-
-    const results = await Promise.all(promises);
-
-    for (const { html, links } of results) {
-      if (html !== "") {
-        yield {
-          url: url.toString(),
-          path: paths[index],
-          html: html,
-        };
-      }
+    const batch = paths.slice(index, index + 50);
+
+    try {
+      const promises = batch.map(({ path, depth }) =>
+        getLinksFromUrl(baseUrl, path).then((links) => ({
+          links,
+          path,
+          depth,
+        })),
+      ); // Adjust for depth tracking
+
+      const results = await Promise.all(promises);
+      for (const {
+        links: { html, links: linksArray },
+        path,
+        depth,
+      } of results) {
+        if (html !== "" && depth <= maxDepth) {
+          // Check depth
+          yield {
+            url: url.toString(),
+            path,
+            html,
+          };
+        }

-      for (let link of links) {
-        if (!paths.includes(link)) {
-          paths.push(link);
+        // Ensure we only add links if within depth limit
+        if (depth < maxDepth) {
+          for (let link of linksArray) {
+            if (!paths.some((p) => p.path === link)) {
+              paths.push({ path: link, depth: depth + 1 }); // Increment depth for new paths
+            }
+          }
         }
       }
-
-      index++;
+    } catch (e) {
+      if (e instanceof TypeError) {
+        console.warn("Error while crawling page: ", e); // Likely an invalid url, continue with process
+      } else {
+        console.error("Error while crawling page: ", e);
+      }
     }

-    paths = paths.filter((path) =>
-      results.some(
-        (result) => result.html !== "" && result.links.includes(path),
-      ),
-    );
+    index += batch.length; // Proceed to next batch
   }
-}
+  console.log("Crawl completed");
+}
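
Note: the crawler now records the depth at which every path was discovered and stops expanding links past `maxDepth`, replacing the old post-batch filter over `paths`. A small usage sketch of the updated generator (the start URL and import path are placeholders):

    import { URL } from "node:url";
    import { crawlPage } from "./crawl.js"; // path assumed

    async function main() {
      // Crawl at most two link-hops away from the starting page.
      const start = new URL("https://docs.example.com/intro"); // placeholder URL
      for await (const page of crawlPage(start, 2)) {
        console.log(page.path, `${page.html.length} bytes of HTML`);
      }
    }

    main().catch(console.error);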
