 import { Octokit } from "@octokit/rest";
 import cheerio from "cheerio";
 import fetch from "node-fetch";
-import { URL } from "url";
+import { URL } from "node:url";

 const IGNORE_PATHS_ENDING_IN = [
   "favicon.ico",
@@ -18,24 +18,33 @@ const IGNORE_PATHS_ENDING_IN = [

 const GITHUB_PATHS_TO_TRAVERSE = ["/blob/", "/tree/"];

+async function getDefaultBranch(owner: string, repo: string): Promise<string> {
+  const octokit = new Octokit({ auth: undefined });
+
+  const repoInfo = await octokit.repos.get({
+    owner,
+    repo,
+  });
+
+  return repoInfo.data.default_branch;
+}
+
 async function crawlGithubRepo(baseUrl: URL) {
   const octokit = new Octokit({
     auth: undefined,
   });

   const [_, owner, repo] = baseUrl.pathname.split("/");

-  let dirContentsConfig = {
-    owner: owner,
-    repo: repo,
-  };
+  const branch = await getDefaultBranch(owner, repo);
+  console.log("Github repo detected. Crawling", branch, "branch");

   const tree = await octokit.request(
     "GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
     {
       owner,
       repo,
-      tree_sha: "main",
+      tree_sha: branch,
       headers: {
         "X-GitHub-Api-Version": "2022-11-28",
       },
@@ -44,8 +53,8 @@ async function crawlGithubRepo(baseUrl: URL) {
   );

   const paths = tree.data.tree
-    .filter((file) => file.type === "blob" && file.path?.endsWith(".md"))
-    .map((file) => baseUrl.pathname + "/tree/main/" + file.path);
+    .filter((file: any) => file.type === "blob" && file.path?.endsWith(".md"))
+    .map((file: any) => baseUrl.pathname + "/tree/" + branch + "/" + file.path);

   return paths;
 }
@@ -54,6 +63,7 @@ async function getLinksFromUrl(url: string, path: string) {
   const baseUrl = new URL(url);
   const location = new URL(path, url);
   let response;
+
   try {
     response = await fetch(location.toString());
   } catch (error: unknown) {
@@ -63,13 +73,12 @@ async function getLinksFromUrl(url: string, path: string) {
         html: "",
         links: [],
       };
-    } else {
-      console.error(error);
-      return {
-        html: "",
-        links: [],
-      };
     }
+    console.error(error);
+    return {
+      html: "",
+      links: [],
+    };
   }

   const html = await response.text();
@@ -113,7 +122,9 @@ async function getLinksFromUrl(url: string, path: string) {
 }

 function splitUrl(url: URL) {
-  const baseUrl = `${url.protocol}//${url.hostname}`;
+  const baseUrl = `${url.protocol}//${url.hostname}${
+    url.port ? ":" + url.port : ""
+  }`;
   const basePath = url.pathname;
   return {
     baseUrl,
@@ -127,46 +138,69 @@ export type PageData = {
   html: string;
 };

-export async function* crawlPage(url: URL): AsyncGenerator<PageData> {
+export async function* crawlPage(
+  url: URL,
+  maxDepth: number = 3,
+): AsyncGenerator<PageData> {
+  console.log("Starting crawl from: ", url, " - Max Depth: ", maxDepth);
   const { baseUrl, basePath } = splitUrl(url);
-  let paths: string[] = [basePath];
+  let paths: { path: string; depth: number }[] = [{ path: basePath, depth: 0 }];

   if (url.hostname === "github.com") {
     const githubLinks = await crawlGithubRepo(url);
-    paths = [...paths, ...githubLinks];
+    const githubLinkObjects = githubLinks.map((link) => ({
+      path: link,
+      depth: 0,
+    }));
+    paths = [...paths, ...githubLinkObjects];
   }

   let index = 0;
-
   while (index < paths.length) {
-    const promises = paths
-      .slice(index, index + 50)
-      .map((path) => getLinksFromUrl(baseUrl, path));
-
-    const results = await Promise.all(promises);
-
-    for (const { html, links } of results) {
-      if (html !== "") {
-        yield {
-          url: url.toString(),
-          path: paths[index],
-          html: html,
-        };
-      }
+    const batch = paths.slice(index, index + 50);
+
+    try {
+      const promises = batch.map(({ path, depth }) =>
+        getLinksFromUrl(baseUrl, path).then((links) => ({
+          links,
+          path,
+          depth,
+        })),
+      ); // Keep path and depth alongside each fetched result
+
+      const results = await Promise.all(promises);
+      for (const {
+        links: { html, links: linksArray },
+        path,
+        depth,
+      } of results) {
+        if (html !== "" && depth <= maxDepth) {
+          // Only yield pages within the depth limit
+          yield {
+            url: url.toString(),
+            path,
+            html,
+          };
+        }

-      for (let link of links) {
-        if (!paths.includes(link)) {
-          paths.push(link);
+        // Ensure we only add links if within depth limit
+        if (depth < maxDepth) {
+          for (let link of linksArray) {
+            if (!paths.some((p) => p.path === link)) {
+              paths.push({ path: link, depth: depth + 1 }); // Increment depth for new paths
+            }
+          }
         }
       }
-
-      index++;
+    } catch (e) {
+      if (e instanceof TypeError) {
+        console.warn("Error while crawling page: ", e); // Likely an invalid URL; continue crawling
+      } else {
+        console.error("Error while crawling page: ", e);
+      }
     }

-    paths = paths.filter((path) =>
-      results.some(
-        (result) => result.html !== "" && result.links.includes(path),
-      ),
-    );
+    index += batch.length; // Proceed to next batch
   }
-}
+  console.log("Crawl completed");
+}
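
For context, a minimal sketch of how the updated crawlPage generator could be consumed once this change lands. The import path and the target URL are illustrative assumptions, not part of this PR:

import { crawlPage } from "./crawl"; // assumed path to the module changed above

async function main() {
  // maxDepth defaults to 3; pass a smaller value to stop following links sooner.
  const root = new URL("https://docs.example.com/intro"); // illustrative URL
  for await (const page of crawlPage(root, 2)) {
    console.log(`Crawled ${page.path} (${page.html.length} bytes of HTML)`);
  }
}

main().catch(console.error);

Each yielded PageData arrives as soon as its batch of up to 50 fetches resolves, so callers can start indexing pages before the crawl finishes.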