Skip to content

Commit b171f3a

Browse files
authored
feat: fetch generated llms.txt (#83)
1 parent 889e962 commit b171f3a

File tree

6 files changed

+349
-229
lines changed

6 files changed

+349
-229
lines changed

src/api/tools/commonTools.ts

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,18 @@ import {
1010
import { cacheIsIndexed, getIsIndexedFromCache } from "../utils/cache.js";
1111
import htmlToMd from "html-to-md";
1212
import { searchCode } from "../utils/githubClient.js";
13+
import { fetchFileFromR2 } from "../utils/r2.js";
14+
import { generateServerName } from "../../shared/nameUtils.js";
1315

1416
// Add env parameter to access Cloudflare's bindings
1517
export async function fetchDocumentation({
1618
repoData,
1719
env,
20+
initiatedFromSearch = false,
1821
}: {
1922
repoData: RepoData;
2023
env: any;
24+
initiatedFromSearch?: boolean;
2125
}): Promise<{
2226
fileUsed: string;
2327
content: { type: "text"; text: string }[];
@@ -188,6 +192,24 @@ export async function fetchDocumentation({
188192
}
189193
}
190194
}
195+
// only in case of search, try to fetch pre-generated llms-full.txt
196+
if (!content && initiatedFromSearch) {
197+
// Try to fetch pre-generated llms-full.txt
198+
content =
199+
(await fetchFileFromR2(owner, repo, "llms-full.txt", env)) ?? null;
200+
if (content) {
201+
console.log(`Fetched pre-generated llms-full.txt for ${owner}/${repo}`);
202+
fileUsed = "llms-full.txt (generated)";
203+
}
204+
}
205+
if (!content) {
206+
// Try to fetch pre-generated llms.txt
207+
content = (await fetchFileFromR2(owner, repo, "llms.txt", env)) ?? null;
208+
if (content) {
209+
console.log(`Fetched pre-generated llms.txt for ${owner}/${repo}`);
210+
fileUsed = "llms.txt (generated)";
211+
}
212+
}
191213

192214
// Fallback to README.md if llms.txt not found in any location
193215
if (!content) {
@@ -632,3 +654,234 @@ export async function fetchUrlContent({ url, env }: { url: string; env: any }) {
632654
};
633655
}
634656
}
657+
658+
/**
 * Builds a tool name of the form `${prefix}${repo}${suffix}` and shortens it
 * until the tool name length plus the generated server name length fits the
 * allowed budget.
 *
 * Non-alphanumeric characters in the repository name are replaced with
 * underscores. If the full name does not fit, shortening is attempted in order:
 *  1. replace a `_documentation` suffix with `_docs`;
 *  2. drop trailing `_`-separated words from the sanitized repo name;
 *  3. truncate the sanitized repo name to the remaining room. At least one
 *     character is always kept, so the budget can still be exceeded when the
 *     server name alone leaves no room.
 *
 * @param prefix - The prefix for the tool name (e.g. `fetch_` or `search_`)
 * @param repo - The repository name; when it is null, undefined or empty an
 *   error is logged and `${prefix}${suffix}` is returned
 * @param suffix - The suffix for the tool name (e.g. `_documentation`)
 * @param maxCombinedLength - Largest allowed value (inclusive) of
 *   tool name length + server name length; defaults to 60
 * @returns The tool name, shortened as far as needed to respect the budget
 */
export function enforceToolNameLengthLimit(
  prefix: string,
  repo: string | null | undefined,
  suffix: string,
  maxCombinedLength = 60,
): string {
  if (!repo) {
    console.error(
      "Repository name is null/undefined in enforceToolNameLengthLimit",
    );
    return `${prefix}${suffix}`;
  }

  // The budget is shared between the server name and the tool name.
  const serverNameLen = generateServerName(repo).length;
  const fits = (candidate: string): boolean =>
    candidate.length + serverNameLen <= maxCombinedLength;

  // Replace non-alphanumeric characters with underscores
  const repoName = repo.replace(/[^a-zA-Z0-9]/g, "_");

  const fullName = `${prefix}${repoName}${suffix}`;
  if (fits(fullName)) {
    return fullName;
  }

  // Only the "_documentation" suffix has a short form; others are kept as-is.
  const shortSuffix = suffix === "_documentation" ? "_docs" : suffix;

  // Step 1: try the shortened suffix with the full repo name
  if (shortSuffix !== suffix) {
    const candidate = `${prefix}${repoName}${shortSuffix}`;
    if (fits(candidate)) {
      return candidate;
    }
  }

  // Step 2: drop words from the end of the repo name, always keeping the first
  const words = repoName.split("_");
  for (let keep = words.length - 1; keep > 0; keep--) {
    const candidate = `${prefix}${words.slice(0, keep).join("_")}${shortSuffix}`;
    if (fits(candidate)) {
      return candidate;
    }
  }

  // Step 3: as a last resort, truncate the repo name to whatever room is left
  const room =
    maxCombinedLength - prefix.length - shortSuffix.length - serverNameLen;
  const truncatedRepo = repoName.substring(0, Math.max(1, room));
  return `${prefix}${truncatedRepo}${shortSuffix}`;
}
721+
722+
/**
 * Generate the name of the documentation search tool for a repository.
 *
 * @param repoData - Repository data; only `urlType` and `repo` are read
 * @returns For the `subdomain` and `github` URL types, the result of
 *   `enforceToolNameLengthLimit("search_", repo, "_documentation")`;
 *   otherwise (or if name generation throws) `search_documentation`
 */
export function generateSearchToolName({ urlType, repo }: RepoData): string {
  // Default tool name, also used as the fallback on error
  const defaultToolName = "search_documentation";
  try {
    if (urlType === "subdomain" || urlType === "github") {
      // Keep the combined server + tool name within the length limit
      return enforceToolNameLengthLimit("search_", repo, "_documentation");
    }
    return defaultToolName;
  } catch (error) {
    console.error("Error generating search tool name:", error);
    return defaultToolName;
  }
}
744+
745+
/**
 * Generate the description of the documentation search tool for a repository.
 *
 * The description for the `subdomain` and `github` URL types mentions the
 * matching fetch tool (from `generateFetchToolName`) so callers are told not
 * to use both tools for the same question.
 *
 * @param repoData - Repository data; `urlType`, `owner` and `repo` are read
 * @returns A repository-specific description for `subdomain` / `github` URL
 *   types, a generic description for any other URL type, or a shorter generic
 *   description if building the text throws
 */
export function generateSearchToolDescription({
  urlType,
  owner,
  repo,
}: RepoData): string {
  try {
    const fetchToolName = generateFetchToolName({
      urlType,
      owner,
      repo,
    });

    if (urlType === "subdomain") {
      return `Semantically search within the fetched documentation from the ${owner}/${repo} GitHub Pages. Useful for specific queries. Don't call if you already used ${fetchToolName}.`;
    }
    if (urlType === "github") {
      return `Semantically search within the fetched documentation from GitHub repository: ${owner}/${repo}. Useful for specific queries. Don't call if you already used ${fetchToolName}.`;
    }

    // Default description for any other URL type
    return "Semantically search within the fetched documentation for the current repository.";
  } catch (error) {
    // Log like the tool-name generators do instead of swallowing silently
    console.error("Error generating search tool description:", error);
    return "Search documentation for the current repository.";
  }
}
779+
780+
/**
 * Generate the description of the documentation fetch tool for a repository.
 *
 * @param repoData - Repository data without `host`; `urlType`, `owner` and
 *   `repo` are read
 * @returns A repository-specific description for `subdomain` / `github` URL
 *   types, a generic description for any other URL type, or a shorter generic
 *   description if building the text throws
 */
export function generateFetchToolDescription({
  urlType,
  owner,
  repo,
}: Omit<RepoData, "host">): string {
  try {
    if (urlType === "subdomain") {
      return `Fetch entire documentation file from the ${owner}/${repo} GitHub Pages. Useful for general questions.`;
    }
    if (urlType === "github") {
      return `Fetch entire documentation file from GitHub repository: ${owner}/${repo}. Useful for general questions.`;
    }

    // Default description for any other URL type
    return "Fetch entire documentation for the current repository.";
  } catch (error) {
    // Log like the tool-name generators do instead of swallowing silently
    console.error("Error generating fetch tool description:", error);
    return "Fetch documentation for the current repository.";
  }
}
807+
808+
/**
 * Generate the name of the documentation fetch tool for a repository.
 *
 * @param repoData - Repository data without `host`; only `urlType` and `repo`
 *   are read
 * @returns For the `subdomain` and `github` URL types, the result of
 *   `enforceToolNameLengthLimit("fetch_", repo, "_documentation")`;
 *   otherwise (or if name generation throws) `fetch_documentation`
 */
export function generateFetchToolName({
  urlType,
  repo,
}: Omit<RepoData, "host">): string {
  // Default tool name, also used as the fallback on error
  const defaultToolName = "fetch_documentation";
  try {
    if (urlType === "subdomain" || urlType === "github") {
      // Keep the combined server + tool name within the length limit
      return enforceToolNameLengthLimit("fetch_", repo, "_documentation");
    }
    return defaultToolName;
  } catch (error) {
    console.error("Error generating tool name:", error);
    return defaultToolName;
  }
}
836+
837+
/**
 * Generate the name of the code search tool for a repository.
 *
 * @param repoData - The repository data object; only `urlType` and `repo` are
 *   read
 * @returns For the `subdomain` and `github` URL types, the result of
 *   `enforceToolNameLengthLimit("search_", repo, "_code")`; otherwise (or if
 *   name generation throws) `search_code`
 */
export function generateCodeSearchToolName({
  urlType,
  repo,
}: RepoData): string {
  // Default tool name, also used as the fallback on error
  const defaultToolName = "search_code";
  try {
    if (urlType === "subdomain" || urlType === "github") {
      // Keep the combined server + tool name within the length limit
      return enforceToolNameLengthLimit("search_", repo, "_code");
    }
    return defaultToolName;
  } catch (error) {
    console.error("Error generating code search tool name:", error);
    return defaultToolName;
  }
}
861+
862+
/**
 * Generate the description of the code search tool for a repository.
 *
 * @param repoData - The repository data object; `urlType`, `owner` and `repo`
 *   are read
 * @returns A repository-specific description for `subdomain` / `github` URL
 *   types, a generic description for any other URL type, or a shorter generic
 *   description if building the text throws
 */
export function generateCodeSearchToolDescription({
  urlType,
  owner,
  repo,
}: RepoData): string {
  try {
    if (urlType === "subdomain") {
      return `Search for code within the ${owner}/${repo} GitHub repository. Returns matching files and code snippets.`;
    }
    if (urlType === "github") {
      return `Search for code within GitHub repository: ${owner}/${repo}. Returns matching files and code snippets.`;
    }

    // Default description for any other URL type
    return "Search code files in the current repository.";
  } catch (error) {
    // Log like the tool-name generators do instead of swallowing silently
    console.error("Error generating code search tool description:", error);
    return "Search code in the current repository.";
  }
}

0 commit comments

Comments
 (0)