Skip to content

Commit cf3fbcb

Browse files
Improve URL handler for collector processes (#4504)
* Improve URL handler for collector processes * dev build
1 parent 3259ede commit cf3fbcb

File tree

7 files changed

+161
-6
lines changed

7 files changed

+161
-6
lines changed

.github/workflows/dev-build.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ concurrency:
66

77
on:
88
push:
9-
branches: ['4499-tooltips'] # put your current branch to create a build. Core team only.
9+
branches: ['improve-url-handler-collector'] # put your current branch to create a build. Core team only.
1010
paths-ignore:
1111
- '**.md'
1212
- 'cloud-deployments/*'
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
const { validURL, validateURL } = require("../../../utils/url");
2+
3+
// Mock the RuntimeSettings module
4+
jest.mock("../../../utils/runtimeSettings", () => {
5+
const mockInstance = {
6+
get: jest.fn(),
7+
set: jest.fn(),
8+
};
9+
return jest.fn().mockImplementation(() => mockInstance);
10+
});
11+
12+
describe("validURL", () => {
13+
let mockRuntimeSettings;
14+
15+
beforeEach(() => {
16+
const RuntimeSettings = require("../../../utils/runtimeSettings");
17+
mockRuntimeSettings = new RuntimeSettings();
18+
jest.clearAllMocks();
19+
});
20+
21+
it("should validate a valid URL", () => {
22+
mockRuntimeSettings.get.mockImplementation((key) => {
23+
if (key === "allowAnyIp") return false;
24+
if (key === "seenAnyIpWarning") return true; // silence the warning for tests
25+
return false;
26+
});
27+
28+
expect(validURL("https://www.google.com")).toBe(true);
29+
expect(validURL("http://www.google.com")).toBe(true);
30+
31+
// JS URL does not require extensions, so in theory
32+
// these should be valid
33+
expect(validURL("https://random")).toBe(true);
34+
expect(validURL("http://123")).toBe(true);
35+
36+
// missing protocols
37+
expect(validURL("www.google.com")).toBe(false);
38+
expect(validURL("google.com")).toBe(false);
39+
40+
// invalid protocols
41+
expect(validURL("ftp://www.google.com")).toBe(false);
42+
expect(validURL("mailto://www.google.com")).toBe(false);
43+
expect(validURL("tel://www.google.com")).toBe(false);
44+
expect(validURL("data://www.google.com")).toBe(false);
45+
});
46+
47+
it("should block private/local IPs when allowAnyIp is false (default behavior)", () => {
48+
mockRuntimeSettings.get.mockImplementation((key) => {
49+
if (key === "allowAnyIp") return false;
50+
if (key === "seenAnyIpWarning") return true; // silence the warning for tests
51+
return false;
52+
});
53+
54+
expect(validURL("http://192.168.1.1")).toBe(false);
55+
expect(validURL("http://10.0.0.1")).toBe(false);
56+
expect(validURL("http://172.16.0.1")).toBe(false);
57+
58+
// But localhost should still be allowed
59+
expect(validURL("http://127.0.0.1")).toBe(true);
60+
expect(validURL("http://0.0.0.0")).toBe(true);
61+
});
62+
63+
it("should allow any IP when allowAnyIp is true", () => {
64+
mockRuntimeSettings.get.mockImplementation((key) => {
65+
if (key === "allowAnyIp") return true;
66+
if (key === "seenAnyIpWarning") return true; // silence the warning for tests
67+
return false;
68+
});
69+
70+
expect(validURL("http://192.168.1.1")).toBe(true);
71+
expect(validURL("http://10.0.0.1")).toBe(true);
72+
expect(validURL("http://172.16.0.1")).toBe(true);
73+
});
74+
});
75+
76+
describe("validateURL", () => {
77+
it("should return the exact same URL if it's already valid", () => {
78+
expect(validateURL("https://www.google.com")).toBe("https://www.google.com");
79+
expect(validateURL("http://www.google.com")).toBe("http://www.google.com");
80+
expect(validateURL("https://random")).toBe("https://random");
81+
82+
// With numbers as a url this will turn into an ip
83+
expect(validateURL("123")).toBe("https://0.0.0.123");
84+
expect(validateURL("123.123.123.123")).toBe("https://123.123.123.123");
85+
expect(validateURL("http://127.0.123.45")).toBe("http://127.0.123.45");
86+
});
87+
88+
it("should assume https:// if the URL doesn't have a protocol", () => {
89+
expect(validateURL("www.google.com")).toBe("https://www.google.com");
90+
expect(validateURL("google.com")).toBe("https://google.com");
91+
expect(validateURL("ftp://www.google.com")).toBe("ftp://www.google.com");
92+
expect(validateURL("mailto://www.google.com")).toBe("mailto://www.google.com");
93+
expect(validateURL("tel://www.google.com")).toBe("tel://www.google.com");
94+
expect(validateURL("data://www.google.com")).toBe("data://www.google.com");
95+
});
96+
97+
it("should remove trailing slashes post-validation", () => {
98+
expect(validateURL("https://www.google.com/")).toBe("https://www.google.com");
99+
expect(validateURL("http://www.google.com/")).toBe("http://www.google.com");
100+
expect(validateURL("https://random/")).toBe("https://random");
101+
});
102+
103+
it("should handle edge cases and bad data inputs", () => {
104+
expect(validateURL({})).toBe("");
105+
expect(validateURL(null)).toBe("");
106+
expect(validateURL(undefined)).toBe("");
107+
expect(validateURL(124512)).toBe("");
108+
expect(validateURL("")).toBe("");
109+
expect(validateURL(" ")).toBe("");
110+
expect(validateURL(" look here! ")).toBe("look here!");
111+
});
112+
});

collector/extensions/index.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ const { setDataSigner } = require("../middleware/setDataSigner");
22
const { verifyPayloadIntegrity } = require("../middleware/verifyIntegrity");
33
const { resolveRepoLoader, resolveRepoLoaderFunction } = require("../utils/extensions/RepoLoader");
44
const { reqBody } = require("../utils/http");
5-
const { validURL } = require("../utils/url");
5+
const { validURL, validateURL } = require("../utils/url");
66
const RESYNC_METHODS = require("./resync");
77
const { loadObsidianVault } = require("../utils/extensions/ObsidianVault");
88

@@ -119,6 +119,7 @@ function extensions(app) {
119119
try {
120120
const websiteDepth = require("../utils/extensions/WebsiteDepth");
121121
const { url, depth = 1, maxLinks = 20 } = reqBody(request);
122+
url = validateURL(url);
122123
if (!validURL(url)) throw new Error("Not a valid URL.");
123124
const scrapedData = await websiteDepth(url, depth, maxLinks);
124125
response.status(200).json({ success: true, data: scrapedData });

collector/processLink/convert/generic.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ async function scrapeGenericUrl({
111111
headers: scraperHeaders,
112112
});
113113

114-
if (!content.length) {
114+
if (!content || !content.length) {
115115
console.error(`Resulting URL content was empty at ${link}.`);
116116
return returnResult({
117117
success: false,

collector/processLink/index.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
const { validURL } = require("../utils/url");
22
const { scrapeGenericUrl } = require("./convert/generic");
3+
const { validateURL } = require("../utils/url");
34

45
/**
56
* Process a link and return the text content. This util will save the link as a document
@@ -10,6 +11,7 @@ const { scrapeGenericUrl } = require("./convert/generic");
1011
* @returns {Promise<{success: boolean, content: string}>} - Response from collector
1112
*/
1213
async function processLink(link, scraperHeaders = {}, metadata = {}) {
14+
link = validateURL(link);
1315
if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
1416
return await scrapeGenericUrl({
1517
link,
@@ -28,6 +30,7 @@ async function processLink(link, scraperHeaders = {}, metadata = {}) {
2830
* @returns {Promise<{success: boolean, content: string}>} - Response from collector
2931
*/
3032
async function getLinkText(link, captureAs = "text") {
33+
link = validateURL(link);
3134
if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
3235
return await scrapeGenericUrl({
3336
link,

collector/utils/url/index.js

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ function isInvalidIp({ hostname }) {
5454
}
5555

5656
/**
57-
* Validates a URL
57+
* Validates a URL strictly
5858
* - Checks the URL forms a valid URL
5959
* - Checks the URL is at least HTTP(S)
6060
* - Checks the URL is not an internal IP - can be bypassed via COLLECTOR_ALLOW_ANY_IP
@@ -71,6 +71,33 @@ function validURL(url) {
7171
return false;
7272
}
7373

74+
/**
75+
* Modifies a URL to be valid:
76+
* - Checks the URL is at least HTTP(S) so that protocol exists
77+
* - Checks the URL forms a valid URL
78+
* @param {string} url
79+
* @returns {string}
80+
*/
81+
function validateURL(url) {
82+
try {
83+
let destination = url.trim().toLowerCase();
84+
// If the URL has a protocol, just pass through
85+
if (destination.includes("://")) {
86+
destination = new URL(destination).toString();
87+
} else {
88+
// If the URL doesn't have a protocol, assume https://
89+
destination = new URL(`https://${destination.trim()}`).toString();
90+
}
91+
92+
// If the URL ends with a slash, remove it
93+
return destination.endsWith("/") ? destination.slice(0, -1) : destination;
94+
} catch {
95+
if (typeof url !== "string") return "";
96+
return url.trim();
97+
}
98+
}
99+
74100
module.exports = {
75101
validURL,
102+
validateURL,
76103
};

server/utils/agents/aibitat/plugins/web-scraping.js

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,12 @@ const webScraping = {
4545
if (url) return await this.scrape(url);
4646
return "There is nothing we can do. This function call returns no information.";
4747
} catch (error) {
48+
this.super.handlerProps.log(
49+
`Web Scraping Error: ${error.message}`
50+
);
51+
this.super.introspect(
52+
`${this.caller}: Web Scraping Error: ${error.message}`
53+
);
4854
return `There was an error while calling the function. No data or response was found. Let the user know this was the error: ${error.message}`;
4955
}
5056
},
@@ -78,15 +84,21 @@ const webScraping = {
7884
}
7985

8086
const { TokenManager } = require("../../../helpers/tiktoken");
87+
const tokenEstimate = new TokenManager(
88+
this.super.model
89+
).countFromString(content);
8190
if (
82-
new TokenManager(this.super.model).countFromString(content) <
91+
tokenEstimate <
8392
Provider.contextLimit(this.super.provider, this.super.model)
8493
) {
94+
this.super.introspect(
95+
`${this.caller}: Looking over the content of the page. ~${tokenEstimate} tokens.`
96+
);
8597
return content;
8698
}
8799

88100
this.super.introspect(
89-
`${this.caller}: This page's content is way too long. I will summarize it right now.`
101+
`${this.caller}: This page's content exceeds the model's context limit. Summarizing it right now.`
90102
);
91103
this.super.onAbort(() => {
92104
this.super.handlerProps.log(

0 commit comments

Comments
 (0)