Skip to content

Commit 5849edd

Browse files
authored
Merge pull request #2 from watercrawl/update-struture-and-functionality
refactor: simplify tool architecture and update dependencies
2 parents 8155b2b + b14875d commit 5849edd

File tree

11 files changed

+1015
-1116
lines changed

11 files changed

+1015
-1116
lines changed

README.md

Lines changed: 1 addition & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -92,25 +92,9 @@ npm run build
9292
3. Link the package for local development:
9393

9494
```bash
95-
npm run dev:link
95+
npm link @watercrawl/mcp
9696
```
9797

98-
### Testing
99-
100-
The project includes tests for both SSE and npx modes:
101-
102-
```bash
103-
# Run all tests
104-
npm test
105-
106-
# Run only SSE tests
107-
npm run test:sse
108-
109-
# Run only npx tests
110-
npm run test:npx
111-
```
112-
113-
Tests require a valid WaterCrawl API key to be set in the `.env` file or passed as an environment variable.
11498

11599
### Contribution Guidelines
116100

package-lock.json

Lines changed: 914 additions & 865 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,8 @@
99
"scripts": {
1010
"clean": "rm -rf dist",
1111
"build": "npm run clean && node scripts/build.js",
12-
"start": "tsx --require tsconfig-paths/register index.ts",
13-
"cli": "tsx --require tsconfig-paths/register cli.ts",
12+
"start": "tsx --require tsconfig-paths/register src/index.ts",
13+
"cli": "tsx --require tsconfig-paths/register src/cli.ts",
1414
"fix-paths": "tsc-alias",
1515
"lint": "eslint . --ext .ts",
1616
"lint:fix": "eslint --fix . --ext .ts",
@@ -36,7 +36,7 @@
3636
"access": "public"
3737
},
3838
"dependencies": {
39-
"@watercrawl/nodejs": "^1.1.0",
39+
"@watercrawl/nodejs": "^1.2.1",
4040
"commander": "^13.1.0",
4141
"dotenv": "^16.5.0",
4242
"fastmcp": "^1.23.2",

src/tools/crawl-manager.ts

Lines changed: 23 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ import { Context, ToolParameters, UserError, Tool } from 'fastmcp';
33
import { getClient } from '@utils/client';
44

55
interface CrawlManagerArgs {
6-
action: 'list' | 'get' | 'stop' | 'download';
6+
action: 'list' | 'get' | 'get_results' | 'stop';
77
crawlRequestId?: string;
88
page?: number;
99
pageSize?: number;
@@ -22,18 +22,24 @@ const manageCrawl = async (args: CrawlManagerArgs | any, { session }: Context<an
2222
}
2323
const getResult = await client.getCrawlRequest(args.crawlRequestId);
2424
return JSON.stringify(getResult);
25+
case 'get_results':
26+
if (!args.crawlRequestId) {
27+
throw new UserError("crawlRequestId is required for 'get_results' action");
28+
}
29+
30+
const results = await client.getCrawlRequestResults(
31+
args.crawlRequestId,
32+
args.page || 1,
33+
args.pageSize || 10,
34+
args.download !== false,
35+
);
36+
return JSON.stringify(results);
2537
case 'stop':
2638
if (!args.crawlRequestId) {
2739
throw new UserError("crawlRequestId is required for 'stop' action");
2840
}
2941
await client.stopCrawlRequest(args.crawlRequestId);
3042
return JSON.stringify({ success: true, message: 'Crawl request stopped successfully' });
31-
case 'download':
32-
if (!args.crawlRequestId) {
33-
throw new UserError("crawlRequestId is required for 'download' action");
34-
}
35-
const downloadResult = await client.downloadCrawlRequest(args.crawlRequestId);
36-
return JSON.stringify(downloadResult);
3743
default:
3844
throw new UserError(`Unknown action: ${args.action}`);
3945
}
@@ -50,8 +56,16 @@ const parameters = z.object({
5056
.string()
5157
.optional()
5258
.describe('UUID of the crawl request (required for get, stop, and download actions)'),
53-
page: z.number().optional().default(1).describe('Page number for listing (1-indexed)'),
54-
pageSize: z.number().optional().default(10).describe('Number of items per page for listing'),
59+
page: z
60+
.number()
61+
.optional()
62+
.default(1)
63+
.describe('Page number for listing (1-indexed), can use for get_results and list actions'),
64+
pageSize: z
65+
.number()
66+
.optional()
67+
.default(10)
68+
.describe('Number of items per page for listing, can use for get_results and list actions'),
5569
});
5670

5771
export const CrawlManagerTool: Tool<any, ToolParameters> = {

src/tools/crawl.ts

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -29,10 +29,21 @@ const parameters = z.object({
2929
.object({
3030
max_depth: z.number().optional().describe('Maximum depth to crawl'),
3131
page_limit: z.number().optional().describe('Maximum number of pages to crawl'),
32-
allowed_domains: z.string().array().optional().describe('Allowed domains to crawl example: ["*.example.com"]'),
33-
exclude_paths: z.string().array().optional().describe('Paths to exclude from crawling example: ["/path/*"]'),
34-
include_paths: z.string().array().optional().describe('Paths to include in crawling example: ["/path/*"]'),
35-
32+
allowed_domains: z
33+
.string()
34+
.array()
35+
.optional()
36+
.describe('Allowed domains to crawl example: ["*.example.com"]'),
37+
exclude_paths: z
38+
.string()
39+
.array()
40+
.optional()
41+
.describe('Paths to exclude from crawling example: ["/path/*"]'),
42+
include_paths: z
43+
.string()
44+
.array()
45+
.optional()
46+
.describe('Paths to include in crawling example: ["/path/*"]'),
3647
})
3748
.optional()
3849
.describe('Spider options'),
@@ -66,7 +77,8 @@ const parameters = z.object({
6677

6778
export const CrawlTool: Tool<any, ToolParameters> = {
6879
name: 'crawl',
69-
description: 'Crawl a URL and its subpages with customizable depth and spider limitations. This is an async operation, with crawl manager you can get status and results.',
80+
description:
81+
'Crawl a URL and its subpages with customizable depth and spider limitations. This is an async operation, with crawl manager you can get status and results.',
7082
parameters: parameters,
7183
execute: crawlUrl,
7284
};

src/tools/index.ts

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,16 +3,12 @@ import { Tool, ToolParameters } from 'fastmcp';
33
import { SearchTool } from './search';
44
import { SitemapTool } from './sitemap';
55
import { CrawlManagerTool } from './crawl-manager';
6-
import { SearchManagerTool } from './search-manager';
7-
import { MonitorTool } from './monitor';
86
import { CrawlTool } from './crawl';
97

108
export const tools: Tool<any, ToolParameters>[] = [
119
ScrapeTool,
1210
SearchTool,
1311
SitemapTool,
14-
CrawlManagerTool,
15-
SearchManagerTool,
16-
MonitorTool,
1712
CrawlTool,
13+
CrawlManagerTool,
1814
];

src/tools/monitor.ts

Lines changed: 0 additions & 105 deletions
This file was deleted.

src/tools/scrape.ts

Lines changed: 21 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,33 +1,38 @@
11
import { z } from 'zod';
22
import { Context, ToolParameters, UserError, Tool } from 'fastmcp';
33
import { getClient } from '@utils/client';
4-
import type { PageOptions } from '@watercrawl/nodejs/dist/types';
4+
import type { CrawlRequest, PageOptions } from '@watercrawl/nodejs/dist/types';
55

66
interface ScrapeArgs {
7-
url: string;
7+
urls: string[];
88
pageOptions?: PageOptions;
9-
sync?: boolean;
10-
download?: boolean;
119
}
1210

1311
const scrapeUrl = async (args: ScrapeArgs | any, { session }: Context<any>) => {
1412
const client = getClient(session?.apiKey);
1513
try {
16-
const req = await client.scrapeUrl(
17-
args.url,
18-
args.pageOptions || {},
19-
{},
20-
args.sync === false ? false : true,
21-
args.download === false ? false : true,
22-
);
23-
return JSON.stringify(req);
14+
const req = await client.createBatchCrawlRequest(args.urls, {}, args.pageOptions || {});
15+
const results = [];
16+
for await (const data of client.monitorCrawlRequest(req.uuid, true)) {
17+
if (data.type === 'result') {
18+
results.push(data.data);
19+
}
20+
if (data.type === 'state' && (data.data as CrawlRequest).status === 'finished') {
21+
break;
22+
}
23+
}
24+
25+
return JSON.stringify({
26+
...req,
27+
results,
28+
});
2429
} catch (e) {
2530
throw new UserError(String(e));
2631
}
2732
};
2833

2934
const parameters = z.object({
30-
url: z.string().describe('URL to scrape'),
35+
urls: z.string().array().describe('List of URLs to scrape'),
3136
pageOptions: z
3237
.object({
3338
exclude_tags: z.string().array().optional().describe('HTML tags to exclude'),
@@ -59,8 +64,9 @@ const parameters = z.object({
5964
});
6065

6166
export const ScrapeTool: Tool<any, ToolParameters> = {
62-
name: 'scrape-url',
63-
description: 'Scrape a URL with optional configuration for page options, and more',
67+
name: 'scrape-urls',
68+
description:
69+
'Scrape multiple(or single) URL(s) with optional configuration for page options, and more',
6470
parameters: parameters,
6571
execute: scrapeUrl,
6672
};

src/tools/search-manager.ts

Lines changed: 0 additions & 63 deletions
This file was deleted.

0 commit comments

Comments
 (0)