Skip to content

Commit 3160f71

Browse files
authored
fix: rename RobotsFile to RobotsTxtFile (#2913)
The old name is still supported as an alias; it will be removed in v4. Related: #2910
1 parent 0eabed1 commit 3160f71

File tree

9 files changed

+35
-31
lines changed

9 files changed

+35
-31
lines changed

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ import {
5050
validators,
5151
} from '@crawlee/core';
5252
import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
53-
import { RobotsFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
53+
import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
5454
import { stringify } from 'csv-stringify/sync';
5555
import { ensureDir, writeFile, writeJSON } from 'fs-extra';
5656
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
@@ -520,7 +520,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
520520
private _closeEvents?: boolean;
521521

522522
private experiments: CrawlerExperiments;
523-
private readonly robotsTxtFileCache: LruCache<RobotsFile>;
523+
private readonly robotsTxtFileCache: LruCache<RobotsTxtFile>;
524524
private _experimentWarnings: Partial<Record<keyof CrawlerExperiments, boolean>> = {};
525525

526526
protected static optionsShape = {
@@ -1178,7 +1178,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
11781178
return !robotsTxtFile || robotsTxtFile.isAllowed(url);
11791179
}
11801180

1181-
protected async getRobotsTxtFileForUrl(url: string): Promise<RobotsFile | undefined> {
1181+
protected async getRobotsTxtFileForUrl(url: string): Promise<RobotsTxtFile | undefined> {
11821182
if (!this.respectRobotsTxtFile) {
11831183
return undefined;
11841184
}
@@ -1191,7 +1191,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
11911191
return cachedRobotsTxtFile;
11921192
}
11931193

1194-
const robotsTxtFile = await RobotsFile.find(url);
1194+
const robotsTxtFile = await RobotsTxtFile.find(url);
11951195
this.robotsTxtFileCache.add(origin, robotsTxtFile);
11961196

11971197
return robotsTxtFile;

packages/browser-crawler/src/internals/browser-crawler.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ import type {
3838
} from '@crawlee/browser-pool';
3939
import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
4040
import type { Cookie as CookieObject } from '@crawlee/types';
41-
import type { RobotsFile } from '@crawlee/utils';
41+
import type { RobotsTxtFile } from '@crawlee/utils';
4242
import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
4343
import ow from 'ow';
4444
import type { ReadonlyDeep } from 'type-fest';
@@ -791,7 +791,7 @@ interface EnqueueLinksInternalOptions {
791791
options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>;
792792
page: CommonPage;
793793
requestQueue: RequestProvider;
794-
robotsTxtFile?: RobotsFile;
794+
robotsTxtFile?: RobotsTxtFile;
795795
originalRequestUrl: string;
796796
finalRequestUrl?: string;
797797
}

packages/cheerio-crawler/src/internals/cheerio-crawler.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import type {
1515
} from '@crawlee/http';
1616
import { enqueueLinks, HttpCrawler, resolveBaseUrlForEnqueueLinksFiltering, Router } from '@crawlee/http';
1717
import type { Dictionary } from '@crawlee/types';
18-
import { type CheerioRoot, extractUrlsFromCheerio, type RobotsFile } from '@crawlee/utils';
18+
import { type CheerioRoot, extractUrlsFromCheerio, type RobotsTxtFile } from '@crawlee/utils';
1919
import type { CheerioOptions } from 'cheerio';
2020
import * as cheerio from 'cheerio';
2121
import { DomHandler, parseDocument } from 'htmlparser2';
@@ -239,7 +239,7 @@ interface EnqueueLinksInternalOptions {
239239
options?: EnqueueLinksOptions;
240240
$: cheerio.CheerioAPI | null;
241241
requestQueue: RequestProvider;
242-
robotsTxtFile?: RobotsFile;
242+
robotsTxtFile?: RobotsTxtFile;
243243
originalRequestUrl: string;
244244
finalRequestUrl?: string;
245245
}

packages/core/src/enqueue_links/enqueue_links.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
2-
import { type RobotsFile } from '@crawlee/utils';
2+
import { type RobotsTxtFile } from '@crawlee/utils';
33
import ow from 'ow';
44
import { getDomain } from 'tldts';
55
import type { SetRequired } from 'type-fest';
@@ -161,10 +161,10 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
161161
waitForAllRequestsToBeAdded?: boolean;
162162

163163
/**
164-
* RobotsFile instance for the current request that triggered the `enqueueLinks`.
164+
* RobotsTxtFile instance for the current request that triggered the `enqueueLinks`.
165165
* If provided, disallowed URLs will be ignored.
166166
*/
167-
robotsTxtFile?: RobotsFile;
167+
robotsTxtFile?: RobotsTxtFile;
168168
}
169169

170170
/**

packages/jsdom-crawler/src/internals/jsdom-crawler.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ import {
2020
tryAbsoluteURL,
2121
} from '@crawlee/http';
2222
import type { Dictionary } from '@crawlee/types';
23-
import { type CheerioRoot, type RobotsFile, sleep } from '@crawlee/utils';
23+
import { type CheerioRoot, type RobotsTxtFile, sleep } from '@crawlee/utils';
2424
import * as cheerio from 'cheerio';
2525
import type { DOMWindow } from 'jsdom';
2626
import { JSDOM, ResourceLoader, VirtualConsole } from 'jsdom';
@@ -344,7 +344,7 @@ interface EnqueueLinksInternalOptions {
344344
options?: EnqueueLinksOptions;
345345
window: DOMWindow | null;
346346
requestQueue: RequestProvider;
347-
robotsTxtFile?: RobotsFile;
347+
robotsTxtFile?: RobotsTxtFile;
348348
originalRequestUrl: string;
349349
finalRequestUrl?: string;
350350
}

packages/linkedom-crawler/src/internals/linkedom-crawler.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ import {
1919
tryAbsoluteURL,
2020
} from '@crawlee/http';
2121
import type { Dictionary } from '@crawlee/types';
22-
import { type CheerioRoot, type RobotsFile, sleep } from '@crawlee/utils';
22+
import { type CheerioRoot, type RobotsTxtFile, sleep } from '@crawlee/utils';
2323
import * as cheerio from 'cheerio';
2424
// @ts-expect-error This throws a compilation error due to TypeScript not inferring the module has CJS versions too
2525
import { DOMParser } from 'linkedom/cached';
@@ -227,7 +227,7 @@ interface EnqueueLinksInternalOptions {
227227
options?: LinkeDOMCrawlerEnqueueLinksOptions;
228228
window: Window | null;
229229
requestQueue: RequestProvider;
230-
robotsTxtFile?: RobotsFile;
230+
robotsTxtFile?: RobotsTxtFile;
231231
originalRequestUrl: string;
232232
finalRequestUrl?: string;
233233
}

packages/utils/src/internals/robots.ts

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ let HTTPError: typeof HTTPErrorClass;
1414
* **Example usage:**
1515
* ```javascript
1616
* // Load the robots.txt file
17-
* const robots = await RobotsFile.find('https://crawlee.dev/js/docs/introduction/first-crawler');
17+
* const robots = await RobotsTxtFile.find('https://crawlee.dev/js/docs/introduction/first-crawler');
1818
*
1919
* // Check if a URL should be crawled according to robots.txt
2020
* const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
@@ -26,7 +26,7 @@ let HTTPError: typeof HTTPErrorClass;
2626
* await crawler.addRequests(await robots.parseUrlsFromSitemaps());
2727
* ```
2828
*/
29-
export class RobotsFile {
29+
export class RobotsTxtFile {
3030
private constructor(
3131
private robots: Pick<Robot, 'isAllowed' | 'getSitemaps'>,
3232
private proxyUrl?: string,
@@ -37,12 +37,12 @@ export class RobotsFile {
3737
* @param url the URL to fetch robots.txt for
3838
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
3939
*/
40-
static async find(url: string, proxyUrl?: string): Promise<RobotsFile> {
40+
static async find(url: string, proxyUrl?: string): Promise<RobotsTxtFile> {
4141
const robotsTxtFileUrl = new URL(url);
4242
robotsTxtFileUrl.pathname = '/robots.txt';
4343
robotsTxtFileUrl.search = '';
4444

45-
return RobotsFile.load(robotsTxtFileUrl.toString(), proxyUrl);
45+
return RobotsTxtFile.load(robotsTxtFileUrl.toString(), proxyUrl);
4646
}
4747

4848
/**
@@ -51,11 +51,11 @@ export class RobotsFile {
5151
* @param content contents of robots.txt
5252
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
5353
*/
54-
static from(url: string, content: string, proxyUrl?: string): RobotsFile {
55-
return new RobotsFile(robotsParser(url, content), proxyUrl);
54+
static from(url: string, content: string, proxyUrl?: string): RobotsTxtFile {
55+
return new RobotsTxtFile(robotsParser(url, content), proxyUrl);
5656
}
5757

58-
protected static async load(url: string, proxyUrl?: string): Promise<RobotsFile> {
58+
protected static async load(url: string, proxyUrl?: string): Promise<RobotsTxtFile> {
5959
if (!HTTPError) {
6060
HTTPError = (await import('got-scraping')).HTTPError;
6161
}
@@ -68,10 +68,10 @@ export class RobotsFile {
6868
responseType: 'text',
6969
});
7070

71-
return new RobotsFile(robotsParser(url.toString(), response.body), proxyUrl);
71+
return new RobotsTxtFile(robotsParser(url.toString(), response.body), proxyUrl);
7272
} catch (e) {
7373
if (e instanceof HTTPError && e.response.statusCode === 404) {
74-
return new RobotsFile(
74+
return new RobotsTxtFile(
7575
{
7676
isAllowed() {
7777
return true;
@@ -117,3 +117,6 @@ export class RobotsFile {
117117
return (await this.parseSitemaps()).urls;
118118
}
119119
}
120+
121+
// to stay backwards compatible
122+
export { RobotsTxtFile as RobotsFile };

packages/utils/src/internals/sitemap.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ export class Sitemap {
355355

356356
/**
357357
* Try to load sitemap from the most common locations - `/sitemap.xml` and `/sitemap.txt`.
358-
* For loading based on `Sitemap` entries in `robots.txt`, the {@apilink RobotsFile} class should be used.
358+
* For loading based on `Sitemap` entries in `robots.txt`, the {@apilink RobotsTxtFile} class should be used.
359359
* @param url The domain URL to fetch the sitemap for.
360360
* @param proxyUrl A proxy to be used for fetching the sitemap file.
361361
*/

packages/utils/test/robots.test.ts

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import nock from 'nock';
22
import { beforeEach, describe, expect, it } from 'vitest';
33

4-
import { RobotsFile } from '../src/internals/robots';
4+
import { RobotsTxtFile } from '../src/internals/robots';
55

6-
describe('RobotsFile', () => {
6+
describe('RobotsTxtFile', () => {
77
beforeEach(() => {
88
nock.disableNetConnect();
99
nock('http://not-exists.com')
@@ -37,19 +37,20 @@ describe('RobotsFile', () => {
3737
});
3838

3939
it('generates the correct robots.txt URL', async () => {
40-
const robots = await RobotsFile.find('http://not-exists.com/nested/index.html');
40+
const robots = await RobotsTxtFile.find('http://not-exists.com/nested/index.html');
4141
expect(robots.getSitemaps()).not.toHaveLength(0);
4242
});
4343

4444
it('parses allow/deny directives from robots.txt', async () => {
45-
const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
45+
const robots = await RobotsTxtFile.find('http://not-exists.com/robots.txt');
46+
console.log(robots.isAllowed('https://crawlee.dev'));
4647
expect(robots.isAllowed('http://not-exists.com/something/page.html')).toBe(true);
4748
expect(robots.isAllowed('http://not-exists.com/deny_googlebot/page.html')).toBe(true);
4849
expect(robots.isAllowed('http://not-exists.com/deny_all/page.html')).toBe(false);
4950
});
5051

5152
it('extracts sitemap urls', async () => {
52-
const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
53+
const robots = await RobotsTxtFile.find('http://not-exists.com/robots.txt');
5354
expect(robots.getSitemaps()).toEqual([
5455
'http://not-exists.com/sitemap_1.xml',
5556
'http://not-exists.com/sitemap_2.xml',
@@ -62,7 +63,7 @@ Disallow: *deny_all/
6263
crawl-delay: 10
6364
User-agent: Googlebot
6465
Disallow: *deny_googlebot/`;
65-
const robots = RobotsFile.from('http://not-exists.com/robots.txt', contents);
66+
const robots = RobotsTxtFile.from('http://not-exists.com/robots.txt', contents);
6667
expect(robots.isAllowed('http://not-exists.com/something/page.html')).toBe(true);
6768
expect(robots.isAllowed('http://not-exists.com/deny_googlebot/page.html')).toBe(true);
6869
expect(robots.isAllowed('http://not-exists.com/deny_googlebot/page.html', 'Googlebot')).toBe(false);

0 commit comments

Comments
 (0)