Skip to content

Commit bca7d7a

Browse files
authored
feat: replace got-specific HttpRequest with native Request interfaces (#3295)
Works towards removing `got-scraping` as a direct Crawlee dependency. Related to #3275 and #3071.
1 parent e2c6784 commit bca7d7a

File tree

9 files changed

+147
-282
lines changed

9 files changed

+147
-282
lines changed
Lines changed: 10 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -1,46 +1,17 @@
1-
import type {
2-
BaseHttpClient,
3-
HttpRequest,
4-
HttpResponse,
5-
RedirectHandler,
6-
ResponseTypes,
7-
StreamingHttpResponse,
8-
} from '@crawlee/core';
9-
import { Readable } from 'node:stream';
1+
import type { BaseHttpClient, SendRequestOptions, StreamOptions } from '@crawlee/core';
102

11-
export class CustomHttpClient implements BaseHttpClient {
12-
async sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
13-
request: HttpRequest<TResponseType>,
14-
): Promise<Response> {
15-
const requestHeaders = new Headers();
16-
for (let [headerName, headerValues] of Object.entries(request.headers ?? {})) {
17-
if (headerValues === undefined) {
18-
continue;
19-
}
20-
21-
if (!Array.isArray(headerValues)) {
22-
headerValues = [headerValues];
23-
}
24-
25-
for (const value of headerValues) {
26-
requestHeaders.append(headerName, value);
27-
}
28-
}
29-
30-
return fetch(request.url, {
31-
method: request.method,
32-
headers: requestHeaders,
33-
body: request.body as string,
34-
signal: request.signal,
3+
export class FetchHttpClient implements BaseHttpClient {
4+
async sendRequest(request: Request, options?: SendRequestOptions): Promise<Response> {
5+
const signal = options?.timeout ? AbortSignal.timeout(options.timeout ?? 0) : undefined;
6+
return fetch(request, {
7+
signal,
358
});
369
}
3710

38-
async stream(request: HttpRequest, _onRedirect?: RedirectHandler): Promise<Response> {
39-
return fetch(request.url, {
40-
method: request.method,
41-
headers: new Headers(),
42-
body: request.body as string,
43-
signal: request.signal,
11+
async stream(request: Request, options: StreamOptions): Promise<Response> {
12+
const signal = options?.timeout ? AbortSignal.timeout(options.timeout ?? 0) : undefined;
13+
return fetch(request, {
14+
signal,
4415
});
4516
}
4617
}

docs/guides/custom-http-client/usage.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
import { HttpCrawler } from 'crawlee';
2-
import { CustomHttpClient } from './implementation.js';
2+
import { FetchHttpClient } from './implementation.js';
33

44
const crawler = new HttpCrawler({
5-
httpClient: new CustomHttpClient(),
5+
httpClient: new FetchHttpClient(),
66
async requestHandler() {
77
/* ... */
88
},
Lines changed: 2 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,4 @@
1-
import {
2-
type BaseHttpClient,
3-
type HttpRequestOptions,
4-
processHttpRequestOptions,
5-
type Request,
6-
type Session,
7-
} from '@crawlee/core';
1+
import { type BaseHttpClient, type HttpRequestOptions, type Request, type Session } from '@crawlee/core';
82

93
/**
104
* Prepares a function to be used as the `sendRequest` context helper.
@@ -25,20 +19,6 @@ export function createSendRequest(httpClient: BaseHttpClient, originRequest: Req
2519
}
2620
: overrideOptions?.cookieJar;
2721

28-
const requestOptions = processHttpRequestOptions({
29-
url: originRequest.url,
30-
method: originRequest.method,
31-
headers: originRequest.headers,
32-
proxyUrl: session?.proxyInfo?.url,
33-
sessionToken: session,
34-
responseType: 'text',
35-
...overrideOptions,
36-
cookieJar,
37-
});
38-
39-
// Fill in body as the last step - `processHttpRequestOptions` may use either `body`, `json` or `form` so we cannot override it beforehand
40-
requestOptions.body ??= originRequest.payload;
41-
42-
return httpClient.sendRequest(requestOptions);
22+
return httpClient.sendRequest(originRequest.intoFetchAPIRequest(), { session, cookieJar: cookieJar as any });
4323
};
4424
}

packages/core/src/http_clients/base-http-client.ts

Lines changed: 35 additions & 96 deletions
Original file line number | Diff line number | Diff line change
@@ -1,35 +1,9 @@
1-
import type { Readable } from 'node:stream';
1+
import { Readable } from 'node:stream';
22

33
import type { AllowedHttpMethods } from '@crawlee/types';
44
import { applySearchParams, type SearchParams } from '@crawlee/utils';
55

6-
import type { FormDataLike } from './form-data-like.js';
7-
8-
type Timeout =
9-
| {
10-
lookup: number;
11-
connect: number;
12-
secureConnect: number;
13-
socket: number;
14-
send: number;
15-
response: number;
16-
}
17-
| { request: number };
18-
19-
/**
20-
* Maps permitted values of the `responseType` option on {@apilink HttpRequest} to the types that they produce.
21-
*/
22-
export interface ResponseTypes {
23-
'json': unknown;
24-
'text': string;
25-
'buffer': Buffer;
26-
}
27-
28-
interface Progress {
29-
percent: number;
30-
transferred: number;
31-
total?: number;
32-
}
6+
import type { Session } from '../session_pool/session.js';
337

348
// TODO BC with got - remove the options and callback parameters in 4.0
359
interface ToughCookieJar {
@@ -53,28 +27,23 @@ interface PromiseCookieJar {
5327
setCookie: (rawCookie: string, url: string) => Promise<unknown>;
5428
}
5529

56-
type SimpleHeaders = Record<string, string | string[] | undefined>;
57-
5830
/**
5931
* HTTP Request as accepted by {@apilink BaseHttpClient} methods.
6032
*/
61-
export interface HttpRequest<TResponseType extends keyof ResponseTypes = 'text'> {
62-
[k: string]: unknown; // TODO BC with got - remove in 4.0
63-
33+
export interface HttpRequest {
6434
url: string | URL;
6535
method?: AllowedHttpMethods;
66-
headers?: SimpleHeaders;
67-
body?: string | Buffer | Readable | Generator | AsyncGenerator | FormDataLike;
36+
headers?: Headers;
37+
body?: Readable;
6838

6939
signal?: AbortSignal;
70-
timeout?: Partial<Timeout>;
40+
timeout?: number;
7141

7242
cookieJar?: ToughCookieJar | PromiseCookieJar;
7343
followRedirect?: boolean | ((response: any) => boolean); // TODO BC with got - specify type better in 4.0
7444
maxRedirects?: number;
7545

7646
encoding?: BufferEncoding;
77-
responseType?: TResponseType;
7847
throwHttpErrors?: boolean;
7948

8049
// from got-scraping Context
@@ -91,8 +60,7 @@ export interface HttpRequest<TResponseType extends keyof ResponseTypes = 'text'>
9160
/**
9261
* Additional options for HTTP requests that need to be handled separately before passing to {@apilink BaseHttpClient}.
9362
*/
94-
export interface HttpRequestOptions<TResponseType extends keyof ResponseTypes = 'text'>
95-
extends HttpRequest<TResponseType> {
63+
export interface HttpRequestOptions extends HttpRequest {
9664
/** Search (query string) parameters to be appended to the request URL */
9765
searchParams?: SearchParams;
9866

@@ -107,28 +75,6 @@ export interface HttpRequestOptions<TResponseType extends keyof ResponseTypes =
10775
password?: string;
10876
}
10977

110-
/**
111-
* HTTP response data, without a body, as returned by {@apilink BaseHttpClient} methods.
112-
*/
113-
export interface BaseHttpResponseData {
114-
redirectUrls: URL[];
115-
url: string;
116-
117-
ip?: string;
118-
statusCode: number;
119-
statusMessage?: string;
120-
121-
headers: SimpleHeaders;
122-
trailers: SimpleHeaders; // Populated after the whole message is processed
123-
124-
complete: boolean;
125-
}
126-
127-
interface HttpResponseWithoutBody<TResponseType extends keyof ResponseTypes = keyof ResponseTypes>
128-
extends BaseHttpResponseData {
129-
request: HttpRequest<TResponseType>;
130-
}
131-
13278
export class ResponseWithUrl extends Response {
13379
override url: string;
13480
constructor(body: BodyInit | null, init: ResponseInit & { url?: string }) {
@@ -137,63 +83,52 @@ export class ResponseWithUrl extends Response {
13783
}
13884
}
13985

140-
/**
141-
* HTTP response data as returned by the {@apilink BaseHttpClient.sendRequest} method.
142-
*/
143-
export interface HttpResponse<TResponseType extends keyof ResponseTypes = keyof ResponseTypes>
144-
extends HttpResponseWithoutBody<TResponseType> {
145-
[k: string]: any; // TODO BC with got - remove in 4.0
146-
147-
body: ResponseTypes[TResponseType];
148-
}
149-
150-
/**
151-
* HTTP response data as returned by the {@apilink BaseHttpClient.stream} method.
152-
*/
153-
export interface StreamingHttpResponse extends HttpResponseWithoutBody {
154-
stream: Readable;
155-
readonly downloadProgress: Progress;
156-
readonly uploadProgress: Progress;
157-
}
158-
15986
/**
16087
* Type of a function called when an HTTP redirect takes place. It is allowed to mutate the `updatedRequest` argument.
16188
*/
16289
export type RedirectHandler = (
16390
redirectResponse: Response,
164-
updatedRequest: { url?: string | URL; headers: SimpleHeaders },
91+
updatedRequest: { url?: string | URL; headers: Headers },
16592
) => void;
16693

94+
export interface SendRequestOptions {
95+
session?: Session;
96+
cookieJar?: ToughCookieJar;
97+
timeout?: number;
98+
}
99+
100+
export interface StreamOptions extends SendRequestOptions {
101+
onRedirect?: RedirectHandler;
102+
}
103+
167104
/**
168105
* Interface for user-defined HTTP clients to be used for plain HTTP crawling and for sending additional requests during a crawl.
169106
*/
170107
export interface BaseHttpClient {
171108
/**
172109
* Perform an HTTP Request and return the complete response.
173110
*/
174-
sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
175-
request: HttpRequest<TResponseType>,
176-
): Promise<Response>;
111+
sendRequest(request: Request, options?: SendRequestOptions): Promise<Response>;
177112

178113
/**
179114
* Perform an HTTP Request and return after the response headers are received. The body may be read from a stream contained in the response.
180115
*/
181-
stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<Response>;
116+
stream(request: Request, options?: StreamOptions): Promise<Response>;
182117
}
183118

184119
/**
185120
* Converts {@apilink HttpRequestOptions} to a {@apilink HttpRequest}.
186121
*/
187-
export function processHttpRequestOptions<TResponseType extends keyof ResponseTypes = 'text'>({
122+
export function processHttpRequestOptions({
188123
searchParams,
189124
form,
190125
json,
191126
username,
192127
password,
193128
...request
194-
}: HttpRequestOptions<TResponseType>): HttpRequest<TResponseType> {
129+
}: HttpRequestOptions): HttpRequest {
195130
const url = new URL(request.url);
196-
const headers = { ...request.headers };
131+
const headers = new Headers(request.headers);
197132

198133
applySearchParams(url, searchParams);
199134

@@ -203,27 +138,31 @@ export function processHttpRequestOptions<TResponseType extends keyof ResponseTy
203138

204139
const body = (() => {
205140
if (form !== undefined) {
206-
return new URLSearchParams(form).toString();
141+
return Readable.from(new URLSearchParams(form).toString());
207142
}
208143

209144
if (json !== undefined) {
210-
return JSON.stringify(json);
145+
return Readable.from(JSON.stringify(json));
146+
}
147+
148+
if (request.body !== undefined) {
149+
return Readable.from(request.body);
211150
}
212151

213-
return request.body;
152+
return undefined;
214153
})();
215154

216-
if (form !== undefined) {
217-
headers['content-type'] ??= 'application/x-www-form-urlencoded';
155+
if (form !== undefined && !headers.has('content-type')) {
156+
headers.set('content-type', 'application/x-www-form-urlencoded');
218157
}
219158

220-
if (json !== undefined) {
221-
headers['content-type'] ??= 'application/json';
159+
if (json !== undefined && !headers.has('content-type')) {
160+
headers.set('content-type', 'application/json');
222161
}
223162

224163
if (username !== undefined || password !== undefined) {
225164
const encodedAuth = Buffer.from(`${username ?? ''}:${password ?? ''}`).toString('base64');
226-
headers.authorization = `Basic ${encodedAuth}`;
165+
headers.set('authorization', `Basic ${encodedAuth}`);
227166
}
228167

229168
return { ...request, body, url, headers };

0 commit comments

Comments
 (0)