Skip to content

Commit a4c236b

Browse files
added google bot option
1 parent a4e3966 commit a4c236b

File tree

3 files changed

+38
-8
lines changed

3 files changed

+38
-8
lines changed

INPUT_SCHEMA.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,12 @@
8686
"description": "Only works for puppeteer type. Be careful that Chrome is not guaranteed to work with Puppeteer.",
8787
"default": false
8888
},
89+
"useGoogleBotHeaders": {
90+
"title": "Use Google Bot headers",
91+
"type": "boolean",
92+
"description": "This option will allow you to bypass protection and/or paywall on some sites. Use with caution as it might get blocked.",
93+
"default": false
94+
},
8995
"saveSnapshots": {
9096
"title": "Save Snapshots",
9197
"type": "boolean",

src/constants.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
/**
 * Request headers sent when the actor is asked to present itself as Googlebot.
 *
 * - `User-Agent`: the Googlebot 2.1 crawler identification string.
 * - `Referer`: makes the visit look like it came from google.com.
 * - `X-Forwarded-For`: presumably an address from Googlebot's crawl range,
 *   for sites that trust this header — TODO confirm the address is current.
 */
const googleBotHeaders = {
    'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    Referer: 'https://www.google.com/',
    'X-Forwarded-For': '66.249.66.1',
};

module.exports = { GOOGLE_BOT_HEADERS: googleBotHeaders };

src/main.js

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ const cheerio = require('cheerio');
33

44
const { testHtml } = require('./checkers.js');
55
const { toSimpleState } = require('./utils.js');
6+
const { GOOGLE_BOT_HEADERS } = require('./constants.js');
67

78
Apify.main(async () => {
89
const input = await Apify.getInput();
@@ -22,6 +23,7 @@ Apify.main(async () => {
2223
retireInstanceAfterRequestCount = 10,
2324
headfull = false,
2425
useChrome = false,
26+
useGoogleBotHeaders = false,
2527
} = input;
2628

2729
const proxyUrl = proxyConfiguration.useApifyProxy
@@ -54,9 +56,16 @@ Apify.main(async () => {
5456
const requestQueue = await Apify.openRequestQueue();
5557

5658
for (const req of startUrls) {
57-
await requestQueue.addRequest({ ...req, headers: {'User-Agent': Apify.utils.getRandomUserAgent() } });
59+
await requestQueue.addRequest({
60+
...req,
61+
headers: useGoogleBotHeaders ? GOOGLE_BOT_HEADERS : { 'User-Agent': Apify.utils.getRandomUserAgent() },
62+
});
5863
for (let i = 0; i < replicateStartUrls; i++) {
59-
await requestQueue.addRequest({ ...req, uniqueKey: Math.random().toString(), headers: {'User-Agent': Apify.utils.getRandomUserAgent() } });
64+
await requestQueue.addRequest({
65+
...req,
66+
uniqueKey: Math.random().toString(),
67+
headers: useGoogleBotHeaders ? GOOGLE_BOT_HEADERS : { 'User-Agent': Apify.utils.getRandomUserAgent() },
68+
});
6069
}
6170
}
6271

@@ -73,7 +82,6 @@ Apify.main(async () => {
7382
await Apify.setValue(`${key}.html`, html, { contentType: 'text/html' });
7483
htmlUrl = `https://api.apify.com/v2/key-value-stores/${Apify.getEnv().defaultKeyValueStoreId}/records/${key}.html?disableRedirect=true`
7584
}
76-
7785
}
7886
state.total.push({ url: request.url, screenshotUrl, htmlUrl });
7987

@@ -109,12 +117,12 @@ Apify.main(async () => {
109117
requestQueue,
110118
baseUrl: request.loadedUrl,
111119
transformRequestFunction: (request) => {
112-
request.headers = { ...request.headers, 'User-Agent': Apify.utils.getRandomUserAgent() }
120+
request.headers = useGoogleBotHeaders ? GOOGLE_BOT_HEADERS : { 'User-Agent': Apify.utils.getRandomUserAgent() };
113121
return request;
114-
}
122+
},
115123
});
116124
}
117-
}
125+
};
118126

119127
const handleFailedRequestFunction = ({ request }) => {
120128
state.total.push({ url: request.url });
@@ -134,7 +142,15 @@ Apify.main(async () => {
134142
}
135143
state.statusCodes[statusCode].push({ url: request.url });
136144
}
137-
}
145+
};
146+
147+
/**
 * Navigation hook handed to the Puppeteer crawler: opens `request.url` in `page`.
 *
 * When the `useGoogleBotHeaders` input option is enabled, the Googlebot
 * `Referer` and `X-Forwarded-For` headers are attached to the page first.
 * The `User-Agent` header is not set here; it is supplied through the
 * `userAgent` launch option instead.
 *
 * When the option is disabled no extra headers are sent, so ordinary runs do
 * not carry a spoofed Google referrer or forwarded-for address.
 *
 * @param {Object} context
 * @param {Object} context.request - Request being processed; only `url` is read.
 * @param {Object} context.page - Puppeteer page used for the navigation.
 * @returns {Promise<*>} Whatever `page.goto` resolves to (60 s navigation timeout).
 */
const gotoFunction = async ({ request, page }) => {
    if (useGoogleBotHeaders) {
        await page.setExtraHTTPHeaders({
            Referer: GOOGLE_BOT_HEADERS.Referer,
            'X-Forwarded-For': GOOGLE_BOT_HEADERS['X-Forwarded-For'],
        });
    }
    return page.goto(request.url, { timeout: 60000 });
};
138154

139155
const basicOptions = {
140156
maxRequestRetries: 0,
@@ -151,13 +167,14 @@ Apify.main(async () => {
151167
stealth: true,
152168
headless: headfull ? undefined : true,
153169
useChrome,
170+
userAgent: useGoogleBotHeaders ? GOOGLE_BOT_HEADERS['User-Agent'] : Apify.utils.getRandomUserAgent(),
154171
};
155172

156173
const puppeteerPoolOptions = { retireInstanceAfterRequestCount };
157174

158175
const crawler = type === 'cheerio'
159176
? new Apify.CheerioCrawler({ ...basicOptions, proxyUrls: proxyUrl ? [proxyUrl] : null })
160-
: new Apify.PuppeteerCrawler({ ...basicOptions, launchPuppeteerOptions, puppeteerPoolOptions });
177+
: new Apify.PuppeteerCrawler({ ...basicOptions, launchPuppeteerOptions, puppeteerPoolOptions, gotoFunction });
161178

162179
await crawler.run();
163180

0 commit comments

Comments
 (0)