@@ -3,6 +3,7 @@ const cheerio = require('cheerio');
3
3
4
4
const { testHtml } = require ( './checkers.js' ) ;
5
5
const { toSimpleState } = require ( './utils.js' ) ;
6
+ const { GOOGLE_BOT_HEADERS } = require ( './constants.js' ) ;
6
7
7
8
Apify . main ( async ( ) => {
8
9
const input = await Apify . getInput ( ) ;
@@ -22,6 +23,7 @@ Apify.main(async () => {
22
23
retireInstanceAfterRequestCount = 10 ,
23
24
headfull = false ,
24
25
useChrome = false ,
26
+ useGoogleBotHeaders = false ,
25
27
} = input ;
26
28
27
29
const proxyUrl = proxyConfiguration . useApifyProxy
@@ -54,9 +56,16 @@ Apify.main(async () => {
54
56
const requestQueue = await Apify . openRequestQueue ( ) ;
55
57
56
58
// Builds a fresh headers object for each enqueued request: a copy of the
// Googlebot header set when `useGoogleBotHeaders` is on, otherwise a single
// randomly chosen User-Agent. Returning a copy keeps requests from sharing
// (and possibly mutating) the GOOGLE_BOT_HEADERS constant.
const buildStartRequestHeaders = () => (useGoogleBotHeaders
    ? { ...GOOGLE_BOT_HEADERS }
    : { 'User-Agent': Apify.utils.getRandomUserAgent() });

// Enqueue every start URL once, followed by `replicateStartUrls` extra copies.
// Each copy gets a random uniqueKey — presumably so the request queue does not
// treat it as a duplicate of the first one (confirm against the queue docs).
// Requests are added sequentially so the enqueue order stays deterministic.
for (const req of startUrls) {
    await requestQueue.addRequest({
        ...req,
        headers: buildStartRequestHeaders(),
    });
    for (let i = 0; i < replicateStartUrls; i++) {
        await requestQueue.addRequest({
            ...req,
            uniqueKey: Math.random().toString(),
            headers: buildStartRequestHeaders(),
        });
    }
}
62
71
@@ -73,7 +82,6 @@ Apify.main(async () => {
73
82
await Apify . setValue ( `${ key } .html` , html , { contentType : 'text/html' } ) ;
74
83
htmlUrl = `https://api.apify.com/v2/key-value-stores/${ Apify . getEnv ( ) . defaultKeyValueStoreId } /records/${ key } .html?disableRedirect=true`
75
84
}
76
-
77
85
}
78
86
state . total . push ( { url : request . url , screenshotUrl, htmlUrl } ) ;
79
87
@@ -109,12 +117,12 @@ Apify.main(async () => {
109
117
requestQueue,
110
118
baseUrl : request . loadedUrl ,
111
119
transformRequestFunction : ( request ) => {
112
- request . headers = { ... request . headers , 'User-Agent' : Apify . utils . getRandomUserAgent ( ) }
120
+ request . headers = useGoogleBotHeaders ? GOOGLE_BOT_HEADERS : { 'User-Agent' : Apify . utils . getRandomUserAgent ( ) } ;
113
121
return request ;
114
- }
122
+ } ,
115
123
} ) ;
116
124
}
117
- }
125
+ } ;
118
126
119
127
const handleFailedRequestFunction = ( { request } ) => {
120
128
state . total . push ( { url : request . url } ) ;
@@ -134,7 +142,15 @@ Apify.main(async () => {
134
142
}
135
143
state . statusCodes [ statusCode ] . push ( { url : request . url } ) ;
136
144
}
137
- }
145
+ } ;
146
+
147
/**
 * Navigation step passed to the PuppeteerCrawler as `gotoFunction`.
 *
 * When `useGoogleBotHeaders` is enabled, the Googlebot `Referer` and
 * `X-Forwarded-For` values are attached to the page as extra HTTP headers
 * before navigating. When it is disabled no extra headers are set, matching
 * every other use of GOOGLE_BOT_HEADERS, which is gated on the same flag.
 *
 * @param {{ request: { url: string }, page: object }} context - the request
 *   to load and the Puppeteer page to load it in.
 * @returns {Promise<*>} whatever `page.goto` resolves to.
 */
const gotoFunction = async ({ request, page }) => {
    if (useGoogleBotHeaders) {
        await page.setExtraHTTPHeaders({
            Referer: GOOGLE_BOT_HEADERS.Referer,
            'X-Forwarded-For': GOOGLE_BOT_HEADERS['X-Forwarded-For'],
        });
    }
    // Navigation is capped at 60 seconds.
    return page.goto(request.url, { timeout: 60000 });
};
138
154
139
155
const basicOptions = {
140
156
maxRequestRetries : 0 ,
@@ -151,13 +167,14 @@ Apify.main(async () => {
151
167
stealth : true ,
152
168
headless : headfull ? undefined : true ,
153
169
useChrome,
170
+ userAgent : useGoogleBotHeaders ? GOOGLE_BOT_HEADERS [ 'User-Agent' ] : Apify . utils . getRandomUserAgent ( ) ,
154
171
} ;
155
172
156
173
// Pool settings built from the `retireInstanceAfterRequestCount` input value;
// handed to the PuppeteerCrawler constructor below as `puppeteerPoolOptions`.
const puppeteerPoolOptions = { retireInstanceAfterRequestCount } ;
157
174
158
175
// Chooses the crawler implementation from the `type` input: 'cheerio' gets a
// CheerioCrawler (with the proxy URL wrapped in a list, or null when there is
// none); anything else gets a PuppeteerCrawler with the launch options, pool
// options and custom gotoFunction.
const createCrawler = () => {
    if (type === 'cheerio') {
        const proxyUrls = proxyUrl ? [proxyUrl] : null;
        return new Apify.CheerioCrawler({ ...basicOptions, proxyUrls });
    }
    return new Apify.PuppeteerCrawler({
        ...basicOptions,
        launchPuppeteerOptions,
        puppeteerPoolOptions,
        gotoFunction,
    });
};
const crawler = createCrawler();
161
178
162
179
await crawler . run ( ) ;
163
180
0 commit comments