@@ -5,15 +5,17 @@
 
 # useful for handling different item types with a single interface
 
+import logging
 import random
 import time
 
 import scrapy
 import scrapy.http.response.html
 from scrapy import signals
 from scrapy.exceptions import IgnoreRequest
-from v2ex_scrapy.DB import DB, LogItem
+
 from v2ex_scrapy import utils
+from v2ex_scrapy.DB import DB, LogItem
 
 
 class TutorialScrapySpiderMiddleware:
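The first hunk adds a module-level import logging (and regroups the first-party imports) so the middleware can own a namespaced logger, used in the next hunk. As a quick illustration of the idiom, not part of the diff: Scrapy routes the standard library's logging, so a logger obtained this way writes into the normal crawl log and honors the LOG_LEVEL setting.

import logging

logger = logging.getLogger(__name__)  # e.g. "v2ex_scrapy.middlewares"
logger.info("this message appears in the regular Scrapy crawl log")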
@@ -70,6 +72,7 @@ class ProxyAndCookieDownloaderMiddleware:
     def __init__(self):
         self.proxies: list[str] = []
         self.cookies: dict[str, str] = {}
+        self.logger = logging.getLogger(__name__)
 
     @classmethod
     def from_crawler(cls, crawler):
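The body of from_crawler is elided above. Given the signals import and the spider_opened handler further down, it presumably follows the stock Scrapy middleware template; a sketch of that template, for context only:

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy pattern: build the middleware and subscribe
        # spider_opened so settings can be read once the spider starts.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s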
@@ -102,6 +105,7 @@ def process_response(
     ):
         # Called with the response returned from the downloader.
         if response.status == 403:
+            self.logger.info(f"skip url:{response.url}, because 403")
             raise IgnoreRequest(f"403 url {response.url}")
         # Must either;
         # - return a Response object
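Raising IgnoreRequest from process_response makes Scrapy abandon the request (invoking its errback, if any) instead of handing the response to the spider, so 403 pages are now both logged and dropped. A hypothetical harness, not part of the commit, that exercises this branch directly; the module path v2ex_scrapy.middlewares is assumed:

from scrapy.exceptions import IgnoreRequest
from scrapy.http import HtmlResponse, Request

from v2ex_scrapy.middlewares import ProxyAndCookieDownloaderMiddleware

mw = ProxyAndCookieDownloaderMiddleware()
req = Request("https://www.v2ex.com/t/1")
resp = HtmlResponse(url=req.url, status=403, request=req)

try:
    mw.process_response(req, resp, spider=None)  # spider unused on this path
except IgnoreRequest as exc:
    print("dropped:", exc)  # dropped: 403 url https://www.v2ex.com/t/1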
@@ -123,7 +127,7 @@ def spider_opened(self, spider: scrapy.Spider):
         self.proxies = spider.settings.get("PROXIES", [])  # type: ignore
 
         cookie_str = spider.settings.get("COOKIES", "")
-        self.cookies = utils.cookie_str2cookie_dict(cookie_str) # type: ignore
+        self.cookies = utils.cookie_str2cookie_dict(cookie_str)  # type: ignore
 
         spider.logger.info("Spider opened: %s" % spider.name)
 
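The last hunk only adjusts the spacing of the utils.cookie_str2cookie_dict call. Its implementation is not shown in this diff; judging by the name and the COOKIES setting it reads, it likely parses a raw Cookie header string into a dict, roughly like this illustrative sketch (the real helper in v2ex_scrapy/utils.py may differ):

def cookie_str2cookie_dict(cookie_str: str) -> dict[str, str]:
    # "a=1; b=2" -> {"a": "1", "b": "2"}; skips empty segments and splits
    # on the first "=" so values containing "=" survive intact.
    return dict(
        part.strip().split("=", 1)
        for part in cookie_str.split(";")
        if "=" in part
    )

print(cookie_str2cookie_dict("A2=abc; PB3_SESSION=xyz"))
# -> {'A2': 'abc', 'PB3_SESSION': 'xyz'}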