@@ -33,25 +33,23 @@ class ApifyCacheStorage:
3333
def __init__(self, settings: BaseSettings) -> None:
    """Set up the cache storage from the crawler settings.

    Args:
        settings: Settings object; the integer ``HTTPCACHE_EXPIRATION_SECS`` is read from it.
    """
    # Upper bound on how many stored items a single cleanup pass in `close_spider` goes through.
    self.expiration_max_items = 100
    # The key must match the setting name exactly; a padded key would never be found.
    self.expiration_secs: int = settings.getint('HTTPCACHE_EXPIRATION_SECS')
    # Not available yet; `open_spider` is expected to fill these in.
    self.spider: Spider | None = None
    self._kv: KeyValueStore | None = None
    self._fingerprinter: RequestFingerprinterProtocol | None = None
    self._async_thread: AsyncThread | None = None
4141
4242 def open_spider (self , spider : Spider ) -> None :
43- logger .debug (" Using Apify key value cache storage" , extra = {" spider" : spider })
43+ logger .debug (' Using Apify key value cache storage' , extra = {' spider' : spider })
4444 self .spider = spider
4545 self ._fingerprinter = spider .crawler .request_fingerprinter
46- kv_name = f" httpcache-{ spider .name } "
46+ kv_name = f' httpcache-{ spider .name } '
4747
4848 async def open_kv () -> KeyValueStore :
4949 config = Configuration .get_global_configuration ()
5050 if config .is_at_home :
5151 storage_client = ApifyStorageClient .from_config (config )
52- return await KeyValueStore .open (
53- name = kv_name , storage_client = storage_client
54- )
52+ return await KeyValueStore .open (name = kv_name , storage_client = storage_client )
5553 return await KeyValueStore .open (name = kv_name )
5654
5755 logger .debug ("Starting background thread for cache storage's event loop" )
@@ -60,88 +58,84 @@ async def open_kv() -> KeyValueStore:
6058 self ._kv = self ._async_thread .run_coro (open_kv ())
6159
def close_spider(self, spider: Spider, current_time: int | None = None) -> None:
    """Expire stale cache items, then shut down the background async thread.

    At most `self.expiration_max_items` items are examined per call. The async thread is closed
    even when the expiration pass raises; the exception still propagates afterwards.

    Args:
        spider: The spider being closed (not used by this method).
        current_time: Unix timestamp that item age is measured against; defaults to `int(time())`.
    """
    assert self._async_thread is not None, 'Async thread not initialized'

    try:
        logger.info(f'Cleaning up cache items (max {self.expiration_max_items})')
        if 0 < self.expiration_secs:
            if current_time is None:
                current_time = int(time())

            async def expire_kv() -> None:
                assert self._kv is not None, 'Key value store not initialized'
                # Count down instead of comparing before the increment, so exactly
                # `expiration_max_items` items are examined (not one more).
                remaining = self.expiration_max_items
                if remaining <= 0:
                    return
                async for item in self._kv.iterate_keys():
                    value = await self._kv.get_value(item.key)
                    try:
                        gzip_time = read_gzip_time(value)
                    except Exception as e:
                        # Unreadable header: overwrite the entry with None
                        # (presumably this removes the record - confirm against KeyValueStore docs).
                        logger.warning(f'Malformed cache item {item.key}: {e}')
                        await self._kv.set_value(item.key, None)
                    else:
                        if self.expiration_secs < current_time - gzip_time:
                            logger.debug(f'Expired cache item {item.key}')
                            await self._kv.set_value(item.key, None)
                        else:
                            logger.debug(f'Valid cache item {item.key}')
                    remaining -= 1
                    if remaining == 0:
                        break

            self._async_thread.run_coro(expire_kv())
    finally:
        # Always attempt to close the thread, even if the cleanup above failed.
        logger.debug('Closing cache storage')
        try:
            self._async_thread.close()
        except KeyboardInterrupt:
            logger.warning('Shutdown interrupted by KeyboardInterrupt!')
        except Exception:
            logger.exception('Exception occurred while shutting down cache storage')
        finally:
            logger.debug('Cache storage closed')
10199
def retrieve_response(self, spider: Spider, request: Request, current_time: int | None = None) -> Response | None:
    """Return the cached response for `request`, or None when there is no usable entry.

    None is returned on a cache miss, on an expired entry, and on an entry that cannot be
    parsed (logged as a warning, matching how `close_spider` treats malformed items).

    Args:
        spider: The spider issuing the request (not used by this method).
        request: Request whose fingerprint is the cache key.
        current_time: Unix timestamp that entry age is measured against; defaults to `int(time())`.
    """
    assert self._async_thread is not None, 'Async thread not initialized'
    assert self._kv is not None, 'Key value store not initialized'
    assert self._fingerprinter is not None, 'Request fingerprinter not initialized'

    key = self._fingerprinter.fingerprint(request).hex()
    value = self._async_thread.run_coro(self._kv.get_value(key))

    if value is None:
        logger.debug('Cache miss', extra={'request': request})
        return None

    if current_time is None:
        current_time = int(time())

    try:
        # The chained comparison only reads the gzip header when expiration is enabled,
        # and the age check happens before any decompression.
        if 0 < self.expiration_secs < current_time - read_gzip_time(value):
            logger.debug('Cache expired', extra={'request': request})
            return None

        data = from_gzip(value)
        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
    except Exception as e:
        # A corrupted entry should not fail the request; report it as a miss instead.
        logger.warning(f'Malformed cache item {key}: {e}')
        return None

    respcls = responsetypes.from_args(headers=headers, url=url, body=body)

    logger.debug('Cache hit', extra={'request': request})
    return respcls(url=url, headers=headers, status=status, body=body)
131127
def store_response(self, spider: Spider, request: Request, response: Response) -> None:
    """Store `response` in the key-value store under the fingerprint of `request`.

    Args:
        spider: The spider issuing the request (not used by this method).
        request: Request whose fingerprint is the cache key.
        response: Response whose status, URL, headers and body are saved.
    """
    assert self._async_thread is not None, 'Async thread not initialized'
    assert self._kv is not None, 'Key value store not initialized'
    assert self._fingerprinter is not None, 'Request fingerprinter not initialized'

    key = self._fingerprinter.fingerprint(request).hex()
    # These keys are the ones `retrieve_response` reads back.
    data = {
        'status': response.status,
        'url': response.url,
        'headers': dict(response.headers),
        'body': response.body,
    }
    value = to_gzip(data)
    self._async_thread.run_coro(self._kv.set_value(key, value))
@@ -150,19 +144,19 @@ def store_response(
def to_gzip(data: dict, mtime: int | None = None) -> bytes:
    """Dump a dictionary to a gzip-compressed byte stream.

    Args:
        data: Dictionary to pickle (protocol 4) and compress.
        mtime: Timestamp handed to `gzip.GzipFile` for the gzip header; None leaves the choice to `gzip`.

    Returns:
        The gzip-compressed pickle of `data`.
    """
    with io.BytesIO() as byte_stream:
        with gzip.GzipFile(fileobj=byte_stream, mode='wb', mtime=mtime) as gzip_file:
            pickle.dump(data, gzip_file, protocol=4)
        # Read the buffer only after the GzipFile is closed, so the whole stream has been flushed.
        return byte_stream.getvalue()
156150
157151
def from_gzip(gzip_bytes: bytes) -> dict:
    """Load a dictionary from a gzip-compressed byte stream.

    Args:
        gzip_bytes: Gzip-compressed pickle, as produced by `to_gzip`.

    Returns:
        The unpickled object.
    """
    # NOTE(security): unpickling can execute arbitrary code, so this is only safe while
    # everything written to the cache store is trusted.
    with io.BytesIO(gzip_bytes) as byte_stream, gzip.GzipFile(fileobj=byte_stream, mode='rb') as gzip_file:
        return pickle.load(gzip_file)
162156
163157
def read_gzip_time(gzip_bytes: bytes) -> int:
    """Read the modification time from a gzip-compressed byte stream without decompressing the data.

    Args:
        gzip_bytes: Bytes that start with a gzip member header.

    Returns:
        The MTIME field of the header.

    Raises:
        struct.error: If fewer than 10 bytes are given.
        ValueError: If the data does not start with the gzip magic number.
    """
    # Fixed 10-byte gzip header (RFC 1952), little-endian: magic (H), compression method (B),
    # flags (B), MTIME (I), extra flags and OS (2B). `unpack_from` reads it without copying a slice.
    magic, _method, _flags, mtime, _extra_flags, _os = struct.unpack_from('<HBBI2B', gzip_bytes)
    # Bytes 0x1f 0x8b read as a little-endian unsigned short.
    if magic != 0x8B1F:
        raise ValueError('Not a gzip stream: bad magic number')
    return mtime
0 commit comments