88from urllib .parse import ParseResult , urlparse
99
1010import requests
11+ from pydantic import BaseModel
12+
13+
class ExtractThresholds(BaseModel):
    """Safety limits enforced while downloading and extracting an archive.

    Declared as pydantic fields so callers can override any subset, e.g.
    ``ExtractThresholds(entries=10)``.

    NOTE: the fields must carry type annotations — without them pydantic v2
    raises ``PydanticUserError`` at class definition time, and pydantic v1
    would not treat them as validated fields at all.
    """

    # Maximum number of entries allowed in an archive (or objects under an
    # S3 prefix) before extraction is aborted.
    entries: int = 100
    # Maximum cumulative size in bytes (5 GB).
    size: int = 5_000_000_000
    # Maximum compression ratio before the archive is treated as a zip bomb.
    ratio: int = 100
1118
1219
1320class Extractor :
@@ -18,17 +25,21 @@ class Extractor:
1825 ALLOWED_SCHEMES = {"https" , "s3" , "file" }
1926 FILE_SKIP_PATTERNS = [".DS_Store" .lower (), "__MACOSX" .lower (), "/." ]
2027
21- THRESHOLD_ENTRIES = 100
22- THRESHOLD_SIZE = 5000000000
23- THRESHOLD_RATIO = 100
24-
25- def __init__ (self , source_url : ParseResult | str , work_dir : Path | str , s3_client = None ):
28+ def __init__ (
29+ self ,
30+ source_url : ParseResult | str ,
31+ work_dir : Path | str ,
32+ s3_client = None ,
33+ thresholds = ExtractThresholds (),
34+ ):
2635 self .source_url = self ._validate_url (source_url )
2736 self .work_dir = self ._validate_work_dir (work_dir )
2837 self .s3_client = self ._validate_s3_client (s3_client )
2938 self .file_total = 0
3039 self .size_total = 0
3140
41+ self .thresholds = thresholds
42+
3243 def _validate_url (self , url : ParseResult | str ) -> ParseResult :
3344 parsed_url = url if isinstance (url , ParseResult ) else urlparse (url )
3445 if parsed_url .scheme not in self .ALLOWED_SCHEMES :
@@ -204,13 +215,13 @@ def _add_to_stats_and_verify(self, size, count=1):
204215 self .file_total += count
205216
206217 ratio = size / self .size_total
207- if ratio > self .THRESHOLD_RATIO :
218+ if ratio > self .thresholds . ratio :
208219 raise AssertionError ("Encountered suspicious compression ratio in the archive" )
209220
210- if self .size_total > self .THRESHOLD_SIZE :
221+ if self .size_total > self .thresholds . size :
211222 raise AssertionError ("The archive is too big" )
212223
213- if self .file_total > self .THRESHOLD_ENTRIES :
224+ if self .file_total > self .thresholds . entries :
214225 raise AssertionError ("Too many files in the archive" )
215226
216227 def _remove_from_stats (self , size , count = 1 ):
@@ -236,9 +247,9 @@ def _download(self):
236247 for content in result ["Contents" ]:
237248 s3_content_count += 1
238249 s3_content_size += content ["Size" ]
239- if s3_content_count > self .THRESHOLD_ENTRIES :
250+ if s3_content_count > self .thresholds . entries :
240251 raise AssertionError ("Too many objects at %s" % self .source_url )
241- if s3_content_size > self .THRESHOLD_SIZE :
252+ if s3_content_size > self .thresholds . size :
242253 raise AssertionError (
243254 "Size limit exceeded while downloading from %s" % self .source_url
244255 )
0 commit comments