|
31 | 31 | from dataclasses import dataclass |
32 | 32 | from pathlib import Path |
33 | 33 |
|
34 | | -import boto3 |
35 | | -import paramiko |
36 | | -from botocore.exceptions import ClientError |
37 | | -from paramiko.ssh_exception import SSHException |
38 | 34 |
|
39 | 35 | logger = logging.getLogger(__name__) |
40 | 36 |
|
@@ -192,273 +188,3 @@ def find( |
192 | 188 | return None |
193 | 189 |
|
194 | 190 |
|
195 | | -class S3LikeProvider(DownloadStore): |
196 | | - def __init__( |
197 | | - self, |
198 | | - bucket_name: str, |
199 | | - aws_userid: str, |
200 | | - aws_apikey: str, |
201 | | - other_aws_credentials: dict, |
202 | | - ): |
203 | | - self.bucket_name = bucket_name |
204 | | - self.s3_client = boto3.client( |
205 | | - "s3", |
206 | | - aws_access_key_id=aws_userid, |
207 | | - aws_secret_access_key=aws_apikey, |
208 | | - **(other_aws_credentials or {}), |
209 | | - ) |
210 | | - |
211 | | - def _get_content_path(self, sha256: str) -> str: |
212 | | - """S3 key like 59/4c/67/<sha256>/""" |
213 | | - return f"{sha256[:2]}/{sha256[2:4]}/{sha256[4:]}/" |
214 | | - |
215 | | - def list(self): |
216 | | - """List all stored downloads.""" |
217 | | - downloads = [] |
218 | | - try: |
219 | | - paginator = self.s3_client.get_paginator("list_objects_v2") |
220 | | - for page in paginator.paginate(Bucket=self.bucket_name): |
221 | | - for obj in page.get("Contents", []): |
222 | | - key = obj["Key"] |
223 | | - if key.endswith(".json"): |
224 | | - try: |
225 | | - response = self.s3_client.get_object( |
226 | | - Bucket=self.bucket_name, Key=key |
227 | | - ) |
228 | | - data = json.loads(response["Body"].read()) |
229 | | - downloads.append(Download(**data)) |
230 | | - except Exception as e: |
231 | | - logger.error(f"Error reading S3 object {key}: {e}") |
232 | | - except ClientError as e: |
233 | | - logger.error(f"Failed to list S3 objects: {e}") |
234 | | - return downloads |
235 | | - |
236 | | - def get(self, sha256_checksum: str): |
237 | | - """Retrieve a Download object for the given SHA256 hash.""" |
238 | | - prefix = self._get_content_path(sha256_checksum) |
239 | | - try: |
240 | | - # The raw "content" object sorts before origin-*.json under the same prefix, |
241 | | - # so list the whole prefix and keep only the JSON metadata keys. |
242 | | - response = self.s3_client.list_objects_v2(Bucket=self.bucket_name, Prefix=prefix) |
243 | | - keys = [ |
244 | | - o["Key"] for o in response.get("Contents", []) if o["Key"].endswith(".json") |
245 | | - ] |
246 | | - if keys: |
247 | | - obj_response = self.s3_client.get_object(Bucket=self.bucket_name, Key=keys[0]) |
248 | | - data = json.loads(obj_response["Body"].read()) |
249 | | - return Download(**data) |
250 | | - except ClientError as e: |
251 | | - logger.error(f"Failed to get S3 object for {sha256_checksum}: {e}") |
252 | | - return None |
253 | | - |
254 | | - def put(self, content: bytes, download_url: str, download_date: str, filename: str): |
255 | | - """Store the content and its metadata.""" |
256 | | - sha256 = self._compute_sha256(content) |
257 | | - content_key = self._get_content_path(sha256) + "content" |
258 | | - try: |
259 | | - self.s3_client.head_object(Bucket=self.bucket_name, Key=content_key) |
260 | | - logger.info(f"Content already exists for {sha256}") |
261 | | - except ClientError:  # NB: any error (403, throttling), not just 404, is treated as "missing" |
262 | | - try: |
263 | | - self.s3_client.put_object( |
264 | | - Bucket=self.bucket_name, |
265 | | - Key=content_key, |
266 | | - Body=content, |
267 | | - ) |
268 | | - except ClientError as e: |
269 | | - raise Exception(f"Failed to write content to S3 {content_key}: {e}") |
270 | | - |
271 | | - origin_hash = self._compute_origin_hash(filename, download_date, download_url) |
272 | | - origin_filename = f"origin-{origin_hash}.json" |
273 | | - origin_key = self._get_content_path(sha256) + origin_filename |
274 | | - |
275 | | - metadata = self._build_metadata(sha256, filename, download_date, download_url) |
276 | | - metadata_json = json.dumps(metadata, indent=2).encode("utf-8") |
277 | | - try: |
278 | | - self.s3_client.put_object( |
279 | | - Bucket=self.bucket_name, |
280 | | - Key=origin_key, |
281 | | - Body=metadata_json, |
282 | | - ) |
283 | | - except ClientError as e: |
284 | | - raise Exception(f"Failed to write metadata to S3 {origin_key}: {e}") |
285 | | - |
286 | | - return Download(**metadata) |
287 | | - |
288 | | - def find( |
289 | | - self, download_url: str = None, filename: str = None, download_date: str = None |
290 | | - ): |
291 | | - """Find a download based on metadata.""" |
292 | | - if not (download_url or filename or download_date): |
293 | | - return None |
294 | | - try: |
295 | | - paginator = self.s3_client.get_paginator("list_objects_v2") |
296 | | - for page in paginator.paginate(Bucket=self.bucket_name): |
297 | | - for obj in page.get("Contents", []): |
298 | | - key = obj["Key"] |
299 | | - if key.endswith(".json"): |
300 | | - try: |
301 | | - response = self.s3_client.get_object( |
302 | | - Bucket=self.bucket_name, Key=key |
303 | | - ) |
304 | | - data = json.loads(response["Body"].read()) |
305 | | - if ( |
306 | | - ( |
307 | | - download_url is None |
308 | | - or data.get("url") == download_url |
309 | | - ) |
310 | | - and ( |
311 | | - filename is None or data.get("filename") == filename |
312 | | - ) |
313 | | - and ( |
314 | | - download_date is None |
315 | | - or data.get("download_date") == download_date |
316 | | - ) |
317 | | - ): |
318 | | - return Download(**data) |
319 | | - except Exception as e: |
320 | | - logger.error(f"Error reading S3 object {key}: {e}") |
321 | | - except ClientError as e: |
322 | | - logger.error(f"Failed to find in S3: {e}") |
323 | | - return None |
324 | | - |
325 | | - |
326 | | -class SftpProvider(DownloadStore): |
327 | | - def __init__(self, host: str, root_path: str, ssh_credentials: dict): |
328 | | - self.host = host |
329 | | - self.root_path = Path(root_path) |
330 | | - self.ssh_credentials = ssh_credentials |
331 | | - self.ssh = paramiko.SSHClient() |
332 | | - self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())  # blindly trusts unknown host keys |
333 | | - try: |
334 | | - self.ssh.connect( |
335 | | - hostname=host, |
336 | | - username=ssh_credentials.get("username"), |
337 | | - password=ssh_credentials.get("password"), |
338 | | - ) |
339 | | - self.sftp = self.ssh.open_sftp() |
340 | | - except SSHException as e: |
341 | | - raise Exception(f"Failed to connect to SFTP server {host}: {e}") |
342 | | - |
343 | | - def _get_content_path(self, sha256: str) -> str: |
344 | | - """SFTP path like 59/4c/67/<sha256>/""" |
345 | | - return str(self.root_path / sha256[:2] / sha256[2:4] / sha256[4:]) |
346 | | - |
347 | | - def list(self): |
348 | | - """List all stored downloads.""" |
349 | | - downloads = [] |
350 | | - try: |
351 | | - for root, _, files in self._sftp_walk(self.root_path): |
352 | | - for filename in files: |
353 | | - if filename.endswith(".json"): |
354 | | - file_path = os.path.join(root, filename) |
355 | | - try: |
356 | | - with self.sftp.open(file_path, "r") as f: |
357 | | - data = json.load(f) |
358 | | - downloads.append(Download(**data)) |
359 | | - except Exception as e: |
360 | | - logger.error(f"Error reading SFTP file {file_path}: {e}") |
361 | | - except (SSHException, IOError) as e:  # paramiko raises IOError for filesystem errors |
362 | | - logger.error(f"Failed to list SFTP files: {e}") |
363 | | - return downloads |
364 | | - |
365 | | - def _sftp_walk(self, path): |
366 | | - """Recursively walk SFTP directory.""" |
367 | | - path = str(path) |
368 | | - for entry in self.sftp.listdir_attr(path): |
369 | | - full_path = os.path.join(path, entry.filename) |
370 | | - if stat.S_ISDIR(entry.st_mode): |
371 | | - yield from self._sftp_walk(full_path) |
372 | | - else: |
373 | | - yield path, [], [entry.filename] |
374 | | - |
375 | | - def get(self, sha256_checksum: str): |
376 | | - """Retrieve a Download object for the given SHA256 hash.""" |
377 | | - content_path = self._get_content_path(sha256_checksum) |
378 | | - try: |
379 | | - files = self.sftp.listdir(content_path) |
380 | | - origin_files = [ |
381 | | - f for f in files if f.startswith("origin-") and f.endswith(".json") |
382 | | - ] |
383 | | - if origin_files: |
384 | | - with self.sftp.open( |
385 | | - os.path.join(content_path, origin_files[0]), "r" |
386 | | - ) as f: |
387 | | - data = json.load(f) |
388 | | - return Download(**data) |
389 | | - except (SSHException, IOError) as e:  # a missing directory raises IOError, not SSHException |
390 | | - logger.error(f"Failed to get SFTP file for {sha256_checksum}: {e}") |
391 | | - return None |
392 | | - |
393 | | - def put(self, content: bytes, download_url: str, download_date: str, filename: str): |
394 | | - """Store the content and its metadata.""" |
395 | | - sha256 = self._compute_sha256(content) |
396 | | - content_path = self._get_content_path(sha256) |
397 | | - shard = Path(content_path) |
398 | | - for d in (shard.parent.parent, shard.parent, shard):  # sftp.mkdir has no parents=True |
399 | | - try: self.sftp.mkdir(str(d)) |
400 | | - except IOError: pass  # level already exists |
401 | | - |
402 | | - content_file = os.path.join(content_path, "content") |
403 | | - try: |
404 | | - self.sftp.stat(content_file) |
405 | | - logger.info(f"Content already exists for {sha256}") |
406 | | - except IOError:  # sftp.stat raises IOError (FileNotFoundError), not SSHException, when absent |
407 | | - try: |
408 | | - with self.sftp.open(content_file, "wb") as f: |
409 | | - f.write(content) |
410 | | - except (SSHException, IOError) as e: |
411 | | - raise Exception(f"Failed to write content to SFTP {content_file}: {e}") |
412 | | - |
413 | | - origin_hash = self._compute_origin_hash(filename, download_date, download_url) |
414 | | - origin_filename = f"origin-{origin_hash}.json" |
415 | | - origin_path = os.path.join(content_path, origin_filename) |
416 | | - try: |
417 | | - self.sftp.stat(origin_path) |
418 | | - raise Exception(f"Origin {origin_filename} already exists") |
419 | | - except IOError:  # origin file does not exist yet |
420 | | - metadata = self._build_metadata( |
421 | | - sha256, filename, download_date, download_url |
422 | | - ) |
423 | | - metadata_json = json.dumps(metadata, indent=2).encode("utf-8") |
424 | | - try: |
425 | | - with self.sftp.open(origin_path, "wb") as f: |
426 | | - f.write(metadata_json) |
427 | | - except (SSHException, IOError) as e: |
428 | | - raise Exception(f"Failed to write metadata to SFTP {origin_path}: {e}") |
429 | | - |
430 | | - return Download(**metadata) |
431 | | - |
432 | | - def find( |
433 | | - self, download_url: str = None, filename: str = None, download_date: str = None |
434 | | - ): |
435 | | - """Find a download based on metadata.""" |
436 | | - if not (download_url or filename or download_date): |
437 | | - return None |
438 | | - try: |
439 | | - for root, _, files in self._sftp_walk(self.root_path): |
440 | | - for fname in files:  # renamed so it does not shadow the filename parameter |
441 | | - if fname.endswith(".json"): |
442 | | - file_path = os.path.join(root, fname) |
443 | | - try: |
444 | | - with self.sftp.open(file_path, "r") as f: |
445 | | - data = json.load(f) |
446 | | - if ( |
447 | | - ( |
448 | | - download_url is None |
449 | | - or data.get("url") == download_url |
450 | | - ) |
451 | | - and ( |
452 | | - filename is None or data.get("filename") == filename |
453 | | - ) |
454 | | - and ( |
455 | | - download_date is None |
456 | | - or data.get("download_date") == download_date |
457 | | - ) |
458 | | - ): |
459 | | - return Download(**data) |
460 | | - except Exception as e: |
461 | | - logger.error(f"Error reading SFTP file {file_path}: {e}") |
462 | | - except (SSHException, IOError) as e: |
463 | | - logger.error(f"Failed to find in SFTP: {e}") |
464 | | - return None |
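
For reference, both removed providers shared the same content-addressed layout: the payload lives under `<2 hex>/<2 hex>/<remaining 60 hex>/content`, keyed by its own SHA-256, with one `origin-<hash>.json` metadata file per (filename, download_date, download_url) origin. A minimal sketch of that scheme, assuming `_compute_origin_hash` simply hashes the origin triple (its actual definition sits outside this hunk):

```python
import hashlib

def content_prefix(sha256_hex: str) -> str:
    # Sharded, content-addressed prefix, e.g. "59/4c/<remaining 60 hex chars>/"
    return f"{sha256_hex[:2]}/{sha256_hex[2:4]}/{sha256_hex[4:]}/"

def origin_key(sha256_hex: str, filename: str, download_date: str, url: str) -> str:
    # Assumption: the real _compute_origin_hash is defined elsewhere in this file;
    # here we hash the origin triple to illustrate how the JSON files are keyed.
    origin_hash = hashlib.sha256(f"{filename}\n{download_date}\n{url}".encode()).hexdigest()
    return content_prefix(sha256_hex) + f"origin-{origin_hash}.json"

payload = b"example payload"
digest = hashlib.sha256(payload).hexdigest()
print(content_prefix(digest) + "content")  # where the raw bytes are stored
print(origin_key(digest, "a.bin", "2024-01-01", "https://example.com/a.bin"))
```

Because the payload is keyed by its own hash, re-downloading identical bytes from a new URL adds only another origin-*.json beside the existing content object, which is why both `put` implementations checked for the content before writing it.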