66from multiprocessing .dummy import Pool
77from pathlib import Path
88from typing import Any , List , Union
9+ import os
910
1011import boto3 # type: ignore
1112import botocore # type: ignore
1920 S3_URL ,
2021)
2122
# Matches upper-case identifiers that look like credentials (e.g. AWS_SECRET_KEY,
# DB_PASSWORD, API_TOKEN). The look-arounds whitelist known harmless occurrences:
# names prefixed with WRONG_, a following "%", the "=clickhouse" / "=minio"
# values, and values that are already masked ("***" or '[HIDDEN]').
sensitive_var_pattern = re.compile(
    r"\b[A-Z_]*(?<!WRONG_)(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!%)(?!=clickhouse$)(?!=minio)(?!: \*{3}$)(?! '\[HIDDEN\]')"
)

# Environment variables whose NAME looks sensitive, mapped to their values.
# Empty values are dropped: an empty string is a substring of every line, so
# keeping it would flag every scanned file as leaking a secret.
sensitive_strings = {
    var: value
    for var, value in os.environ.items()
    if value and sensitive_var_pattern.match(var)
}


def scan_file_for_sensitive_data(file_content, file_name):
    """
    Scan the content of a file for sensitive strings.

    A line is reported when it contains an identifier matched by
    ``sensitive_var_pattern`` or the value of any entry in
    ``sensitive_strings``. Each offending line is logged once, with every
    known secret value replaced by ``SECRET[<variable name>]``.

    Args:
        file_content: Full text of the file to inspect.
        file_name: Name used in log messages and in the raised error.

    Returns:
        None when nothing sensitive is found.

    Raises:
        ValueError: if any sensitive values are found.
    """
    # Ignore empty values defensively (the mapping may have been modified after
    # import) and mask the longest secrets first, so a secret that contains
    # another one is replaced as a whole instead of leaking its tail.
    secrets = sorted(
        ((name, value) for name, value in sensitive_strings.items() if value),
        key=lambda item: len(item[1]),
        reverse=True,
    )

    def clean_line(line):
        for name, value in secrets:
            line = line.replace(value, f"SECRET[{name}]")
        return line

    matches = []
    for line_number, line in enumerate(file_content.splitlines(), start=1):
        has_sensitive_name = sensitive_var_pattern.search(line) is not None
        has_sensitive_value = any(value in line for _, value in secrets)
        # Report each offending line exactly once, however many hits it has.
        if has_sensitive_name or has_sensitive_value:
            matches.append((line_number, clean_line(line)))

    if not matches:
        return

    logging.error("Sensitive values found in %s", file_name)
    for line_number, cleaned in matches:
        logging.error("%s:%s: %s", file_name, line_number, cleaned)

    raise ValueError(f"Sensitive values found in {file_name}")
58+
2259
2360def _flatten_list (lst ):
2461 result = []
@@ -45,6 +82,14 @@ def __init__(self, client: Any = None, endpoint: str = S3_URL):
4582 def _upload_file_to_s3 (
4683 self , bucket_name : str , file_path : Path , s3_path : str
4784 ) -> str :
85+ logging .debug ("Checking %s for sensitive values" , file_path )
86+ try :
87+ file_content = file_path .read_text (encoding = "utf-8" )
88+ except UnicodeDecodeError :
89+ logging .warning ("Failed to read file %s, unknown encoding" , file_path )
90+ else :
91+ scan_file_for_sensitive_data (file_content , file_path .name )
92+
4893 logging .debug (
4994 "Start uploading %s to bucket=%s path=%s" , file_path , bucket_name , s3_path
5095 )
0 commit comments