1 | | -from __future__ import annotations |
| 1 | +"""from __future__ import annotations |
2 | 2 |
3 | 3 | import asyncio |
4 | 4 | import os |
29 | 29 | MAX_FILE_SIZE_MB = max(int(MAX_FILE_SIZE_MB), 1) |
30 | 30 | else: |
31 | 31 | MAX_FILE_SIZE_MB = None |
| 32 | +""" |
| 33 | +import csv |
| 34 | +import json |
| 35 | +import os |
| 36 | +import shutil |
| 37 | +from concurrent.futures import ThreadPoolExecutor, as_completed |
| 38 | + |
| 39 | +import torch |
| 40 | +from sentence_transformers import SentenceTransformer |
| 41 | +from torch import nn |
| 42 | + |
| 43 | +# ================== GLOBAL SETTINGS ================== |
| 44 | +# Paths |
| 45 | +ROOT_DIR = r"C:\Users\Hp\Desktop\Shahm" # Folder to scan |
| 46 | +BACKUP_DIR = r"C:\Users\Hp\Desktop\VulnScan_Files" # Backup folder |
| 47 | +MODEL_PATH = r"vulnscan/Model_SenseMacro.4n1.pth" # Your trained model checkpoint |
| 48 | + |
| 49 | +# File scan settings |
| 50 | +TEXT_EXTENSIONS = {".txt", ".log", ".csv", ".json", ".xml", ".html", ".md", ".cfg", ".ini", ".yml", ".yaml"} |
| 51 | +MAX_TEXT_LENGTH = 1_000_000 # Max characters per file to scan |
| 52 | + |
| 53 | +# Threading |
| 54 | +NUM_WORKERS = 8 # Number of parallel threads for scanning |
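| | +# Note: all worker threads share one embedding model and one classifier; on |
| | +# a GPU the forward passes serialize on the device, so extra workers mainly |
| | +# overlap file I/O with inference. |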
| 55 | + |
| 56 | +# Classification threshold |
| 57 | +SENSITIVE_THRESHOLD = 0.5 # Probability cutoff to consider a file sensitive |
| 58 | + |
| 59 | +# Reports |
| 60 | +REPORT_JSON = os.path.join(os.getcwd(), "report.json") |
| 61 | +REPORT_CSV = os.path.join(os.getcwd(), "report.csv") |
32 | 62 |
| 63 | +# ================== DEVICE SETUP ================== |
| 64 | +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
| 65 | +print(f"Using device: {DEVICE}") |
33 | 66 |
34 | | -class _SensitiveDataScanner: |
35 | | - def __init__(self, model_path: str, vectorizer_path: str): |
36 | | - self.model_path = model_path |
37 | | - self.vectorizer_path = vectorizer_path |
38 | | - self.model_cache = {} |
39 | | - self.vectorizer_cache = {} |
40 | | - self.model_lock = threading.Lock() |
41 | | - self.vectorizer_lock = threading.Lock() |
42 | | - self.model = None |
43 | | - self.vectorizer = None |
44 | | - self._load_model() |
45 | | - self._load_vectorizer() |
46 | | - |
47 | | - def _load_model(self) -> None: |
48 | | - with self.model_lock: |
49 | | - if self.model_path in self.model_cache: |
50 | | - self.model = self.model_cache[self.model_path] |
51 | | - return |
52 | | - |
53 | | - if self.model_path.endswith(".pkl"): |
54 | | - self.model = joblib.load(self.model_path) |
55 | | - elif self.model_path.endswith(".safetensors"): |
56 | | - self.model = safe_open(self.model_path, framework="torch") |
57 | | - elif self.model_path.endswith(".pth"): |
58 | | - with warnings.catch_warnings(): |
59 | | - warnings.filterwarnings("ignore", category=FutureWarning) |
60 | | - self.model = torch.load( |
61 | | - self.model_path, |
62 | | - map_location=torch.device( |
63 | | - "cuda" if torch.cuda.is_available() else "cpu" |
64 | | - ), |
65 | | - weights_only=False, |
66 | | - ) |
67 | | - if not torch.cuda.is_available() and torch.version.cuda: |
68 | | - log.warning( |
69 | | - "NVIDIA GPU detected but CUDA is not available. Check your PyTorch and CUDA installation to utilise as much power as possible." |
70 | | - ) |
71 | | - log.debug( |
72 | | - f"Model using device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}" |
73 | | - ) |
74 | | - else: |
75 | | - raise ValueError("Unsupported model file format") |
76 | | - |
77 | | - self.model_cache[self.model_path] = self.model |
78 | | - |
79 | | - def _load_vectorizer(self) -> None: |
80 | | - with self.vectorizer_lock: |
81 | | - if self.vectorizer_path in self.vectorizer_cache: |
82 | | - self.vectorizer = self.vectorizer_cache[self.vectorizer_path] |
83 | | - return |
84 | | - |
85 | | - try: |
86 | | - self.vectorizer = joblib.load(self.vectorizer_path) |
87 | | - except Exception as e: |
88 | | - log.critical(f"Failed to load vectorizer: {e}") |
89 | | - exit(1) |
90 | | - |
91 | | - self.vectorizer_cache[self.vectorizer_path] = self.vectorizer |
92 | | - |
93 | | - def _extract_features(self, content: str): |
94 | | - return self.vectorizer.transform([content]) |
95 | | - |
96 | | - def _is_sensitive(self, content: str) -> tuple[bool, float, str]: |
97 | | - features = self._extract_features(content) |
98 | | - if isinstance(self.model, torch.nn.Module): |
99 | | - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
100 | | - self.model.to(device) |
101 | | - self.model.eval() |
102 | | - indices = torch.LongTensor(np.vstack(features.nonzero())) |
103 | | - values = torch.FloatTensor(features.data) |
104 | | - tensor = torch.sparse_coo_tensor(indices, values, size=features.shape).to( |
105 | | - device |
106 | | - ) |
107 | 67 |
| 68 | +# ================== MODEL DEFINITION ================== |
| 69 | +class SimpleNN(nn.Module): |
| 70 | + def __init__(self, input_dim): |
| 71 | + super().__init__() |
| 72 | + self.fc = nn.Sequential( |
| 73 | + nn.Linear(in_features=input_dim, out_features=256), |
| 74 | + nn.ReLU(), |
| 75 | + nn.Linear(in_features=256, out_features=64), |
| 76 | + nn.ReLU(), |
| 77 | + nn.Linear(in_features=64, out_features=1), |
| 78 | + ) |
| 79 | + |
| 80 | + def forward(self, x): |
| 81 | + return self.fc(x) |
| 82 | + |
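| | +# The head maps a 384-dim sentence embedding to one raw logit; the sigmoid |
| | +# is applied at inference time in process_file, so the checkpoint is assumed |
| | +# to come from training with a logit-based loss such as BCEWithLogitsLoss. |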
| 83 | + |
| 84 | +# ================== LOAD MODELS ================== |
| 85 | +# Load classifier |
| 86 | +checkpoint = torch.load(MODEL_PATH, map_location=DEVICE) |
| 87 | +model = SimpleNN(input_dim=384) # 384 matches the all-MiniLM-L6-v2 embedding size |
| 88 | +model.load_state_dict(checkpoint["model_state_dict"]) |
| 89 | +model.to(DEVICE) |
| 90 | +model.eval() |
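| | +# Assumes the checkpoint was saved as {"model_state_dict": ...}; if the |
| | +# training script saved the state_dict directly, load it with |
| | +# model.load_state_dict(checkpoint) instead. |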
| 91 | + |
| 92 | +# Load embedding model |
| 93 | +embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=DEVICE) |
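| | +# Downloaded from the Hugging Face hub on first run, then cached locally. |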
| 94 | + |
| 95 | +# Make backup folder |
| 96 | +os.makedirs(BACKUP_DIR, exist_ok=True) |
| 97 | + |
| 98 | + |
| 99 | +# ================== FILE PROCESSING ================== |
| 100 | +def process_file(filepath): |
| 101 | + try: |
| 102 | + _, ext = os.path.splitext(filepath) |
| 103 | + if ext.lower() not in TEXT_EXTENSIONS: |
| 104 | + return None |
| 105 | + |
| 106 | + with open(filepath, "r", encoding="utf-8", errors="ignore") as f: |
| 107 | + content = f.read() |
| 108 | + if not content.strip(): |
| 109 | + return None |
| 110 | + |
| 111 | + # Limit file length |
| 112 | + content = content[:MAX_TEXT_LENGTH] |
| 113 | + |
| 114 | + # Split into non-empty lines |
| 115 | + lines = [line for line in content.splitlines() if line.strip()] |
| 116 | + if not lines: |
| 117 | + return None |
| 118 | + |
| 119 | + # Embed all lines |
| 120 | + embeddings = embed_model.encode(lines, convert_to_tensor=True, device=DEVICE) |
| 121 | + |
| 122 | + # Predict per line |
| 123 | + probs = [] |
| 124 | + for emb in embeddings: |
108 | 125 | with torch.no_grad(): |
109 | | - pred = self.model(tensor) |
110 | | - prob = torch.softmax(pred, dim=1).max().item() |
111 | | - reason = ", ".join( |
112 | | - self.vectorizer.get_feature_names_out()[i] |
113 | | - for i in np.argsort(features.data)[-5:] |
114 | | - ) |
115 | | - return pred.argmax(dim=1).item() == 1, prob, reason |
116 | | - else: |
117 | | - probs = self.model.predict_proba(features) |
118 | | - top_indices = np.argsort(features.toarray()[0])[-5:] |
119 | | - reason = ", ".join( |
120 | | - self.vectorizer.get_feature_names_out()[i] for i in top_indices |
121 | | - ) |
122 | | - return self.model.predict(features)[0] == 1, probs.max(), reason |
123 | | - |
124 | | - async def scan_file_async(self, file_path: str) -> tuple[bool, float, str]: |
125 | | - try: |
126 | | - async with aiofiles.open( |
127 | | - file_path, "r", encoding="utf-8", errors="ignore" |
128 | | - ) as f: |
129 | | - content = await f.read() |
130 | | - return self._is_sensitive(content) |
131 | | - except Exception as e: |
132 | | - log.error(f"Failed to scan {file_path}: {e}") |
133 | | - return False, 0.0, "Error" |
134 | | - |
135 | | - def cleanup(self): |
136 | | - self.model_cache.clear() |
137 | | - self.vectorizer_cache.clear() |
138 | | - self.model = None |
139 | | - self.vectorizer = None |
140 | | - log.info("Cleanup complete.") |
141 | | - |
142 | | - |
143 | | -class VulnScan: |
144 | | - def __init__(self, model_path: str, vectorizer_path: str): |
145 | | - self.scanner = _SensitiveDataScanner(model_path, vectorizer_path) |
146 | | - |
147 | | - @log.function |
148 | | - def scan_directory(self, scan_paths: list[str]) -> None: |
149 | | - log.info("Collecting files...") |
150 | | - all_files = [] |
151 | | - |
152 | | - for path in scan_paths: |
153 | | - try: |
154 | | - all_files.extend(str(f) for f in Path(path).rglob("*") if f.is_file()) |
155 | | - log.debug(f"Found {len(all_files)} files in {path}") |
156 | | - except Exception as e: |
157 | | - log.warning(f"Skipping path {path} due to error: {e}") |
158 | | - |
159 | | - log.info(f"Collected {len(all_files)} files.") |
160 | | - |
161 | | - loop = asyncio.get_event_loop() |
162 | | - loop.run_until_complete(self._async_scan(all_files)) |
163 | | - |
164 | | - async def _async_scan(self, files: list[str]) -> None: |
165 | | - valid_files = [] |
166 | | - |
167 | | - for file in files: |
168 | | - try: |
169 | | - file_size_mb = os.path.getsize(file) / (1024 * 1024) |
170 | | - if MAX_FILE_SIZE_MB and file_size_mb > MAX_FILE_SIZE_MB: |
171 | | - continue |
172 | | - if any(file.lower().endswith(ext) for ext in UNREADABLE_EXTENSIONS): |
173 | | - continue |
174 | | - valid_files.append(file) |
175 | | - except Exception as e: |
176 | | - log.debug(f"Skipping file {file}: {e}") |
177 | | - |
178 | | - log.info(f"Valid files to scan: {len(valid_files)}") |
179 | | - |
180 | | - semaphore = asyncio.Semaphore(max_workers) |
181 | | - sensitive_files = [] |
182 | | - |
183 | | - async def scan_worker(scan_file): |
184 | | - async with semaphore: |
185 | | - result, prob, reason = await self.scanner.scan_file_async(scan_file) |
186 | | - if result: |
187 | | - log.debug( |
188 | | - f"SENSITIVE: {scan_file} | Confidence: {prob:.2f} | Reason: {reason}" |
189 | | - ) |
190 | | - sensitive_files.append(scan_file) |
191 | | - |
192 | | - tasks = [scan_worker(f) for f in valid_files] |
193 | | - |
194 | | - with tqdm( |
195 | | - total=len(valid_files), |
196 | | - desc="\033[32mSCAN\033[0m \033[94mScanning Files\033[0m", |
197 | | - unit="file", |
198 | | - bar_format="{l_bar} {bar} {n_fmt}/{total_fmt}\n", |
199 | | - ) as pbar: |
200 | | - for f in asyncio.as_completed(tasks): |
201 | | - await f |
202 | | - pbar.update(1) |
203 | | - |
204 | | - with open("Sensitive_File_Paths.txt", "a") as out: |
205 | | - out.write( |
206 | | - "\n".join(sensitive_files) + "\n" |
207 | | - if sensitive_files |
208 | | - else "No sensitive files detected.\n" |
209 | | - ) |
210 | | - |
211 | | - self.scanner.cleanup() |
| 126 | + output = model(emb.unsqueeze(0)) |
| 127 | + probs.append(torch.sigmoid(output).item()) |
| 128 | + |
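| | + # The loop above could also be done in a single batched call, e.g. |
| | + # probs = torch.sigmoid(model(embeddings)).squeeze(1).tolist(), avoiding |
| | + # one forward pass per line; the explicit loop is kept for readability. |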
| 129 | + max_prob = max(probs) |
| 130 | + if max_prob < SENSITIVE_THRESHOLD: |
| 131 | + return None |
| 132 | + |
| 133 | + # Keep the five highest-scoring lines as the reported evidence |
| 134 | + top_lines = [lines[i] for i, _ in sorted(enumerate(probs), key=lambda x: x[1], reverse=True)[:5]] |
| 135 | + |
| 136 | + # Backup file |
| 137 | + rel_path = os.path.relpath(filepath, ROOT_DIR) |
| 138 | + backup_path = os.path.join(BACKUP_DIR, rel_path) |
| 139 | + os.makedirs(os.path.dirname(backup_path), exist_ok=True) |
| 140 | + shutil.copy2(filepath, backup_path) |
212 | 141 |
| 142 | + return { |
| 143 | + "file": filepath, |
| 144 | + "probability": max_prob, |
| 145 | + "copied_to": backup_path, |
| 146 | + "reason": top_lines |
| 147 | + } |
213 | 148 |
| 149 | + except Exception as e: |
| 150 | + print(f"[ERROR] Could not process {filepath}: {e}") |
| 151 | + return None |
| 152 | + |
| 153 | + |
| 154 | +# ================== DIRECTORY SCAN ================== |
| 155 | +def scan_directory(root): |
| 156 | + sensitive_files = [] |
| 157 | + with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor: |
| 158 | + futures = [] |
| 159 | + for dirpath, _, filenames in os.walk(root): |
| 160 | + for file in filenames: |
| 161 | + futures.append(executor.submit(process_file, os.path.join(dirpath, file))) |
| 162 | + |
| 163 | + for future in as_completed(futures): |
| 164 | + result = future.result() |
| 165 | + if result: |
| 166 | + sensitive_files.append(result) |
| 167 | + |
| 168 | + return sensitive_files |
| 169 | + |
| 170 | + |
| 171 | +# ================== MAIN ================== |
214 | 172 | if __name__ == "__main__": |
215 | | - try: |
216 | | - base_paths = [ |
217 | | - "C:\\Users\\", |
218 | | - "C:\\Windows\\Logs", |
219 | | - "C:\\Program Files", |
220 | | - "C:\\Program Files (x86)", |
221 | | - ] |
222 | | - vulnscan = VulnScan("vulnscan/SenseMini.3n3.pth", "vulnscan/vectorizer.3n3.pkl") |
223 | | - vulnscan.scan_directory(base_paths) |
224 | | - except KeyboardInterrupt: |
225 | | - log.warning("User interrupted. Exiting gracefully.") |
226 | | - exit(0) |
| 173 | + print(f"Scanning directory: {ROOT_DIR}") |
| 174 | + sensitive = scan_directory(ROOT_DIR) |
| 175 | + |
| 176 | + # Save JSON report |
| 177 | + with open(REPORT_JSON, "w", encoding="utf-8") as f: |
| 178 | + json.dump(sensitive, f, indent=2, ensure_ascii=False) |
| 179 | + |
| 180 | + # Save CSV report |
| 181 | + with open(REPORT_CSV, "w", newline="", encoding="utf-8") as f: |
| 182 | + writer = csv.DictWriter(f, fieldnames=["file", "probability", "copied_to", "reason"]) |
| 183 | + writer.writeheader() |
| 184 | + for entry in sensitive: |
| 185 | + # Join top lines as single string for CSV |
| 186 | + entry_csv = entry.copy() |
| 187 | + entry_csv["reason"] = " | ".join(entry["reason"]) |
| 188 | + writer.writerow(entry_csv) |
| 189 | + |
| 190 | + print("\nSensitive files detected and backed up:") |
| 191 | + for entry in sensitive: |
| 192 | + print(f" - {entry['file']} (prob={entry['probability']:.4f})") |
| 193 | + for line in entry["reason"]: |
| 194 | + print(f" -> {line}") |
| 195 | + |
| 196 | + print(f"\nBackup completed.\nFiles copied into: {BACKUP_DIR}") |
| 197 | + print(f"Reports saved as:\n - {REPORT_JSON}\n - {REPORT_CSV}") |