
Commit 1e88262

Refactor vulnscan.py to enhance file processing and backup functionality; update PLANS.md for version tracking
Signed-off-by: Shahm Najeeb <[email protected]>
1 parent 2edb41f commit 1e88262

File tree

6 files changed: +163 −191 lines changed

CODE/logicytics/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -122,4 +122,5 @@ def wrapper(*args, **kwargs) -> callable:
     "ObjectLoadError",
     "log",
     "Log",
+    "config",
 ]
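
With `config` added to `__all__`, callers can pull it from the package root alongside the existing exports. A minimal usage sketch (hypothetical, assuming `config` holds the package's parsed settings):

    from logicytics import config, log

    # `config` is now part of the package's public API (__all__),
    # so a plain package-level import resolves it.
    log.debug(f"Loaded config: {config}")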

CODE/vulnscan.py

Lines changed: 160 additions & 189 deletions

@@ -1,4 +1,4 @@
-from __future__ import annotations
+"""from __future__ import annotations
 
 import asyncio
 import os
@@ -29,198 +29,169 @@
     MAX_FILE_SIZE_MB = max(int(MAX_FILE_SIZE_MB), 1)
 else:
     MAX_FILE_SIZE_MB = None
+"""
+import csv
+import json
+import os
+import shutil
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import torch
+from sentence_transformers import SentenceTransformer
+from torch import nn
+
+# ================== GLOBAL SETTINGS ==================
+# Paths
+ROOT_DIR = r"C:\Users\Hp\Desktop\Shahm" # Folder to scan
+BACKUP_DIR = r"C:\Users\Hp\Desktop\VulnScan_Files" # Backup folder
+MODEL_PATH = r"vulnscan/Model_SenseMacro.4n1.pth" # Your trained model checkpoint
+
+# File scan settings
+TEXT_EXTENSIONS = {".txt", ".log", ".csv", ".json", ".xml", ".html", ".md", ".cfg", ".ini", ".yml", ".yaml"}
+MAX_TEXT_LENGTH = 1000000 # Max characters per file to scan
+
+# Threading
+NUM_WORKERS = 8 # Number of parallel threads for scanning
+
+# Classification threshold
+SENSITIVE_THRESHOLD = 0.5 # Probability cutoff to consider a file sensitive
+
+# Reports
+REPORT_JSON = os.path.join(os.getcwd(), "report.json")
+REPORT_CSV = os.path.join(os.getcwd(), "report.csv")
 
+# ================== DEVICE SETUP ==================
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {DEVICE}")
 
-class _SensitiveDataScanner:
-    def __init__(self, model_path: str, vectorizer_path: str):
-        self.model_path = model_path
-        self.vectorizer_path = vectorizer_path
-        self.model_cache = {}
-        self.vectorizer_cache = {}
-        self.model_lock = threading.Lock()
-        self.vectorizer_lock = threading.Lock()
-        self.model = None
-        self.vectorizer = None
-        self._load_model()
-        self._load_vectorizer()
-
-    def _load_model(self) -> None:
-        with self.model_lock:
-            if self.model_path in self.model_cache:
-                self.model = self.model_cache[self.model_path]
-                return
-
-            if self.model_path.endswith(".pkl"):
-                self.model = joblib.load(self.model_path)
-            elif self.model_path.endswith(".safetensors"):
-                self.model = safe_open(self.model_path, framework="torch")
-            elif self.model_path.endswith(".pth"):
-                with warnings.catch_warnings():
-                    warnings.filterwarnings("ignore", category=FutureWarning)
-                    self.model = torch.load(
-                        self.model_path,
-                        map_location=torch.device(
-                            "cuda" if torch.cuda.is_available() else "cpu"
-                        ),
-                        weights_only=False,
-                    )
-                if not torch.cuda.is_available() and torch.version.cuda:
-                    log.warning(
-                        "NVIDIA GPU detected but CUDA is not available. Check your PyTorch and CUDA installation to utilise as much power as possible."
-                    )
-                log.debug(
-                    f"Model using device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}"
-                )
-            else:
-                raise ValueError("Unsupported model file format")
-
-            self.model_cache[self.model_path] = self.model
-
-    def _load_vectorizer(self) -> None:
-        with self.vectorizer_lock:
-            if self.vectorizer_path in self.vectorizer_cache:
-                self.vectorizer = self.vectorizer_cache[self.vectorizer_path]
-                return
-
-            try:
-                self.vectorizer = joblib.load(self.vectorizer_path)
-            except Exception as e:
-                log.critical(f"Failed to load vectorizer: {e}")
-                exit(1)
-
-            self.vectorizer_cache[self.vectorizer_path] = self.vectorizer
-
-    def _extract_features(self, content: str):
-        return self.vectorizer.transform([content])
-
-    def _is_sensitive(self, content: str) -> tuple[bool, float, str]:
-        features = self._extract_features(content)
-        if isinstance(self.model, torch.nn.Module):
-            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-            self.model.to(device)
-            self.model.eval()
-            indices = torch.LongTensor(np.vstack(features.nonzero()))
-            values = torch.FloatTensor(features.data)
-            tensor = torch.sparse_coo_tensor(indices, values, size=features.shape).to(
-                device
-            )
 
+# ================== MODEL DEFINITION ==================
+class SimpleNN(nn.Module):
+    def __init__(self, input_dim):
+        super().__init__()
+        self.fc = nn.Sequential(
+            nn.Linear(in_features=input_dim, out_features=256),
+            nn.ReLU(),
+            nn.Linear(in_features=256, out_features=64),
+            nn.ReLU(),
+            nn.Linear(in_features=64, out_features=1),
+        )
+
+    def forward(self, x):
+        return self.fc(x)
+
+
+# ================== LOAD MODELS ==================
+# Load classifier
+checkpoint = torch.load(MODEL_PATH, map_location=DEVICE)
+model = SimpleNN(input_dim=384)
+model.load_state_dict(checkpoint["model_state_dict"])
+model.to(DEVICE)
+model.eval()
+
+# Load embedding model
+embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=DEVICE)
+
+# Make backup folder
+os.makedirs(BACKUP_DIR, exist_ok=True)
+
+
+# ================== FILE PROCESSING ==================
+def process_file(filepath):
+    try:
+        _, ext = os.path.splitext(filepath)
+        if ext.lower() not in TEXT_EXTENSIONS:
+            return None
+
+        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
+            content = f.read()
+        if not content.strip():
+            return None
+
+        # Limit file length
+        content = content[:MAX_TEXT_LENGTH]
+
+        # Split content into lines
+        lines = [line for line in content.splitlines() if line.strip()]
+        if not lines:
+            return None
+
+        # Embed all lines
+        embeddings = embed_model.encode(lines, convert_to_tensor=True, device=DEVICE)
+
+        # Predict per line
+        probs = []
+        for emb in embeddings:
             with torch.no_grad():
-                pred = self.model(tensor)
-            prob = torch.softmax(pred, dim=1).max().item()
-            reason = ", ".join(
-                self.vectorizer.get_feature_names_out()[i]
-                for i in np.argsort(features.data)[-5:]
-            )
-            return pred.argmax(dim=1).item() == 1, prob, reason
-        else:
-            probs = self.model.predict_proba(features)
-            top_indices = np.argsort(features.toarray()[0])[-5:]
-            reason = ", ".join(
-                self.vectorizer.get_feature_names_out()[i] for i in top_indices
-            )
-            return self.model.predict(features)[0] == 1, probs.max(), reason
-
-    async def scan_file_async(self, file_path: str) -> tuple[bool, float, str]:
-        try:
-            async with aiofiles.open(
-                file_path, "r", encoding="utf-8", errors="ignore"
-            ) as f:
-                content = await f.read()
-            return self._is_sensitive(content)
-        except Exception as e:
-            log.error(f"Failed to scan {file_path}: {e}")
-            return False, 0.0, "Error"
-
-    def cleanup(self):
-        self.model_cache.clear()
-        self.vectorizer_cache.clear()
-        self.model = None
-        self.vectorizer = None
-        log.info("Cleanup complete.")
-
-
-class VulnScan:
-    def __init__(self, model_path: str, vectorizer_path: str):
-        self.scanner = _SensitiveDataScanner(model_path, vectorizer_path)
-
-    @log.function
-    def scan_directory(self, scan_paths: list[str]) -> None:
-        log.info("Collecting files...")
-        all_files = []
-
-        for path in scan_paths:
-            try:
-                all_files.extend(str(f) for f in Path(path).rglob("*") if f.is_file())
-                log.debug(f"Found {len(all_files)} files in {path}")
-            except Exception as e:
-                log.warning(f"Skipping path {path} due to error: {e}")
-
-        log.info(f"Collected {len(all_files)} files.")
-
-        loop = asyncio.get_event_loop()
-        loop.run_until_complete(self._async_scan(all_files))
-
-    async def _async_scan(self, files: list[str]) -> None:
-        valid_files = []
-
-        for file in files:
-            try:
-                file_size_mb = os.path.getsize(file) / (1024 * 1024)
-                if MAX_FILE_SIZE_MB and file_size_mb > MAX_FILE_SIZE_MB:
-                    continue
-                if any(file.lower().endswith(ext) for ext in UNREADABLE_EXTENSIONS):
-                    continue
-                valid_files.append(file)
-            except Exception as e:
-                log.debug(f"Skipping file {file}: {e}")
-
-        log.info(f"Valid files to scan: {len(valid_files)}")
-
-        semaphore = asyncio.Semaphore(max_workers)
-        sensitive_files = []
-
-        async def scan_worker(scan_file):
-            async with semaphore:
-                result, prob, reason = await self.scanner.scan_file_async(scan_file)
-                if result:
-                    log.debug(
-                        f"SENSITIVE: {scan_file} | Confidence: {prob:.2f} | Reason: {reason}"
-                    )
-                    sensitive_files.append(scan_file)
-
-        tasks = [scan_worker(f) for f in valid_files]
-
-        with tqdm(
-            total=len(valid_files),
-            desc="\033[32mSCAN\033[0m \033[94mScanning Files\033[0m",
-            unit="file",
-            bar_format="{l_bar} {bar} {n_fmt}/{total_fmt}\n",
-        ) as pbar:
-            for f in asyncio.as_completed(tasks):
-                await f
-                pbar.update(1)
-
-        with open("Sensitive_File_Paths.txt", "a") as out:
-            out.write(
-                "\n".join(sensitive_files) + "\n"
-                if sensitive_files
-                else "No sensitive files detected.\n"
-            )
-
-        self.scanner.cleanup()
+                output = model(emb.unsqueeze(0))
+                probs.append(torch.sigmoid(output).item())
+
+        max_prob = max(probs)
+        if max_prob < SENSITIVE_THRESHOLD:
+            return None
+
+        # Get top 5 lines contributing most
+        top_lines = [lines[i] for i, p in sorted(enumerate(probs), key=lambda x: x[1], reverse=True)[:5]]
+
+        # Backup file
+        rel_path = os.path.relpath(filepath, ROOT_DIR)
+        backup_path = os.path.join(BACKUP_DIR, rel_path)
+        os.makedirs(os.path.dirname(backup_path), exist_ok=True)
+        shutil.copy2(filepath, backup_path)
 
+        return {
+            "file": filepath,
+            "probability": max_prob,
+            "copied_to": backup_path,
+            "reason": top_lines
+        }
 
+    except Exception as e:
+        print(f"[ERROR] Could not process {filepath}: {e}")
+        return None
+
+
+# ================== DIRECTORY SCAN ==================
+def scan_directory(root):
+    sensitive_files = []
+    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
+        futures = []
+        for dirpath, _, filenames in os.walk(root):
+            for file in filenames:
+                futures.append(executor.submit(process_file, os.path.join(dirpath, file)))
+
+        for future in as_completed(futures):
+            result = future.result()
+            if result:
+                sensitive_files.append(result)
+
+    return sensitive_files
+
+
+# ================== MAIN ==================
 if __name__ == "__main__":
-    try:
-        base_paths = [
-            "C:\\Users\\",
-            "C:\\Windows\\Logs",
-            "C:\\Program Files",
-            "C:\\Program Files (x86)",
-        ]
-        vulnscan = VulnScan("vulnscan/SenseMini.3n3.pth", "vulnscan/vectorizer.3n3.pkl")
-        vulnscan.scan_directory(base_paths)
-    except KeyboardInterrupt:
-        log.warning("User interrupted. Exiting gracefully.")
-        exit(0)
+    print(f"Scanning directory: {ROOT_DIR}")
+    sensitive = scan_directory(ROOT_DIR)
+
+    # Save JSON report
+    with open(REPORT_JSON, "w", encoding="utf-8") as f:
+        json.dump(sensitive, f, indent=2, ensure_ascii=False)
+
+    # Save CSV report
+    with open(REPORT_CSV, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=["file", "probability", "copied_to", "reason"])
+        writer.writeheader()
+        for entry in sensitive:
+            # Join top lines as single string for CSV
+            entry_csv = entry.copy()
+            entry_csv["reason"] = " | ".join(entry["reason"])
+            writer.writerow(entry_csv)
+
+    print("\nSensitive files detected and backed up:")
+    for entry in sensitive:
+        print(f" - {entry['file']} (prob={entry['probability']:.4f})")
+        for line in entry["reason"]:
+            print(f" -> {line}")
+
+    print(f"\nBackup completed.\nFiles copied into: {BACKUP_DIR}")
+    print(f"Reports saved as:\n - {REPORT_JSON}\n - {REPORT_CSV}")
CODE/vulnscan/Model_SenseMacro.4n1.pth

+1.33 MB
Binary file not shown.

CODE/vulnscan/SenseMini.3n3.pth

-4.89 MB
Binary file not shown.

CODE/vulnscan/vectorizer.3n3.pkl

-305 KB
Binary file not shown.

PLANS.md

Lines changed: 2 additions & 2 deletions

@@ -8,8 +8,8 @@
 
 | Task | Version | Might or Will be done? |
 |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|------------------------|
-| Remake VulnScan .pkl and .pth to be more accurate | v3.6.0 | |
-| Merge `sensitive data miner` with `vulnscan` to be 1 tool | v4.0.0 | |
+| Update to model 4n2 of vulnscan | v3.6.1 | |
+| Merge `sensitive data miner` with `vulnscan` to be 1 tool | v4.0.0 | |
 | Remake Logicytics End-Execution cycle, where files created must go in `temp/` directory, and zipper takes it from there only, simplifying any code logic with this as well | v4.0.0 ||
 | Replace Logger.py with Util that contains (tprint), also implement the ExceptionHandler and UpdateManager from Util | v4.0.0 ||
 | Make WIKI in the git repo, with a yaml file that updates it to the default github wiki | v4.0.0 ||
