1- """from __future__ import annotations
2-
3- import asyncio
4- import os
5- import threading
6- import warnings
7-
8- import aiofiles
9- import joblib
10- import numpy as np
11- # noinspection PyPackageRequirements
12- import torch
13- from pathlib import Path
14- from safetensors import safe_open
15- from tqdm import tqdm
16-
17- from logicytics import log, config
18-
19- warnings.filterwarnings("ignore")
20-
21- UNREADABLE_EXTENSIONS = config.get("VulnScan Settings", "unreadable_extensions").split(
22- ","
23- )
24- MAX_FILE_SIZE_MB = config.get("VulnScan Settings", "max_file_size_mb", fallback="None")
25- raw_workers = config.get("VulnScan Settings", "max_workers", fallback="auto")
26- max_workers = min(32, os.cpu_count() * 2) if raw_workers == "auto" else int(raw_workers)
27-
28- if MAX_FILE_SIZE_MB != "None":
29- MAX_FILE_SIZE_MB = max(int(MAX_FILE_SIZE_MB), 1)
30- else:
31- MAX_FILE_SIZE_MB = None
32- """
import csv
import json
import os
from sentence_transformers import SentenceTransformer
from torch import nn

+ from logicytics import log, config
+
# ================== GLOBAL SETTINGS ==================
- # Paths
- ROOT_DIR = r"C:\Users\Hp\Desktop\Shahm"  # Folder to scan
- BACKUP_DIR = r"C:\Users\Hp\Desktop\VulnScan_Files"  # Backup folder
- MODEL_PATH = r"vulnscan/Model_SenseMacro.4n1.pth"  # Your trained model checkpoint

# File scan settings
- TEXT_EXTENSIONS = {".txt", ".log", ".csv", ".json", ".xml", ".html", ".md", ".cfg", ".ini", ".yml", ".yaml"}
- MAX_TEXT_LENGTH = 1000000  # Max characters per file to scan
-
+ TEXT_EXTENSIONS = {
+     ".txt", ".log", ".csv", ".json", ".xml", ".html", ".md", ".cfg", ".ini", ".yml", ".yaml",
+     ".rtf", ".tex", ".rst", ".adoc", ".properties", ".conf", ".bat", ".ps1", ".sh", ".tsv",
+     ".dat", ".env", ".toml", ".dockerfile", ".gitignore", ".gitattributes", ".npmrc", ".editorconfig"
+ }
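+ # NOTE: os.path.splitext() returns an empty extension for dotfiles such as
+ # ".gitignore" or ".npmrc", so those entries may never match in process_file().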
+ MAX_TEXT_LENGTH = config.get("VulnScan Settings", "text_char_limit", fallback=None)
+ MAX_TEXT_LENGTH = int(MAX_TEXT_LENGTH) if MAX_TEXT_LENGTH not in (None, "None", "") else None
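+ # A missing or empty text_char_limit leaves MAX_TEXT_LENGTH as None, which skips truncation in process_file().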
# Threading
- NUM_WORKERS = 8  # Number of parallel threads for scanning
-
+ NUM_WORKERS = config.get("VulnScan Settings", "max_workers", fallback="auto")
+ NUM_WORKERS = min(32, os.cpu_count() * 2) if NUM_WORKERS == "auto" else int(NUM_WORKERS)
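+ # "auto" caps the pool at 32 threads (two per CPU core); note that os.cpu_count()
+ # can return None on some platforms, in which case the "auto" branch would raise a TypeError.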
# Classification threshold
- SENSITIVE_THRESHOLD = 0.5  # Probability cutoff to consider a file sensitive
+ SENSITIVE_THRESHOLD = float(
+     config.get("VulnScan Settings", "threshold", fallback=0.6))  # Probability cutoff to consider a file sensitive
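+ # The float() cast assumes config.get() returns file-sourced values as strings (configparser-style).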

- # Reports
- REPORT_JSON = os.path.join(os.getcwd(), "report.json")
- REPORT_CSV = os.path.join(os.getcwd(), "report.csv")
+ # Paths
+ ROOT_DIR = r"C:/"  # Folder to scan
+ SAVE_DIR = r"VulnScan_Files"  # Backup folder
+ MODEL_PATH = r"vulnscan/Model_SenseMacro.4n1.pth"  # Your trained model checkpoint
+ REPORT_JSON = "report.json"
+ REPORT_CSV = "report.csv"
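+ # Unlike the old os.getcwd() joins, these relative report paths resolve against
+ # whatever directory the script happens to be launched from.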

# ================== DEVICE SETUP ==================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
- print(f"Using device: {DEVICE}")
+ log.debug(f"Using device: {DEVICE}")


# ================== MODEL DEFINITION ==================
@@ -93,7 +67,7 @@ def forward(self, x):
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=DEVICE)

# Make backup folder
- os.makedirs(BACKUP_DIR, exist_ok=True)
+ os.makedirs(SAVE_DIR, exist_ok=True)


# ================== FILE PROCESSING ==================
@@ -103,16 +77,17 @@ def process_file(filepath):
    if ext.lower() not in TEXT_EXTENSIONS:
        return None

-     with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
-         content = f.read()
+     with open(filepath, "r", encoding="utf-8", errors="ignore") as f_:
+         content = f_.read()
    if not content.strip():
        return None

    # Limit file length
-     content = content[:MAX_TEXT_LENGTH]
+     if MAX_TEXT_LENGTH is not None:
+         content = content[:MAX_TEXT_LENGTH]

    # Split content into lines
-     lines = [line for line in content.splitlines() if line.strip()]
+     lines = [line_ for line_ in content.splitlines() if line_.strip()]
    if not lines:
        return None

@@ -135,7 +110,7 @@ def process_file(filepath):

    # Backup file
    rel_path = os.path.relpath(filepath, ROOT_DIR)
-     backup_path = os.path.join(BACKUP_DIR, rel_path)
+     backup_path = os.path.join(SAVE_DIR, rel_path)
    os.makedirs(os.path.dirname(backup_path), exist_ok=True)
    shutil.copy2(filepath, backup_path)

@@ -147,7 +122,7 @@ def process_file(filepath):
        }

    except Exception as e:
-         print(f"[ERROR] Could not process {filepath}: {e}")
+         log.error(f"Could not process {filepath}: {e}")
        return None


@@ -170,7 +145,7 @@ def scan_directory(root):

# ================== MAIN ==================
if __name__ == "__main__":
-     print(f"Scanning directory: {ROOT_DIR}")
+     log.info(f"Scanning directory: {ROOT_DIR} - This will take some time...")
    sensitive = scan_directory(ROOT_DIR)

    # Save JSON report
@@ -187,11 +162,15 @@ def scan_directory(root):
            entry_csv["reason"] = " | ".join(entry["reason"])
            writer.writerow(entry_csv)

-     print("\nSensitive files detected and backed up:")
+     print()
+     log.debug("Sensitive files detected and backed up:")
    for entry in sensitive:
-         print(f" - {entry['file']} (prob={entry['probability']:.4f})")
+         log.debug(f" - {entry['file']} (prob={entry['probability']:.4f})")
        for line in entry["reason"]:
-             print(f" -> {line}")
+             log.debug(f" -> {line}")

-     print(f"\nBackup completed.\nFiles copied into: {BACKUP_DIR}")
-     print(f"Reports saved as:\n - {REPORT_JSON}\n - {REPORT_CSV}")
+     print()
+     log.info("Backup completed.\n")
+     log.debug(f"Files copied into: {SAVE_DIR}")
+     log.debug(f"JSON report saved as: {REPORT_JSON}")
+     log.debug(f"CSV report saved as: {REPORT_CSV}")