Commit 69e9391

Refactor vulnscan.py for improved file processing and logging; update config.ini for new settings
Signed-off-by: Shahm Najeeb <[email protected]>
1 parent 1e88262 commit 69e9391

3 files changed: +62, -144 lines

.idea/csv-editor.xml

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default.

CODE/config.ini

Lines changed: 8 additions & 85 deletions
@@ -100,93 +100,16 @@ timeout = 10
 max_retry_time = 30
 
 ###################################################
+
 [VulnScan Settings]
-# Following extensions to be skipped by the model
-# Format: comma-separated list with dots (e.g., .exe, .dll)
-unreadable_extensions = .exe, .dll, .so, .zip, .tar, .gz, .7z, .rar, .jpg, .jpeg, .png, .gif, .bmp, .tiff, .webp, .mp3, .wav, .flac, .aac, .ogg, .mp4, .mkv, .avi, .mov, .wmv, .flv, .pdf, .doc, .docx, .xls, .xlsx, .ppt, .pptx, .odt, .ods, .odp, .bin, .dat, .iso, .class, .pyc, .o, .obj, .sqlite, .db, .ttf, .otf, .woff, .woff2, .lnk, .url
+# Following extensions to be ignored by the model
 # In MB, max file size that the model is allowed to scan, if commented out disables the limit, you can also just say None
-max_file_size_mb = None
-# Max workers to be used, either integer or use auto to make it decide the best value
+text_char_limit = None
+# Max workers to be used, either integer or use "auto" to make it decide the best value
 max_workers = auto
-
-[VulnScan.generate Settings]
-# The following settings are for the Generate module for fake training data
-extensions = .txt, .log, .md, .csv, .json, .xml, .html, .yaml, .ini, .pdf, .docx, .xlsx, .pptx
-save_path = PATH
-
-# Options include:
-# 'Sense' - Generates 50k files, each 25KB in size.
-# 'SenseNano' - Generates 5 files, each 5KB in size.
-# 'SenseMacro' - Generates 1m files, each 10KB in size.
-# 'SenseMini' - Generates 10k files, each 10KB in size.
-# 'SenseCustom' - Uses custom size settings from the configuration file.
-code_name = SenseMini
-
-# This allows more randomness in the file sizes, use 0 to disable
-# this is applied randomly every time a file is generated
-# Variation is applied in the following way:
-# size +- (size */ variation) where its random weather to add or subtract and divide or multiply
-size_variation = 0.1
-
-# Set to SenseCustom to use below size settings
-min_file_size = 5KB
-max_file_size = 50KB
-
-# Chances for the following data types in files:
-# 0.0 - 1.0, the rest will be for pure data
-full_sensitive_chance = 0.07
-partial_sensitive_chance = 0.2
-
-[VulnScan.vectorizer Settings]
-# The following settings are for the Vectorizer module for vectorizing data
-# Usually it automatically vectorizes data, but this is for manual vectorization
-
-# We advise to use this vectorization, although not knowing the vectorizer is not advised
-# as this may lead to ValueErrors due to different inputs
-# Use the vectorizer supplied for any v3 model on SenseMini
-
-# The path to the data to vectorize, either a file or a directory
-data_path = PATH
-# The path to save the vectorized data - It will automatically be appended '\Vectorizer.pkl'
-# Make sure the path is a directory, and it exists
-output_path = PATH
-
-# Vectorizer to use, options include:
-# tfidf or count - The code for the training only supports tfidf - we advise to use tfidf
-vectorizer_type = tfidf
-
-[VulnScan.train Settings]
-# The following settings are for the Train module for training models
-# NeuralNetwork seems to be the best choice for this task
-# Options: "NeuralNetwork", "LogReg",
-# "RandomForest", "ExtraTrees", "GBM",
-# "XGBoost", "DecisionTree", "NaiveBayes"
-model_name = NeuralNetwork
-
-# General Training Parameters
-epochs = 10
-batch_size = 32
-learning_rate = 0.001
-use_cuda = true
-
-# Paths to train and save data
-train_data_path = PATH
-# If all models are to be trained, this is the path to save all models,
-# and will be appended with the model codename and follow naming convention
-save_model_path = PATH
-
-[VulnScan.study Settings]
-# Here is the basics of the study module
-# This is useful to generate graphs and data that may help in understanding the model
-# Everything is found online pre-studied, so this is not necessary
-# But it is useful for understanding the model locally
-# All files be saved here, and can't be changed, PATH is "NN features/"
-
-# This is the path to the model, and the vectorizer
-model_path = PATH
-vectorizer_path = PATH
-# Number of features to visualise in the SVG Bar graph, maximum is 3000 due to limitations
-# Placing -1 will visualise first 3000 features. Bar will be a color gradient heatmap.
-number_of_features = -1
+# Sensitivity threshold for the model to flag something as a sensitive
+threshold = 0.6
+# Paths for required files
+model = vulnscan/Model_SenseMacro.4n1.pth
 
 ##################################################
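Note on the new keys: configparser returns every value as a string, so text_char_limit = None comes back as the literal string "None" and max_workers = auto as "auto", and the consumer has to normalize both. A minimal sketch of that normalization, using a throwaway ConfigParser in place of the project's config object (the get helper here is illustrative, not from the repo):

import os
from configparser import ConfigParser

# Hypothetical stand-in for the project's config object.
config = ConfigParser()
config.read_string("""
[VulnScan Settings]
text_char_limit = None
max_workers = auto
threshold = 0.6
""")

def get(key, fallback=None):
    return config.get("VulnScan Settings", key, fallback=fallback)

# "None" arrives as a string, never as Python's None, so compare against sentinels.
raw_limit = get("text_char_limit")
text_char_limit = int(raw_limit) if raw_limit not in (None, "None", "") else None

# "auto" defers the worker count to a CPU-based heuristic; cpu_count() may be None.
raw_workers = get("max_workers", fallback="auto")
max_workers = min(32, (os.cpu_count() or 1) * 2) if raw_workers == "auto" else int(raw_workers)

threshold = float(get("threshold", fallback="0.6"))
print(text_char_limit, max_workers, threshold)  # e.g. None 16 0.6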

CODE/vulnscan.py

Lines changed: 38 additions & 59 deletions
@@ -1,35 +1,3 @@
-"""from __future__ import annotations
-
-import asyncio
-import os
-import threading
-import warnings
-
-import aiofiles
-import joblib
-import numpy as np
-# noinspection PyPackageRequirements
-import torch
-from pathlib import Path
-from safetensors import safe_open
-from tqdm import tqdm
-
-from logicytics import log, config
-
-warnings.filterwarnings("ignore")
-
-UNREADABLE_EXTENSIONS = config.get("VulnScan Settings", "unreadable_extensions").split(
-    ","
-)
-MAX_FILE_SIZE_MB = config.get("VulnScan Settings", "max_file_size_mb", fallback="None")
-raw_workers = config.get("VulnScan Settings", "max_workers", fallback="auto")
-max_workers = min(32, os.cpu_count() * 2) if raw_workers == "auto" else int(raw_workers)
-
-if MAX_FILE_SIZE_MB != "None":
-    MAX_FILE_SIZE_MB = max(int(MAX_FILE_SIZE_MB), 1)
-else:
-    MAX_FILE_SIZE_MB = None
-"""
 import csv
 import json
 import os
@@ -40,29 +8,35 @@
 from sentence_transformers import SentenceTransformer
 from torch import nn
 
+from logicytics import log, config
+
 # ================== GLOBAL SETTINGS ==================
-# Paths
-ROOT_DIR = r"C:\Users\Hp\Desktop\Shahm"  # Folder to scan
-BACKUP_DIR = r"C:\Users\Hp\Desktop\VulnScan_Files"  # Backup folder
-MODEL_PATH = r"vulnscan/Model_SenseMacro.4n1.pth"  # Your trained model checkpoint
 
 # File scan settings
-TEXT_EXTENSIONS = {".txt", ".log", ".csv", ".json", ".xml", ".html", ".md", ".cfg", ".ini", ".yml", ".yaml"}
-MAX_TEXT_LENGTH = 1000000  # Max characters per file to scan
-
+TEXT_EXTENSIONS = {
+    ".txt", ".log", ".csv", ".json", ".xml", ".html", ".md", ".cfg", ".ini", ".yml", ".yaml",
+    ".rtf", ".tex", ".rst", ".adoc", ".properties", ".conf", ".bat", ".ps1", ".sh", ".tsv",
+    ".dat", ".env", ".toml", ".dockerfile", ".gitignore", ".gitattributes", ".npmrc", ".editorconfig"
+}
+MAX_TEXT_LENGTH = config.get("VulnScan Settings", "text_char_limit", fallback=None)
+MAX_TEXT_LENGTH = int(MAX_TEXT_LENGTH) if MAX_TEXT_LENGTH not in (None, "None", "") else None
 # Threading
-NUM_WORKERS = 8  # Number of parallel threads for scanning
-
+NUM_WORKERS = config.get("VulnScan Settings", "max_workers", fallback="auto")
+NUM_WORKERS = min(32, os.cpu_count() * 2) if NUM_WORKERS == "auto" else int(NUM_WORKERS)
 # Classification threshold
-SENSITIVE_THRESHOLD = 0.5  # Probability cutoff to consider a file sensitive
+SENSITIVE_THRESHOLD = float(
+    config.get("VulnScan Settings", "threshold", fallback=0.6))  # Probability cutoff to consider a file sensitive
 
-# Reports
-REPORT_JSON = os.path.join(os.getcwd(), "report.json")
-REPORT_CSV = os.path.join(os.getcwd(), "report.csv")
+# Paths
+ROOT_DIR = r"C:/"  # Folder to scan
+SAVE_DIR = r"VulnScan_Files"  # Backup folder
+MODEL_PATH = r"vulnscan/Model_SenseMacro.4n1.pth"  # Your trained model checkpoint
+REPORT_JSON = "report.json"
+REPORT_CSV = "report.csv"
 
 # ================== DEVICE SETUP ==================
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {DEVICE}")
+log.debug(f"Using device: {DEVICE}")
 
 
 # ================== MODEL DEFINITION ==================
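One subtlety in the widened TEXT_EXTENSIONS set: os.path.splitext treats a leading dot as part of the base name, so bare dotfiles such as .gitignore or .env report an empty extension, and the dotfile-style entries only match names of the form something.gitignore. A quick demonstration of the behavior this check relies on:

import os

# splitext() keeps a leading dot as part of the base name, so bare dotfiles
# report an empty extension and never match entries like ".gitignore".
TEXT_EXTENSIONS = {".txt", ".gitignore", ".env"}  # trimmed illustration
for name in ["notes.txt", ".gitignore", "app.env", "Dockerfile"]:
    ext = os.path.splitext(name)[1]
    print(f"{name!r}: ext={ext!r}, scanned={ext.lower() in TEXT_EXTENSIONS}")
# 'notes.txt': ext='.txt', scanned=True
# '.gitignore': ext='', scanned=False
# 'app.env': ext='.env', scanned=True
# 'Dockerfile': ext='', scanned=False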
@@ -93,7 +67,7 @@ def forward(self, x):
 embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=DEVICE)
 
 # Make backup folder
-os.makedirs(BACKUP_DIR, exist_ok=True)
+os.makedirs(SAVE_DIR, exist_ok=True)
 
 
 # ================== FILE PROCESSING ==================
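The model class itself sits between these hunks and is unchanged, so only its forward(self, x) line leaks into the hunk header above. The scoring path is: embed text with all-MiniLM-L6-v2 (384-dimensional vectors), run the embeddings through the nn classifier loaded from MODEL_PATH, and compare the resulting probability against SENSITIVE_THRESHOLD. A minimal sketch with an assumed classifier shape, since the real architecture is not shown in this diff:

import torch
from torch import nn
from sentence_transformers import SentenceTransformer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

class SimpleClassifier(nn.Module):
    # Hypothetical stand-in; the repo's real architecture is not visible here.
    def __init__(self, dim=384):  # all-MiniLM-L6-v2 produces 384-dim embeddings
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, x):
        return torch.sigmoid(self.net(x))

embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=DEVICE)
model = SimpleClassifier().to(DEVICE).eval()

lines = ["password = hunter2", "just an ordinary log line"]
with torch.no_grad():
    emb = torch.as_tensor(embed_model.encode(lines)).to(DEVICE)  # shape (n_lines, 384)
    probs = model(emb).squeeze(-1)                               # per-line scores in [0, 1]
flagged = [(ln, p.item()) for ln, p in zip(lines, probs) if p.item() >= 0.6]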
@@ -103,16 +77,17 @@ def process_file(filepath):
         if ext.lower() not in TEXT_EXTENSIONS:
             return None
 
-        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
-            content = f.read()
+        with open(filepath, "r", encoding="utf-8", errors="ignore") as f_:
+            content = f_.read()
         if not content.strip():
             return None
 
         # Limit file length
-        content = content[:MAX_TEXT_LENGTH]
+        if MAX_TEXT_LENGTH is not None:
+            content = content[:MAX_TEXT_LENGTH]
 
         # Split content into lines
-        lines = [line for line in content.splitlines() if line.strip()]
+        lines = [line_ for line_ in content.splitlines() if line_.strip()]
         if not lines:
             return None
 
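Strictly speaking, the old one-liner was already safe with an unset limit, because Python treats a None slice bound as unbounded; the new guard mainly makes the intent explicit. For the record:

content = "x" * 10
assert content[:None] == content   # a None upper bound means "no limit"
MAX_TEXT_LENGTH = 5
assert content[:MAX_TEXT_LENGTH] == "xxxxx"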
@@ -135,7 +110,7 @@ def process_file(filepath):
 
         # Backup file
         rel_path = os.path.relpath(filepath, ROOT_DIR)
-        backup_path = os.path.join(BACKUP_DIR, rel_path)
+        backup_path = os.path.join(SAVE_DIR, rel_path)
         os.makedirs(os.path.dirname(backup_path), exist_ok=True)
         shutil.copy2(filepath, backup_path)
 
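The backup step mirrors the scanned tree under SAVE_DIR by re-rooting each path relative to ROOT_DIR. The same idea in isolation, with an illustrative input path:

import os
import shutil

ROOT_DIR = r"C:/"             # scan root, per the new settings
SAVE_DIR = r"VulnScan_Files"  # backup root, relative to the working directory

def backup(filepath):
    # e.g. C:/Users/x/notes.txt -> VulnScan_Files/Users/x/notes.txt
    rel_path = os.path.relpath(filepath, ROOT_DIR)
    backup_path = os.path.join(SAVE_DIR, rel_path)
    os.makedirs(os.path.dirname(backup_path), exist_ok=True)
    shutil.copy2(filepath, backup_path)  # copy2 also preserves timestamps/metadata
    return backup_path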
@@ -147,7 +122,7 @@ def process_file(filepath):
         }
 
     except Exception as e:
-        print(f"[ERROR] Could not process {filepath}: {e}")
+        log.error(f"Could not process {filepath}: {e}")
         return None
 
 
@@ -170,7 +145,7 @@ def scan_directory(root):
 
 # ================== MAIN ==================
 if __name__ == "__main__":
-    print(f"Scanning directory: {ROOT_DIR}")
+    log.info(f"Scanning directory: {ROOT_DIR} - This will take some time...")
     sensitive = scan_directory(ROOT_DIR)
 
     # Save JSON report
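scan_directory itself is untouched by this commit, so its body never appears in the diff. Given the NUM_WORKERS setting above, a plausible shape for it, sketched under the assumption that it fans process_file out over a thread pool:

import os
from concurrent.futures import ThreadPoolExecutor, as_completed

def scan_directory(root):
    # Walk the tree first, then score files in parallel.
    paths = [os.path.join(dirpath, name)
             for dirpath, _, names in os.walk(root)
             for name in names]
    results = []
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as pool:
        futures = [pool.submit(process_file, p) for p in paths]
        for future in as_completed(futures):
            entry = future.result()
            if entry is not None:  # process_file returns None for skipped files
                results.append(entry)
    return results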
@@ -187,11 +162,15 @@ def scan_directory(root):
             entry_csv["reason"] = " | ".join(entry["reason"])
             writer.writerow(entry_csv)
 
-    print("\nSensitive files detected and backed up:")
+    print()
+    log.debug("Sensitive files detected and backed up:")
     for entry in sensitive:
-        print(f" - {entry['file']} (prob={entry['probability']:.4f})")
+        log.debug(f" - {entry['file']} (prob={entry['probability']:.4f})")
         for line in entry["reason"]:
-            print(f" -> {line}")
+            log.debug(f" -> {line}")
 
-    print(f"\nBackup completed.\nFiles copied into: {BACKUP_DIR}")
-    print(f"Reports saved as:\n - {REPORT_JSON}\n - {REPORT_CSV}")
+    print()
+    log.info("Backup completed.\n")
+    log.debug(f"Files copied into: {SAVE_DIR}")
+    log.debug(f"JSON report saved as: {REPORT_JSON}")
+    log.debug(f"CSV report saved as: {REPORT_CSV}")

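The report fragments in this last hunk come from a JSON dump plus a CSV writer that flattens each entry's reason list into one cell. How the two writers plausibly fit together (any field beyond file, probability, and reason is an assumption):

import csv
import json

sensitive = [{"file": "C:/Users/x/notes.txt", "probability": 0.91,
              "reason": ["password = hunter2"]}]

with open("report.json", "w", encoding="utf-8") as f:
    json.dump(sensitive, f, indent=2)

with open("report.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["file", "probability", "reason"])
    writer.writeheader()
    for entry in sensitive:
        entry_csv = dict(entry)
        entry_csv["reason"] = " | ".join(entry["reason"])  # list -> single CSV cell
        writer.writerow(entry_csv)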