Commit 69e9391

Refactor vulnscan.py for improved file processing and logging; update config.ini for new settings
Signed-off-by: Shahm Najeeb <[email protected]>
1 parent 1e88262 commit 69e9391

3 files changed: +62, -144 lines

.idea/csv-editor.xml

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default.

CODE/config.ini

Lines changed: 8 additions & 85 deletions
@@ -100,93 +100,16 @@ timeout = 10
 max_retry_time = 30
 
 ###################################################
+
 [VulnScan Settings]
-# Following extensions to be skipped by the model
-# Format: comma-separated list with dots (e.g., .exe, .dll)
-unreadable_extensions = .exe, .dll, .so, .zip, .tar, .gz, .7z, .rar, .jpg, .jpeg, .png, .gif, .bmp, .tiff, .webp, .mp3, .wav, .flac, .aac, .ogg, .mp4, .mkv, .avi, .mov, .wmv, .flv, .pdf, .doc, .docx, .xls, .xlsx, .ppt, .pptx, .odt, .ods, .odp, .bin, .dat, .iso, .class, .pyc, .o, .obj, .sqlite, .db, .ttf, .otf, .woff, .woff2, .lnk, .url
+# Following extensions to be ignored by the model
 # In MB, max file size that the model is allowed to scan, if commented out disables the limit, you can also just say None
-max_file_size_mb = None
-# Max workers to be used, either integer or use auto to make it decide the best value
+text_char_limit = None
+# Max workers to be used, either integer or use "auto" to make it decide the best value
 max_workers = auto
-
-[VulnScan.generate Settings]
-# The following settings are for the Generate module for fake training data
-extensions = .txt, .log, .md, .csv, .json, .xml, .html, .yaml, .ini, .pdf, .docx, .xlsx, .pptx
-save_path = PATH
-
-# Options include:
-# 'Sense' - Generates 50k files, each 25KB in size.
-# 'SenseNano' - Generates 5 files, each 5KB in size.
-# 'SenseMacro' - Generates 1m files, each 10KB in size.
-# 'SenseMini' - Generates 10k files, each 10KB in size.
-# 'SenseCustom' - Uses custom size settings from the configuration file.
-code_name = SenseMini
-
-# This allows more randomness in the file sizes, use 0 to disable
-# this is applied randomly every time a file is generated
-# Variation is applied in the following way:
-# size +- (size */ variation) where its random weather to add or subtract and divide or multiply
-size_variation = 0.1
-
-# Set to SenseCustom to use below size settings
-min_file_size = 5KB
-max_file_size = 50KB
-
-# Chances for the following data types in files:
-# 0.0 - 1.0, the rest will be for pure data
-full_sensitive_chance = 0.07
-partial_sensitive_chance = 0.2
-
-[VulnScan.vectorizer Settings]
-# The following settings are for the Vectorizer module for vectorizing data
-# Usually it automatically vectorizes data, but this is for manual vectorization
-
-# We advise to use this vectorization, although not knowing the vectorizer is not advised
-# as this may lead to ValueErrors due to different inputs
-# Use the vectorizer supplied for any v3 model on SenseMini
-
-# The path to the data to vectorize, either a file or a directory
-data_path = PATH
-# The path to save the vectorized data - It will automatically be appended '\Vectorizer.pkl'
-# Make sure the path is a directory, and it exists
-output_path = PATH
-
-# Vectorizer to use, options include:
-# tfidf or count - The code for the training only supports tfidf - we advise to use tfidf
-vectorizer_type = tfidf
-
-[VulnScan.train Settings]
-# The following settings are for the Train module for training models
-# NeuralNetwork seems to be the best choice for this task
-# Options: "NeuralNetwork", "LogReg",
-# "RandomForest", "ExtraTrees", "GBM",
-# "XGBoost", "DecisionTree", "NaiveBayes"
-model_name = NeuralNetwork
-
-# General Training Parameters
-epochs = 10
-batch_size = 32
-learning_rate = 0.001
-use_cuda = true
-
-# Paths to train and save data
-train_data_path = PATH
-# If all models are to be trained, this is the path to save all models,
-# and will be appended with the model codename and follow naming convention
-save_model_path = PATH
-
-[VulnScan.study Settings]
-# Here is the basics of the study module
-# This is useful to generate graphs and data that may help in understanding the model
-# Everything is found online pre-studied, so this is not necessary
-# But it is useful for understanding the model locally
-# All files be saved here, and can't be changed, PATH is "NN features/"
-
-# This is the path to the model, and the vectorizer
-model_path = PATH
-vectorizer_path = PATH
-# Number of features to visualise in the SVG Bar graph, maximum is 3000 due to limitations
-# Placing -1 will visualise first 3000 features. Bar will be a color gradient heatmap.
-number_of_features = -1
+# Sensitivity threshold for the model to flag something as a sensitive
+threshold = 0.6
+# Paths for required files
+model = vulnscan/Model_SenseMacro.4n1.pth
 
 ##################################################
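Note on the new keys: configparser returns every value as a string, so text_char_limit = None comes back as the literal string "None" and max_workers = auto as "auto", and the consumer has to normalize both. A minimal sketch of that normalization, using a throwaway ConfigParser in place of the project's config object (the get helper here is illustrative, not from the repo):

import os
from configparser import ConfigParser

# Hypothetical stand-in for the project's config object.
config = ConfigParser()
config.read_string("""
[VulnScan Settings]
text_char_limit = None
max_workers = auto
threshold = 0.6
""")

def get(key, fallback=None):
    return config.get("VulnScan Settings", key, fallback=fallback)

# "None" arrives as a string, never as Python's None, so compare against sentinels.
raw_limit = get("text_char_limit")
text_char_limit = int(raw_limit) if raw_limit not in (None, "None", "") else None

# "auto" defers the worker count to a CPU-based heuristic; cpu_count() may be None.
raw_workers = get("max_workers", fallback="auto")
max_workers = min(32, (os.cpu_count() or 1) * 2) if raw_workers == "auto" else int(raw_workers)

threshold = float(get("threshold", fallback="0.6"))
print(text_char_limit, max_workers, threshold)  # e.g. None 16 0.6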

CODE/vulnscan.py

Lines changed: 38 additions & 59 deletions
@@ -1,35 +1,3 @@
-"""from __future__ import annotations
-
-import asyncio
-import os
-import threading
-import warnings
-
-import aiofiles
-import joblib
-import numpy as np
-# noinspection PyPackageRequirements
-import torch
-from pathlib import Path
-from safetensors import safe_open
-from tqdm import tqdm
-
-from logicytics import log, config
-
-warnings.filterwarnings("ignore")
-
-UNREADABLE_EXTENSIONS = config.get("VulnScan Settings", "unreadable_extensions").split(
-    ","
-)
-MAX_FILE_SIZE_MB = config.get("VulnScan Settings", "max_file_size_mb", fallback="None")
-raw_workers = config.get("VulnScan Settings", "max_workers", fallback="auto")
-max_workers = min(32, os.cpu_count() * 2) if raw_workers == "auto" else int(raw_workers)
-
-if MAX_FILE_SIZE_MB != "None":
-    MAX_FILE_SIZE_MB = max(int(MAX_FILE_SIZE_MB), 1)
-else:
-    MAX_FILE_SIZE_MB = None
-"""
 import csv
 import json
 import os
@@ -40,29 +8,35 @@
 from sentence_transformers import SentenceTransformer
 from torch import nn
 
+from logicytics import log, config
+
 # ================== GLOBAL SETTINGS ==================
-# Paths
-ROOT_DIR = r"C:\Users\Hp\Desktop\Shahm"  # Folder to scan
-BACKUP_DIR = r"C:\Users\Hp\Desktop\VulnScan_Files"  # Backup folder
-MODEL_PATH = r"vulnscan/Model_SenseMacro.4n1.pth"  # Your trained model checkpoint
 
 # File scan settings
-TEXT_EXTENSIONS = {".txt", ".log", ".csv", ".json", ".xml", ".html", ".md", ".cfg", ".ini", ".yml", ".yaml"}
-MAX_TEXT_LENGTH = 1000000  # Max characters per file to scan
-
+TEXT_EXTENSIONS = {
+    ".txt", ".log", ".csv", ".json", ".xml", ".html", ".md", ".cfg", ".ini", ".yml", ".yaml",
+    ".rtf", ".tex", ".rst", ".adoc", ".properties", ".conf", ".bat", ".ps1", ".sh", ".tsv",
+    ".dat", ".env", ".toml", ".dockerfile", ".gitignore", ".gitattributes", ".npmrc", ".editorconfig"
+}
+MAX_TEXT_LENGTH = config.get("VulnScan Settings", "text_char_limit", fallback=None)
+MAX_TEXT_LENGTH = int(MAX_TEXT_LENGTH) if MAX_TEXT_LENGTH not in (None, "None", "") else None
 # Threading
-NUM_WORKERS = 8  # Number of parallel threads for scanning
-
+NUM_WORKERS = config.get("VulnScan Settings", "max_workers", fallback="auto")
+NUM_WORKERS = min(32, os.cpu_count() * 2) if NUM_WORKERS == "auto" else int(NUM_WORKERS)
 # Classification threshold
-SENSITIVE_THRESHOLD = 0.5  # Probability cutoff to consider a file sensitive
+SENSITIVE_THRESHOLD = float(
+    config.get("VulnScan Settings", "threshold", fallback=0.6))  # Probability cutoff to consider a file sensitive
 
-# Reports
-REPORT_JSON = os.path.join(os.getcwd(), "report.json")
-REPORT_CSV = os.path.join(os.getcwd(), "report.csv")
+# Paths
+ROOT_DIR = r"C:/"  # Folder to scan
+SAVE_DIR = r"VulnScan_Files"  # Backup folder
+MODEL_PATH = r"vulnscan/Model_SenseMacro.4n1.pth"  # Your trained model checkpoint
+REPORT_JSON = "report.json"
+REPORT_CSV = "report.csv"
 
 # ================== DEVICE SETUP ==================
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {DEVICE}")
+log.debug(f"Using device: {DEVICE}")
 
 
 # ================== MODEL DEFINITION ==================
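One subtlety in the widened TEXT_EXTENSIONS set: os.path.splitext treats a leading dot as part of the base name, so bare dotfiles such as .gitignore or .env report an empty extension, and the dotfile-style entries only match names of the form something.gitignore. A quick demonstration of the behavior this check relies on:

import os

# splitext() keeps a leading dot as part of the base name, so bare dotfiles
# report an empty extension and never match entries like ".gitignore".
TEXT_EXTENSIONS = {".txt", ".gitignore", ".env"}  # trimmed illustration
for name in ["notes.txt", ".gitignore", "app.env", "Dockerfile"]:
    ext = os.path.splitext(name)[1]
    print(f"{name!r}: ext={ext!r}, scanned={ext.lower() in TEXT_EXTENSIONS}")
# 'notes.txt': ext='.txt', scanned=True
# '.gitignore': ext='', scanned=False
# 'app.env': ext='.env', scanned=True
# 'Dockerfile': ext='', scanned=False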
@@ -93,7 +67,7 @@ def forward(self, x):
 embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=DEVICE)
 
 # Make backup folder
-os.makedirs(BACKUP_DIR, exist_ok=True)
+os.makedirs(SAVE_DIR, exist_ok=True)
 
 
 # ================== FILE PROCESSING ==================
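The model class itself sits between these hunks and is unchanged, so only its forward(self, x) line leaks into the hunk header above. The scoring path is: embed text with all-MiniLM-L6-v2 (384-dimensional vectors), run the embeddings through the nn classifier loaded from MODEL_PATH, and compare the resulting probability against SENSITIVE_THRESHOLD. A minimal sketch with an assumed classifier shape, since the real architecture is not shown in this diff:

import torch
from torch import nn
from sentence_transformers import SentenceTransformer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

class SimpleClassifier(nn.Module):
    # Hypothetical stand-in; the repo's real architecture is not visible here.
    def __init__(self, dim=384):  # all-MiniLM-L6-v2 produces 384-dim embeddings
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, x):
        return torch.sigmoid(self.net(x))

embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=DEVICE)
model = SimpleClassifier().to(DEVICE).eval()

lines = ["password = hunter2", "just an ordinary log line"]
with torch.no_grad():
    emb = torch.as_tensor(embed_model.encode(lines)).to(DEVICE)  # shape (n_lines, 384)
    probs = model(emb).squeeze(-1)                               # per-line scores in [0, 1]
flagged = [(ln, p.item()) for ln, p in zip(lines, probs) if p.item() >= 0.6]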
@@ -103,16 +77,17 @@ def process_file(filepath):
         if ext.lower() not in TEXT_EXTENSIONS:
             return None
 
-        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
-            content = f.read()
+        with open(filepath, "r", encoding="utf-8", errors="ignore") as f_:
+            content = f_.read()
         if not content.strip():
             return None
 
         # Limit file length
-        content = content[:MAX_TEXT_LENGTH]
+        if MAX_TEXT_LENGTH is not None:
+            content = content[:MAX_TEXT_LENGTH]
 
         # Split content into lines
-        lines = [line for line in content.splitlines() if line.strip()]
+        lines = [line_ for line_ in content.splitlines() if line_.strip()]
         if not lines:
             return None
 
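Strictly speaking, the old one-liner was already safe with an unset limit, because Python treats a None slice bound as unbounded; the new guard mainly makes the intent explicit. For the record:

content = "x" * 10
assert content[:None] == content   # a None upper bound means "no limit"
MAX_TEXT_LENGTH = 5
assert content[:MAX_TEXT_LENGTH] == "xxxxx"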
@@ -135,7 +110,7 @@ def process_file(filepath):
 
         # Backup file
         rel_path = os.path.relpath(filepath, ROOT_DIR)
-        backup_path = os.path.join(BACKUP_DIR, rel_path)
+        backup_path = os.path.join(SAVE_DIR, rel_path)
         os.makedirs(os.path.dirname(backup_path), exist_ok=True)
         shutil.copy2(filepath, backup_path)
 
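The backup step mirrors the scanned tree under SAVE_DIR by re-rooting each path relative to ROOT_DIR. The same idea in isolation, with an illustrative input path:

import os
import shutil

ROOT_DIR = r"C:/"             # scan root, per the new settings
SAVE_DIR = r"VulnScan_Files"  # backup root, relative to the working directory

def backup(filepath):
    # e.g. C:/Users/x/notes.txt -> VulnScan_Files/Users/x/notes.txt
    rel_path = os.path.relpath(filepath, ROOT_DIR)
    backup_path = os.path.join(SAVE_DIR, rel_path)
    os.makedirs(os.path.dirname(backup_path), exist_ok=True)
    shutil.copy2(filepath, backup_path)  # copy2 also preserves timestamps/metadata
    return backup_path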
@@ -147,7 +122,7 @@ def process_file(filepath):
         }
 
     except Exception as e:
-        print(f"[ERROR] Could not process {filepath}: {e}")
+        log.error(f"Could not process {filepath}: {e}")
         return None
 
 
@@ -170,7 +145,7 @@ def scan_directory(root):
 
 # ================== MAIN ==================
 if __name__ == "__main__":
-    print(f"Scanning directory: {ROOT_DIR}")
+    log.info(f"Scanning directory: {ROOT_DIR} - This will take some time...")
     sensitive = scan_directory(ROOT_DIR)
 
     # Save JSON report
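scan_directory itself is untouched by this commit, so its body never appears in the diff. Given the NUM_WORKERS setting above, a plausible shape for it, sketched under the assumption that it fans process_file out over a thread pool:

import os
from concurrent.futures import ThreadPoolExecutor, as_completed

def scan_directory(root):
    # Walk the tree first, then score files in parallel.
    paths = [os.path.join(dirpath, name)
             for dirpath, _, names in os.walk(root)
             for name in names]
    results = []
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as pool:
        futures = [pool.submit(process_file, p) for p in paths]
        for future in as_completed(futures):
            entry = future.result()
            if entry is not None:  # process_file returns None for skipped files
                results.append(entry)
    return results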
@@ -187,11 +162,15 @@ def scan_directory(root):
             entry_csv["reason"] = " | ".join(entry["reason"])
             writer.writerow(entry_csv)
 
-    print("\nSensitive files detected and backed up:")
+    print()
+    log.debug("Sensitive files detected and backed up:")
     for entry in sensitive:
-        print(f" - {entry['file']} (prob={entry['probability']:.4f})")
+        log.debug(f" - {entry['file']} (prob={entry['probability']:.4f})")
         for line in entry["reason"]:
-            print(f" -> {line}")
+            log.debug(f" -> {line}")
 
-    print(f"\nBackup completed.\nFiles copied into: {BACKUP_DIR}")
-    print(f"Reports saved as:\n - {REPORT_JSON}\n - {REPORT_CSV}")
+    print()
+    log.info("Backup completed.\n")
+    log.debug(f"Files copied into: {SAVE_DIR}")
+    log.debug(f"JSON report saved as: {REPORT_JSON}")
+    log.debug(f"CSV report saved as: {REPORT_CSV}")

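The report fragments in this last hunk come from a JSON dump plus a CSV writer that flattens each entry's reason list into one cell. How the two writers plausibly fit together (any field beyond file, probability, and reason is an assumption):

import csv
import json

sensitive = [{"file": "C:/Users/x/notes.txt", "probability": 0.91,
              "reason": ["password = hunter2"]}]

with open("report.json", "w", encoding="utf-8") as f:
    json.dump(sensitive, f, indent=2)

with open("report.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["file", "probability", "reason"])
    writer.writeheader()
    for entry in sensitive:
        entry_csv = dict(entry)
        entry_csv["reason"] = " | ".join(entry["reason"])  # list -> single CSV cell
        writer.writerow(entry_csv)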