|
1 | | -from rich.console import Console |
| 1 | +from rich.console import Console |
2 | 2 | from rich.table import Table |
3 | 3 | import json, requests, argparse, yaml, re, datetime, os, subprocess, platform, hashlib |
4 | 4 | from tinydb import TinyDB, Query |
| 5 | +import pytesseract |
| 6 | +from PIL import Image |
| 7 | +from docx import Document |
| 8 | +from openpyxl import load_workbook |
| 9 | +import PyPDF2 |
| 10 | +import patoolib |
| 11 | +import tempfile |
| 12 | +import shutil |
| 13 | +import os |
| 14 | +import tarfile |
5 | 15 |
|
6 | 16 | # Create a TinyDB instance for storing previous alert hashes |
7 | 17 | db = TinyDB('previous_alerts.json') |
@@ -240,15 +250,108 @@ def list_all_files_iteratively(path, exclude_patterns): |
def read_match_strings(file_path, source):
    """Extract the text of a file and return the strings matched in it.

    The extraction strategy is picked from the (case-insensitive) file
    extension: OCR for images, ``read_pdf`` for PDFs, ``read_office_document``
    for Office files, ``read_archive`` for archives, and a lossy UTF-8 decode
    for everything else.

    Args:
        file_path: Path of the file to scan.
        source: Not read by this function; kept so existing callers that
            pass it keep working.

    Returns:
        Whatever ``match_strings`` returns for the extracted text. If
        extraction fails the error is logged through ``print_debug`` and
        ``match_strings`` is applied to an empty string.
    """
    print_info(f"Scanning file: {file_path}")
    content = ''
    lowered = file_path.lower()

    try:
        if lowered.endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
            # Image.open keeps the underlying file open; the context manager
            # releases the handle as soon as OCR is done.
            with Image.open(file_path) as image:
                content = pytesseract.image_to_string(image)
        elif lowered.endswith('.pdf'):
            content = read_pdf(file_path)
        elif lowered.endswith(('.docx', '.xlsx', '.pptx')):
            content = read_office_document(file_path)
        elif lowered.endswith(('.zip', '.rar', '.tar', '.tar.gz')):
            content = read_archive(file_path)
        else:
            # Read raw bytes and decode as UTF-8; undecodable bytes become
            # U+FFFD instead of raising, so binary-ish files are still scanned.
            with open(file_path, 'rb') as file:
                content = file.read().decode('utf-8', errors='replace')
    except Exception as e:
        # Best effort: an unreadable file must not abort the whole scan.
        print_debug(f"Error in read_match_strings: {e}")

    matched_strings = match_strings(content)
    return matched_strings
251 | 280 |
|
| 281 | + |
def read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF document.

    Returns:
        The extracted text as one string. A page whose extraction fails is
        logged through ``print_debug`` and skipped; if the document cannot be
        opened or parsed, the text gathered so far (possibly empty) is
        returned.
    """
    page_texts = []
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_number, page in enumerate(pdf_reader.pages, start=1):
                try:
                    # ``or ''`` tolerates extractors that yield None for a
                    # page without text.
                    page_texts.append(page.extract_text() or '')
                except Exception as e:
                    # One broken page should not discard the rest of the
                    # document.
                    print_debug(f"Error in read_pdf (page {page_number}): {e}")
    except Exception as e:
        print_debug(f"Error in read_pdf: {e}")
    return ''.join(page_texts)
| 298 | + |
| 299 | + |
def _read_pptx_text(file_path):
    """Return the text runs of every slide in a .pptx file, in slide-name order."""
    # Imported locally so this block does not depend on the file's import list.
    import zipfile
    from xml.etree import ElementTree

    # Slide text lives in DrawingML <a:t> elements.
    text_tag = '{http://schemas.openxmlformats.org/drawingml/2006/main}t'
    runs = []
    with zipfile.ZipFile(file_path) as package:
        slide_names = sorted(
            name for name in package.namelist()
            if name.startswith('ppt/slides/') and name.endswith('.xml')
        )
        for name in slide_names:
            slide = ElementTree.fromstring(package.read(name))
            runs.extend(node.text for node in slide.iter(text_tag) if node.text)
    return runs


def read_office_document(file_path):
    """Return the text of a Word, Excel or PowerPoint document.

    Each paragraph (.docx), non-empty cell (.xlsx) or slide text run (.pptx)
    contributes one line terminated by a newline.

    Args:
        file_path: Path of the document; the type is chosen from its
            case-insensitive extension.

    Returns:
        The extracted text, or whatever was gathered before a failure. Errors
        are logged through ``print_debug`` rather than raised. Paths with any
        other extension yield an empty string.
    """
    lowered = file_path.lower()
    parts = []
    try:
        if lowered.endswith('.docx'):
            doc = Document(file_path)
            parts.extend(f"{paragraph.text}\n" for paragraph in doc.paragraphs)
        elif lowered.endswith('.xlsx'):
            workbook = load_workbook(file_path)
            # ``worksheets`` leaves out chart sheets, which have no cells.
            for sheet in workbook.worksheets:
                for row in sheet.iter_rows():
                    for cell in row:
                        # Skip empty cells so they do not add a literal
                        # "None" to the scanned text.
                        if cell.value is not None:
                            parts.append(f"{cell.value}\n")
        elif lowered.endswith('.pptx'):
            parts.extend(f"{run}\n" for run in _read_pptx_text(file_path))
    except Exception as e:
        print_debug(f"Error in read_office_document: {e}")
    return ''.join(parts)
| 324 | + |
def _extract_tar_safely(tar, dest_dir):
    """Extract ``tar`` into ``dest_dir`` without letting members escape it."""
    if hasattr(tarfile, 'data_filter'):
        # Extraction filters are available: 'data' rejects absolute paths,
        # ``..`` traversal, links pointing outside and special files.
        tar.extractall(dest_dir, filter='data')
        return

    # Older interpreters: only extract plain files/directories that resolve
    # inside the destination directory.
    root = os.path.realpath(dest_dir)
    safe_members = []
    for member in tar.getmembers():
        target = os.path.realpath(os.path.join(root, member.name))
        inside = target == root or target.startswith(root + os.sep)
        if inside and (member.isfile() or member.isdir()):
            safe_members.append(member)
    tar.extractall(dest_dir, members=safe_members)


def _read_archive_member(member_path):
    """Return the raw text of one extracted archive member ('' on failure)."""
    print_info(f"Scanning file: {member_path}")
    lowered = member_path.lower()
    try:
        if lowered.endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
            with Image.open(member_path) as image:
                return pytesseract.image_to_string(image)
        if lowered.endswith('.pdf'):
            return read_pdf(member_path)
        if lowered.endswith(('.docx', '.xlsx', '.pptx')):
            return read_office_document(member_path)
        if lowered.endswith(('.zip', '.rar', '.tar', '.tar.gz')):
            return read_archive(member_path)  # nested archive
        with open(member_path, 'rb') as file:
            return file.read().decode('utf-8', errors='replace')
    except Exception as e:
        # A single unreadable member must not discard the other members.
        print_debug(f"Error in read_archive: {e}")
        return ''


def read_archive(file_path):
    """Extract an archive to a temporary directory and return its members' text.

    ``.zip`` and ``.rar`` are unpacked with ``patoolib``; ``.tar`` and
    ``.tar.gz`` with ``tarfile``. Every extracted file is then read according
    to its own extension (including nested archives) and the texts are joined
    with newlines.

    The members are read as raw text rather than through
    ``read_match_strings``: that function returns match results, whereas the
    value returned here is itself handed to ``match_strings`` by the caller.

    Args:
        file_path: Path of the archive.

    Returns:
        The combined text of the archive's members. Errors are logged through
        ``print_debug``; whatever was gathered before the failure is returned.
    """
    lowered = file_path.lower()
    member_texts = []
    try:
        # The context manager removes the directory on exit, so no explicit
        # cleanup is needed.
        with tempfile.TemporaryDirectory() as tmp_dir:
            if lowered.endswith(('.zip', '.rar')):
                patoolib.extract_archive(file_path, outdir=tmp_dir)
            elif lowered.endswith(('.tar', '.tar.gz')):
                # Mode 'r' detects gzip compression transparently.
                with tarfile.open(file_path, 'r') as tar:
                    _extract_tar_safely(tar, tmp_dir)

            for root, _dirs, files in os.walk(tmp_dir):
                for name in files:
                    member_path = os.path.join(root, name)
                    # Never follow links out of the extraction directory.
                    if os.path.islink(member_path):
                        continue
                    member_texts.append(_read_archive_member(member_path))
    except Exception as e:
        print_debug(f"Error in read_archive: {e}")
    return '\n'.join(member_texts)
| 353 | + |
| 354 | + |
252 | 355 | def getFileData(file_path): |
253 | 356 | try: |
254 | 357 | # Get file metadata |
|
0 commit comments