Skip to content

Commit 5fc4b9e

Browse files
committed
Added support for OCR and docx, xlsx, pptx, pdf, jpg, png, gif, zip, tar, rar file types
1 parent 6a7e94d commit 5fc4b9e

File tree

2 files changed

+107
-4
lines changed

2 files changed

+107
-4
lines changed

hawk_scanner/internals/system.py

Lines changed: 106 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,17 @@
1-
from rich.console import Console
1+
from rich.console import Console
22
from rich.table import Table
33
import json, requests, argparse, yaml, re, datetime, os, subprocess, platform, hashlib
44
from tinydb import TinyDB, Query
5+
import pytesseract
6+
from PIL import Image
7+
from docx import Document
8+
from openpyxl import load_workbook
9+
import PyPDF2
10+
import patoolib
11+
import tempfile
12+
import shutil
13+
import os
14+
import tarfile
515

616
# Create a TinyDB instance for storing previous alert hashes
717
db = TinyDB('previous_alerts.json')
@@ -240,15 +250,108 @@ def list_all_files_iteratively(path, exclude_patterns):
240250
# File extensions that are routed to a dedicated text extractor.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
_OFFICE_EXTENSIONS = ('.docx', '.xlsx', '.pptx')
_ARCHIVE_EXTENSIONS = ('.zip', '.rar', '.tar', '.tar.gz')


def read_match_strings(file_path, source):
    """Extract the text of ``file_path`` and return the pattern matches found in it.

    Args:
        file_path: Path of the file to scan.
        source: Label of the data source being scanned. Kept for interface
            compatibility; it is not used by this function.

    Returns:
        Whatever ``match_strings`` returns for the extracted text. A file that
        cannot be read is scanned as empty text instead of raising.
    """
    print_info(f"Scanning file: {file_path}")
    content = ''

    try:
        content = _read_file_content(file_path)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole scan.
        print_debug(f"Error in read_match_strings: {e}")

    matched_strings = match_strings(content)
    return matched_strings


def _read_file_content(file_path):
    """Return the text of ``file_path``, choosing an extractor by file extension."""
    lowered = file_path.lower()

    if lowered.endswith(_IMAGE_EXTENSIONS):
        # Run OCR; the context manager releases the image file handle.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)
    if lowered.endswith('.pdf'):
        return read_pdf(file_path)
    if lowered.endswith(_OFFICE_EXTENSIONS):
        return read_office_document(file_path)
    if lowered.endswith(_ARCHIVE_EXTENSIONS):
        return read_archive(file_path)

    # Any other type is treated as text. Bytes that are not valid UTF-8 are
    # substituted with U+FFFD rather than raising.
    with open(file_path, 'rb') as file:
        return file.read().decode('utf-8', errors='replace')


def read_pdf(file_path):
    """Return the text of every page of a PDF, one page per line group.

    A page whose text cannot be extracted is logged and skipped so the
    remaining pages are still scanned. Returns '' if the file cannot be opened
    or parsed at all.
    """
    page_texts = []
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    # `or ''` guards against extractors that yield None for
                    # pages without a text layer.
                    page_texts.append(page.extract_text() or '')
                except Exception as e:
                    print_debug(f"Error in read_pdf (page {page_num}): {e}")
    except Exception as e:
        print_debug(f"Error in read_pdf: {e}")
    # A newline between pages keeps tokens at page boundaries from fusing.
    return '\n'.join(page_texts)


def read_office_document(file_path):
    """Return the text of a .docx, .xlsx or .pptx file, one item per line.

    * .docx: paragraph text followed by the text of every table cell.
    * .xlsx: the string form of every non-empty cell of every sheet.
    * .pptx: every text run of the slides and speaker notes.

    Returns whatever was collected before an error occurred ('' if nothing).
    """
    parts = []
    try:
        lowered = file_path.lower()
        if lowered.endswith('.docx'):
            doc = Document(file_path)
            parts.extend(paragraph.text for paragraph in doc.paragraphs)
            # Text inside tables is not part of doc.paragraphs.
            for table in doc.tables:
                for row in table.rows:
                    parts.extend(cell.text for cell in row.cells)
        elif lowered.endswith('.xlsx'):
            workbook = load_workbook(file_path)
            for sheet_name in workbook.sheetnames:
                for row in workbook[sheet_name].iter_rows():
                    # Skip empty cells so the literal text "None" is not scanned.
                    parts.extend(
                        str(cell.value) for cell in row if cell.value is not None
                    )
        elif lowered.endswith('.pptx'):
            parts.extend(_read_pptx_text(file_path))
    except Exception as e:
        print_debug(f"Error in read_office_document: {e}")
    return ''.join(f"{part}\n" for part in parts)


def _read_pptx_text(file_path):
    """Return the text runs found in the slides and notes of a .pptx package."""
    import zipfile
    import xml.etree.ElementTree as ElementTree

    # Text runs are stored in DrawingML <a:t> elements.
    text_tag = '{http://schemas.openxmlformats.org/drawingml/2006/main}t'
    texts = []
    with zipfile.ZipFile(file_path) as package:
        part_names = sorted(
            name for name in package.namelist()
            if name.startswith(('ppt/slides/', 'ppt/notesSlides/'))
            and name.endswith('.xml')
        )
        for name in part_names:
            # NOTE(review): this parses XML from an untrusted file; switch to a
            # hardened parser if the project adopts one.
            root = ElementTree.fromstring(package.read(name))
            texts.extend(node.text for node in root.iter(text_tag) if node.text)
    return texts


def read_archive(file_path):
    """Return the concatenated text of every file inside an archive.

    The archive is unpacked into a temporary directory that is removed
    automatically. Each extracted file goes through the same extension-based
    extraction as a regular file, so nested archives are read recursively. A
    member that cannot be read is logged and skipped.

    Returns '' if the archive cannot be extracted.
    """
    parts = []
    try:
        # TemporaryDirectory deletes the directory on exit; no manual cleanup.
        with tempfile.TemporaryDirectory() as tmp_dir:
            lowered = file_path.lower()
            if lowered.endswith(('.zip', '.rar')):
                patoolib.extract_archive(file_path, outdir=tmp_dir)
            elif lowered.endswith(('.tar', '.tar.gz')):
                # The default mode detects compression transparently.
                with tarfile.open(file_path) as tar:
                    _extract_tar_safely(tar, tmp_dir)

            for root, _dirs, files in os.walk(tmp_dir):
                for name in files:
                    member_path = os.path.join(root, name)
                    try:
                        parts.append(_read_file_content(member_path))
                    except Exception as e:
                        print_debug(f"Error in read_archive: {e}")
    except Exception as e:
        print_debug(f"Error in read_archive: {e}")
    return '\n'.join(parts)


def _extract_tar_safely(tar, destination):
    """Extract ``tar`` into ``destination``, blocking path traversal where supported."""
    if hasattr(tarfile, 'data_filter'):
        # The 'data' filter rejects absolute paths, '..' components and links
        # that point outside the destination.
        tar.extractall(destination, filter='data')
    else:
        # Older interpreters without extraction filters: no such protection.
        tar.extractall(destination)
353+
354+
252355
def getFileData(file_path):
253356
try:
254357
# Get file metadata

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
VERSION = "0.3.0"
1+
VERSION = "0.3.1"
22

33
from setuptools import setup, find_packages
44

0 commit comments

Comments
 (0)