Skip to content

Commit 0fda18e

Browse files
committed
Fixed bugs in gdrive_workspace
1 parent 5098af7 commit 0fda18e

File tree

6 files changed

+112
-33
lines changed

6 files changed

+112
-33
lines changed

Dockerfile

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Use the official Python image as the base image
2+
FROM python:3
3+
4+
# Set the working directory in the container
5+
WORKDIR /app
6+
7+
# Copy the local requirements.txt file to the container at /app
8+
COPY requirements.txt /app/
9+
10+
# Install the dependencies from requirements.txt
11+
RUN pip3 install --no-cache-dir -r requirements.txt
12+
13+
# Copy the local code to the container at /app
14+
COPY . /app/
15+
16+
# Install the Python package (assuming it contains a setup.py file)
17+
RUN pip3 install .
18+
19+
# Set the entrypoint to hawk_scanner
20+
ENTRYPOINT ["hawk_scanner"]

hawk_scanner/commands/gdrive_workspace.py

Lines changed: 38 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -23,35 +23,53 @@ def connect_google_drive(credentials_file, impersonate_user=None):
2323
print(f"Failed to connect to Google Drive: {e}")
2424

2525
def download_file(drive, file_obj, base_path):
26+
print(f"Downloading file: {file_obj['name']} to {base_path}")
2627
try:
2728
file_name = file_obj['name']
2829
file_id = file_obj['id']
2930

3031
folder_path = base_path
32+
33+
# Handle parents (folders)
3134
if 'parents' in file_obj:
3235
for parent_id in file_obj['parents']:
3336
parent_folder = drive.files().get(fileId=parent_id).execute()
34-
if parent_folder['name'] == 'My Drive':
35-
continue
36-
folder_path = os.path.join(folder_path, parent_folder['name'])
37+
parent_folder_name = parent_folder['name']
38+
39+
# Update folder_path to include the parent folder
40+
folder_path = os.path.join(folder_path, parent_folder_name)
3741

38-
file_path = os.path.join(folder_path, file_name)
42+
# Update folder_path to include the current file's name
43+
folder_path = os.path.join(folder_path, file_name)
3944

4045
if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.folder':
41-
if not os.path.exists(file_path):
42-
os.makedirs(file_path)
46+
if not os.path.exists(folder_path):
47+
os.makedirs(folder_path)
4348
folder_files = drive.files().list(q=f"'{file_id}' in parents").execute().get('files', [])
4449
for folder_file in folder_files:
4550
download_file(drive, folder_file, folder_path)
4651
else:
47-
download_url = drive.files().get_media(fileId=file_id).execute()
48-
with open(file_path, 'wb') as fh:
49-
fh.write(download_url)
50-
51-
system.print_debug(f"File downloaded to: {file_path}")
52+
try:
53+
# Check if the file is a Google Docs type
54+
if 'application/vnd.google-apps' in file_obj.get('mimeType', ''):
55+
# For Google Docs Editors files, use export instead of GetMedia
56+
response = drive.files().export(fileId=file_id, mimeType='application/pdf').execute()
57+
with open(folder_path, 'wb') as f:
58+
f.write(response)
59+
else:
60+
# For other file types, use GetMedia
61+
content = drive.files().get_media(fileId=file_id).execute()
62+
with open(folder_path, 'wb') as f:
63+
f.write(content)
64+
except Exception as e:
65+
print(f"Failed to write file: {e}")
66+
67+
system.print_debug(f"File downloaded to: {folder_path}")
5268
except Exception as e:
5369
print(f"Failed to download file: {e}")
5470

71+
72+
5573
def list_files(drive, impersonate_user=None):
5674
try:
5775
query = "'root' in parents"
@@ -88,20 +106,13 @@ def execute(args):
88106
if drive:
89107
files = list_files(drive, impersonate_user)
90108
for file_obj in files:
91-
download_file(drive, file_obj, "data/google_drive")
109+
110+
if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.document' or file_obj['mimeType'] == 'application/vnd.google-apps.spreadsheet' or file_obj['mimeType'] == 'application/vnd.google-apps.presentation' or file_obj['mimeType'] == 'application/vnd.google-apps.drawing' or file_obj['mimeType'] == 'application/vnd.google-apps.script':
111+
file_obj['name'] = file_obj['name'] + '-runtime.pdf'
112+
92113
file_id = file_obj['id']
93114
file_name = file_obj['name']
94-
if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.folder':
95-
continue
96-
97-
parent_folder_ids = file_obj.get('parents', [])
98115
folder_path = "data/google_drive"
99-
if parent_folder_ids:
100-
for parent_id in parent_folder_ids:
101-
parent_folder = drive.files().get(fileId=parent_id).execute()
102-
if parent_folder['name'] == 'My Drive':
103-
continue
104-
folder_path = os.path.join(folder_path, parent_folder['name'])
105116

106117
file_path = os.path.join(folder_path, file_name)
107118

@@ -115,9 +126,10 @@ def execute(args):
115126
is_cache_enabled = True
116127

117128
if is_cache_enabled:
118-
download_file(drive, file_obj, "data/google_drive")
129+
download_file(drive, file_obj, "data/google_drive/")
119130

120-
matches = system.read_match_strings(file_path, 'gdrive')
131+
matches = system.read_match_strings(file_path, 'gdrive_workspace')
132+
file_name = file_name.replace('-runtime.pdf', '')
121133
if matches:
122134
for match in matches:
123135
results.append({
@@ -136,8 +148,8 @@ def execute(args):
136148
else:
137149
system.print_error("No Google Drive connection details found in connection file")
138150

139-
if not is_cache_enabled:
140-
os.system("rm -rf data/google_drive")
151+
"""if not is_cache_enabled:
152+
os.system("rm -rf data/google_drive")"""
141153

142154
return results
143155

hawk_scanner/internals/system.py

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@
33
import json, requests, argparse, yaml, re, datetime, os, subprocess, platform, hashlib
44
from tinydb import TinyDB, Query
55
import pytesseract
6-
from PIL import Image
6+
from PIL import Image, ImageEnhance
77
from docx import Document
88
from openpyxl import load_workbook
99
import PyPDF2
1010
import patoolib
1111
import tempfile
1212
import shutil
13-
import os
13+
import os, cv2
1414
import tarfile
1515

1616
# Create a TinyDB instance for storing previous alert hashes
@@ -256,10 +256,12 @@ def read_match_strings(file_path, source):
256256

257257
try:
258258
# Check if the file is an image
259+
print(file_path)
259260
if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
260-
# Use OCR to extract text from the image
261-
image = Image.open(file_path)
262-
content = pytesseract.image_to_string(image)
261+
print("ocr started for "+file_path)
262+
content = enhance_and_ocr(file_path)
263+
print("texts")
264+
print(content)
263265
# Check if the file is a PDF document
264266
elif file_path.lower().endswith('.pdf'):
265267
content = read_pdf(file_path)
@@ -411,3 +413,42 @@ def SlackNotify(msg):
411413
db.insert({'msg_hash': msg_hash})
412414
except Exception as e:
413415
print_error(f"An error occurred: {str(e)}")
416+
417+
def enhance_and_ocr(image_path, debug_output_path=None):
    """Run OCR on an image file after pre-processing it for legibility.

    The image at ``image_path`` is opened, passed through ``enhance_image``
    and the enhanced result is handed to ``perform_ocr``.

    Args:
        image_path: Path of the image file to read.
        debug_output_path: Optional path where the enhanced image is saved
            for inspection. Nothing is written when this is ``None`` (the
            default). Previously every call overwrote ``enhanced_image.png``
            in the current working directory, which clobbered earlier output
            and left a copy of scanned content on disk.

    Returns:
        The value returned by ``perform_ocr`` for the enhanced image.
    """
    # The context manager releases the underlying file handle as soon as the
    # enhanced copy has been produced, instead of leaking it until GC.
    with Image.open(image_path) as original_image:
        enhanced_image = enhance_image(original_image)

    # Persist the intermediate image only when the caller explicitly asks.
    if debug_output_path is not None:
        enhanced_image.save(debug_output_path)

    return perform_ocr(enhanced_image)
431+
432+
def enhance_image(image, contrast_factor=2.0, threshold_value=100):
    """Return a grayscale, contrast-boosted, binarised and denoised copy of ``image``.

    Args:
        image: A PIL image to pre-process.
        contrast_factor: Factor passed to ``ImageEnhance.Contrast.enhance``.
            Defaults to 2.0, the value that was previously hard-coded.
        threshold_value: Pixel values below this become 0 and all others
            become 255. Defaults to 100, the value that was previously
            hard-coded.

    Returns:
        A new PIL image built from the denoised pixel array.
    """
    # Imported locally because the denoising step needs a NumPy array and the
    # name ``np`` was otherwise unbound here (NameError at call time).
    import numpy as np

    # Convert to grayscale, then increase contrast.
    grayscale_image = image.convert('L')
    contrast_enhanced_image = ImageEnhance.Contrast(grayscale_image).enhance(contrast_factor)

    # Apply thresholding: map every pixel to pure black or pure white.
    thresholded_image = contrast_enhanced_image.point(
        lambda value: 0 if value < threshold_value else 255
    )

    # Reduce noise with OpenCV's non-local means filter.
    denoised_image = cv2.fastNlMeansDenoising(
        np.array(thresholded_image),
        None,
        h=10,
        templateWindowSize=7,
        searchWindowSize=21,
    )

    return Image.fromarray(denoised_image)
449+
450+
def perform_ocr(image):
    """Extract text from ``image`` with Tesseract OCR.

    Args:
        image: The image object handed to ``pytesseract.image_to_string``.

    Returns:
        The value returned by ``pytesseract.image_to_string`` for ``image``.
    """
    return pytesseract.image_to_string(image)

hawk_scanner/main.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,11 @@ def main():
7272

7373
if args.json:
7474
with open(args.json, 'w') as file:
75+
#file_path = file_path.replace('-runtime.pdf', '')
76+
if 'gdrive_workspace' in grouped_results:
77+
for result in grouped_results['gdrive_workspace']:
78+
result['file_name'] = result['file_name'].replace('-runtime.pdf', '')
79+
7580
file.write(json.dumps(grouped_results, indent=4))
7681
system.print_success(f"Results saved to {args.json}")
7782
sys.exit(0)

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@ pydrive2
2020
appdirs
2121
tqdm
2222
funcy
23-
fsspec
23+
fsspec
24+
opencv-python

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
VERSION = "0.3.6"
1+
VERSION = "0.3.7"
22

33
from setuptools import setup, find_packages
44

0 commit comments

Comments
 (0)