Skip to content

Commit 3198cf7

Browse files
committed
gui interface added
1 parent 81bf427 commit 3198cf7

File tree

1 file changed

+125
-0
lines changed

1 file changed

+125
-0
lines changed

amazon_scrapping/Gui_ocr.py

Lines changed: 125 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,125 @@
1+
import pytesseract
2+
from PIL import Image
3+
from selenium import webdriver
4+
from selenium.webdriver.chrome.options import Options
5+
from selenium.webdriver.support.ui import WebDriverWait
6+
from selenium.common.exceptions import TimeoutException
7+
from io import BytesIO
8+
import random
9+
import os
10+
import csv
11+
import time
12+
import tkinter as tk
13+
from tkinter import filedialog, messagebox
14+
15+
# Location of the Tesseract executable used by pytesseract.
# Set the TESSERACT_CMD environment variable to point at your own installation;
# when it is unset, the path below (the original hard-coded default) is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Users\kulitesh\Scrape-ML\Tesseract-OCR\tesseract.exe',
)
17+
18+
def take_screenshot_and_analyze(url, output_csv, save_location, num_screenshots=4):
    """Scroll through a page, OCR a screenshot after each scroll, and save the text.

    Opens ``url`` in headless Chrome and waits up to 20 seconds for
    ``document.readyState`` to become ``'complete'``. It then repeats
    ``num_screenshots`` times: scroll down by a random 500-1000 pixels, pause
    for one second, capture the viewport, save it as
    ``<save_location>/Screenshots/screenshot_<n>.png`` and run Tesseract OCR on
    the image. The collected rows are passed to ``write_to_csv``.

    Args:
        url: Address of the page to load.
        output_csv: File name of the CSV report, created inside ``save_location``.
        save_location: Directory receiving the CSV and the ``Screenshots`` folder.
        num_screenshots: Number of scroll-and-capture rounds to perform.

    Returns:
        ``True`` once the CSV has been written, ``False`` if a Selenium
        ``TimeoutException`` was raised (the timeout message is printed).
        Any other exception propagates after the browser has been closed.
    """
    options = Options()
    # Request headless mode through a command-line switch: recent Selenium 4
    # releases no longer honour the old ``options.headless`` attribute.
    options.add_argument("--headless")

    # Sentinel so the ``finally`` clause knows whether a browser was started.
    driver = None
    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        WebDriverWait(driver, 20).until(
            lambda drv: drv.execute_script('return document.readyState') == 'complete'
        )

        # Create a directory to store screenshots if it doesn't exist
        screenshot_dir = os.path.join(save_location, "Screenshots")
        os.makedirs(screenshot_dir, exist_ok=True)

        data = []  # One row per screenshot: file path plus OCR text

        for i in range(num_screenshots):
            # Scroll down a random amount
            scroll_amount = random.randint(500, 1000)  # Adjust as needed
            driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
            # Give the page a moment to settle after scrolling
            time.sleep(1)  # Adjust scroll time as needed

            # Capture screenshot
            screenshot = driver.get_screenshot_as_png()
            image = Image.open(BytesIO(screenshot))

            # Save screenshot to file
            screenshot_path = os.path.join(screenshot_dir, f"screenshot_{i + 1}.png")
            image.save(screenshot_path)

            # Use Tesseract OCR to extract text
            extracted_text = pytesseract.image_to_string(image)
            print(f"Extracted Text from screenshot {i + 1}:", extracted_text)

            # Add the extracted text to the data list
            data.append({"Screenshot": screenshot_path, "Extracted Text": extracted_text})

        # Write the scraped data to a CSV file
        write_to_csv(data, output_csv, save_location)
        return True

    except TimeoutException:
        print("Timed out waiting for page to load")
        return False

    finally:
        # Always release the browser, even when an error interrupted the run.
        if driver is not None:
            driver.quit()
65+
66+
def write_to_csv(data, output_csv, save_location):
    """Write the scraped rows to ``<save_location>/<output_csv>``.

    The file is overwritten if it already exists and always starts with the
    header ``Screenshot,Extracted Text``.

    Args:
        data: Iterable of dicts, each with the keys ``"Screenshot"`` and
            ``"Extracted Text"``.
        output_csv: File name of the CSV file.
        save_location: Directory in which the file is created; it is created
            first when it does not exist yet.

    Returns:
        The full path of the CSV file that was written.
    """
    # Make sure the target directory exists. An empty save_location means the
    # current directory, for which there is nothing to create.
    if save_location:
        os.makedirs(save_location, exist_ok=True)

    # Define CSV file path
    csv_file = os.path.join(save_location, output_csv)

    # newline='' lets the csv module control line endings itself, so OCR text
    # containing line breaks is quoted correctly.
    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["Screenshot", "Extracted Text"])
        writer.writeheader()
        writer.writerows(data)

    print(f"Scraped data written to {csv_file}")
    return csv_file
77+
78+
def browse_folder():
    """Let the user pick a directory and copy it into the save-location field.

    If the dialog is dismissed without a selection, the field keeps whatever
    it already contained.
    """
    folder_selected = filedialog.askdirectory()
    # A cancelled dialog yields an empty value; do not wipe the existing entry.
    if not folder_selected:
        return
    save_location_entry.delete(0, tk.END)
    save_location_entry.insert(0, folder_selected)
82+
83+
def start_analysis():
    """Validate the form fields, run the screenshot/OCR job, and report the outcome.

    Shows a warning when any field is empty (after trimming surrounding
    whitespace), an error dialog when the job raises or reports failure, and
    an information dialog with the CSV path otherwise.
    """
    url_to_analyze = url_entry.get().strip()
    output_csv = output_csv_entry.get().strip()
    save_location = save_location_entry.get().strip()

    if not (url_to_analyze and output_csv and save_location):
        messagebox.showwarning("Input Error", "Please fill in all fields.")
        return

    # This runs inside a Tk callback, where an uncaught exception would only
    # reach stderr; surface it to the user instead.
    try:
        outcome = take_screenshot_and_analyze(url_to_analyze, output_csv, save_location)
    except Exception as exc:
        messagebox.showerror("Error", f"Analysis failed: {exc}")
        return

    # Only an explicit False counts as failure; any other result, including
    # no return value at all, is treated as success.
    if outcome is False:
        messagebox.showerror("Error", "The analysis did not complete; no data was written.")
        return

    messagebox.showinfo("Success", f"Scraped data written to {os.path.join(save_location, output_csv)}")
94+
95+
# Set up the main application window
root = tk.Tk()
root.title("Sentiment Analysis Tool")

# Set window size and background color
root.geometry("600x300")
root.configure(bg="black")

# Colour options shared by the widgets below, so the theme is defined once.
LABEL_STYLE = {"bg": "black", "fg": "white"}
INPUT_STYLE = {"bg": "white", "fg": "black"}

# URL input
tk.Label(root, text="Website URL:", **LABEL_STYLE).grid(row=0, column=0, padx=10, pady=5, sticky="e")
url_entry = tk.Entry(root, width=50, **INPUT_STYLE)
url_entry.grid(row=0, column=1, padx=10, pady=5)

# Output CSV file name input
tk.Label(root, text="Output CSV File Name:", **LABEL_STYLE).grid(row=1, column=0, padx=10, pady=5, sticky="e")
output_csv_entry = tk.Entry(root, width=50, **INPUT_STYLE)
output_csv_entry.grid(row=1, column=1, padx=10, pady=5)

# Save location input, with a button that opens a directory picker
tk.Label(root, text="Save Location:", **LABEL_STYLE).grid(row=2, column=0, padx=10, pady=5, sticky="e")
save_location_entry = tk.Entry(root, width=50, **INPUT_STYLE)
save_location_entry.grid(row=2, column=1, padx=10, pady=5)
browse_button = tk.Button(root, text="Browse", command=browse_folder, **INPUT_STYLE)
browse_button.grid(row=2, column=2, padx=10, pady=5)

# Start analysis button
start_button = tk.Button(root, text="Start Analysis", command=start_analysis, **INPUT_STYLE)
start_button.grid(row=3, column=1, padx=10, pady=20)

# Start the Tkinter event loop (blocks until the window is closed)
root.mainloop()

0 commit comments

Comments
 (0)