
Commit 8031577

Revert "deleted irrevelant files"
This reverts commit a484acb.
1 parent a484acb commit 8031577

18 files changed: +29,524 −0 lines
File renamed without changes.

README (1).md

Lines changed: 227 additions & 0 deletions
Large diffs are not rendered by default.

abstract_training.ipynb

Lines changed: 751 additions & 0 deletions
Large diffs are not rendered by default.

amazon_scrapping/Amazon-dataset/Product listing.csv

Lines changed: 3155 additions & 0 deletions
Large diffs are not rendered by default.
Binary file not shown.

amazon_scrapping/Dataset-Amazon_Reviews.csv

Lines changed: 501 additions & 0 deletions
Large diffs are not rendered by default.

amazon_scrapping/Gui_ocr.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
```python
import pytesseract
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from io import BytesIO
import random
import os
import csv
import time
import tkinter as tk
from tkinter import filedialog, messagebox

# Update this line with your Tesseract installation path
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\kulitesh\Scrape-ML\Tesseract-OCR\tesseract.exe'

def take_screenshot_and_analyze(url, output_csv, save_location, num_screenshots=4):
    options = Options()
    # `options.headless = True` was removed in recent Selenium 4 releases; pass the flag instead
    options.add_argument("--headless")

    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        WebDriverWait(driver, 20).until(
            lambda driver: driver.execute_script('return document.readyState') == 'complete'
        )

        # Create a directory to store screenshots if it doesn't exist
        screenshot_dir = os.path.join(save_location, "Screenshots")
        if not os.path.exists(screenshot_dir):
            os.makedirs(screenshot_dir)

        data = []  # List to store scraped data

        for i in range(num_screenshots):
            # Scroll down a random amount
            scroll_amount = random.randint(500, 1000)  # Adjust as needed
            driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
            # Add some waiting time after scrolling
            time.sleep(1)  # Adjust scroll time as needed

            # Capture screenshot
            screenshot = driver.get_screenshot_as_png()
            image = Image.open(BytesIO(screenshot))

            # Save screenshot to file
            screenshot_path = os.path.join(screenshot_dir, f"screenshot_{i + 1}.png")
            image.save(screenshot_path)

            # Use Tesseract OCR to extract text
            extracted_text = pytesseract.image_to_string(image)
            print(f"Extracted Text from screenshot {i + 1}:", extracted_text)

            # Add the extracted text to the data list
            data.append({"Screenshot": screenshot_path, "Extracted Text": extracted_text})

        # Write the scraped data to a CSV file
        write_to_csv(data, output_csv, save_location)

    except TimeoutException:
        print("Timed out waiting for page to load")

    finally:
        if 'driver' in locals():
            driver.quit()

def write_to_csv(data, output_csv, save_location):
    # Define CSV file path
    csv_file = os.path.join(save_location, output_csv)

    # Write data to CSV file
    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["Screenshot", "Extracted Text"])
        writer.writeheader()
        writer.writerows(data)

    print(f"Scraped data written to {csv_file}")

def browse_folder():
    folder_selected = filedialog.askdirectory()
    save_location_entry.delete(0, tk.END)
    save_location_entry.insert(0, folder_selected)

def start_analysis():
    url_to_analyze = url_entry.get()
    output_csv = output_csv_entry.get()
    save_location = save_location_entry.get()

    if not url_to_analyze or not output_csv or not save_location:
        messagebox.showwarning("Input Error", "Please fill in all fields.")
        return

    take_screenshot_and_analyze(url_to_analyze, output_csv, save_location)
    messagebox.showinfo("Success", f"Scraped data written to {os.path.join(save_location, output_csv)}")

# Set up the main application window
root = tk.Tk()
root.title("Sentiment Analysis Tool")

# Set window size and background color
root.geometry("600x300")
root.configure(bg="black")

# URL input
tk.Label(root, text="Website URL:", bg="black", fg="white").grid(row=0, column=0, padx=10, pady=5, sticky="e")
url_entry = tk.Entry(root, width=50, bg="white", fg="black")
url_entry.grid(row=0, column=1, padx=10, pady=5)

# Output CSV file name input
tk.Label(root, text="Output CSV File Name:", bg="black", fg="white").grid(row=1, column=0, padx=10, pady=5, sticky="e")
output_csv_entry = tk.Entry(root, width=50, bg="white", fg="black")
output_csv_entry.grid(row=1, column=1, padx=10, pady=5)

# Save location input
tk.Label(root, text="Save Location:", bg="black", fg="white").grid(row=2, column=0, padx=10, pady=5, sticky="e")
save_location_entry = tk.Entry(root, width=50, bg="white", fg="black")
save_location_entry.grid(row=2, column=1, padx=10, pady=5)
browse_button = tk.Button(root, text="Browse", command=browse_folder, bg="white", fg="black")
browse_button.grid(row=2, column=2, padx=10, pady=5)

# Start analysis button
start_button = tk.Button(root, text="Start Analysis", command=start_analysis, bg="white", fg="black")
start_button.grid(row=3, column=1, padx=10, pady=20)

# Start the Tkinter event loop
root.mainloop()
```
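For quick testing without the Tkinter window, the core function can also be called directly. This is a minimal sketch, assuming the GUI setup below `start_analysis` is guarded or removed first; the URL and paths are illustrative placeholders, not values from the repository:

```python
# Hypothetical direct invocation of the scraper defined above.
# The search URL and save location are placeholders.
take_screenshot_and_analyze(
    url="https://www.amazon.com/s?k=laptop",
    output_csv="scraped_text.csv",
    save_location=r"C:\temp\scrape_output",
    num_screenshots=4,
)
```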

amazon_scrapping/README.md

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
<h1 align="center">Amazon Scraping</h1>
<blockquote align="center">Scraping the product listings ✏️ using the Python programming language 💻.</blockquote>
<p align="center">For new data generation (the <b>classification part</b>), we have written a Python script to fetch 📊 data from the Amazon website 🌐 and convert it into CSV files.</p>

# Introduction

The **`Semi-supervised-sequence-learning-Project`** :computer: replication is done here, and generating new data is required for further analysis.

- The repository includes the following script (a minimal sketch of the approach appears below this list):
  - `scrapping.py` - Script to scrape the data from the Amazon website
  - Product label listing: `Laptop`, `Phones`, `Printers`, `Desktops`, `Monitors`, `Mouse`, `Pendrive`, `Earphones`, `Smart TV`, `Power banks`
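Since `scrapping.py` itself is not rendered in this diff, the following is only a hedged sketch of the listing-scraping approach the README describes, not the actual script. The search URL, CSS selectors, label, and output filename are illustrative assumptions:

```python
# Sketch of a listing scraper: fetch one category search page and
# write product titles to CSV. Selectors and URL are assumptions.
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
driver.get("https://www.amazon.com/s?k=laptop")  # example category search

rows = []
for card in driver.find_elements(By.CSS_SELECTOR, "div[data-component-type='s-search-result']"):
    title = card.find_element(By.CSS_SELECTOR, "h2").text  # assumed title element
    rows.append({"label": "Laptop", "title": title})

driver.quit()

with open("Product listing.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["label", "title"])
    writer.writeheader()
    writer.writerows(rows)
```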
## Dependencies

- Install Selenium using `pip install -U selenium`
- Install Python 3 using the [MSI available on the python.org download page](http://www.python.org/download).
- Load the drivers:

```python
from selenium import webdriver

browser = webdriver.Firefox()
browser.get('http://selenium.dev/')
```
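The `Gui_ocr.py` script in this commit drives Chrome rather than Firefox; the equivalent driver load, assuming a compatible chromedriver is available on `PATH`, would look like this:

```python
# Chrome variant of the driver load above; assumes chromedriver is on PATH
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")  # optional: run without a visible window
browser = webdriver.Chrome(options=options)
browser.get('http://selenium.dev/')
browser.quit()
```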
- Selenium Server (optional):

```bash
java -jar selenium-server-standalone-4.0.0.jar
```
## Installation

**1️⃣ Fork the `Semi-supervised-sequence-learning-Project/` repository**
Link to [`Semi-supervised-sequence-learning-Project`](https://github.com/sanjay-kv/Semi-supervised-sequence-learning-Project).
Follow these instructions on [how to fork a repository](https://help.github.com/en/articles/fork-a-repo).

**2️⃣ Clone the repository**
Once you have set up your fork of the `Semi-supervised-sequence-learning-Project` repository, clone it to your local machine so you can make and test all of your personal edits before adding them to the master version of `Semi-supervised-sequence-learning-Project`.

Navigate to the location on your computer where you want to host your code. Once in the appropriate folder, run the following command to clone the repository to your local machine.

```bash
git clone https://github.com/your-username/Semi-supervised-sequence-learning-Project.git
```

## Final Dataset

1️⃣ Here is the link to the **final dataset:** [Drive Link](https://drive.google.com/drive/folders/1HB8FCUVqkQpSbV7syq2ZsZ59kT5B-SGo?usp=sharing)
