Skip to content

Commit 5b428dd

Browse files
authored
Update Scarper.py
1 parent 570639b commit 5b428dd

File tree

1 file changed

+106
-86
lines changed

1 file changed

+106
-86
lines changed

Web_app/Scarper.py

Lines changed: 106 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -1,93 +1,113 @@
import csv
import os
import re
import time

import streamlit as st
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
11+
# Function to scrape IMDb data
def scrape_imdb_data():
    """Scrape title cards from IMDb's advanced search into ``movies.csv``.

    Launches headless Chrome, repeatedly clicks the "see more" button
    (at most 300 times), parses each loaded result card with
    BeautifulSoup, and appends the extracted rows to ``movies.csv``.

    Returns:
        None. The CSV file on disk is the useful side effect.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--headless')  # Run Chrome in headless mode

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(options=options, service=service)

    driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')
    driver.set_script_timeout(10000)

    def load_more_results():
        # Click IMDb's "see more" button. Returns False when no clickable
        # button is found within 10s, which the caller treats as "no more
        # pages" and stops.
        try:
            load_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//button[contains(@class, "ipc-see-more__button")]'))
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
            driver.execute_script("arguments[0].click();", load_more_button)
            time.sleep(2)  # give the newly loaded cards time to render
            return True
        except Exception as e:
            print(f"Error: {e}")
            return False

    def save_to_csv(movies, filename='movies.csv'):
        # Append rows to the CSV; write the header row only when the file
        # is being created for the first time.
        if not movies:  # guard: movies[0] below would raise on an empty list
            return
        file_exists = os.path.isfile(filename)
        keys = movies[0].keys()
        with open(filename, 'a', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=keys)
            if not file_exists:
                dict_writer.writeheader()
            dict_writer.writerows(movies)

    all_movies = []
    cnt = 0
    try:
        while cnt < 300:
            cnt += 1
            if not load_more_results():
                break

            # NOTE(review): this selector targets IMDb's legacy "lister"
            # markup, while load_more_results targets the newer "ipc"
            # layout — confirm which page version the site actually serves.
            movie_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'lister-item mode-advanced')]")

            for element in movie_elements:
                soup = BeautifulSoup(element.get_attribute('outerHTML'), 'html.parser')

                # Each field is best-effort: a missing node yields "NA"
                # instead of aborting the whole scrape. Narrow exceptions
                # (AttributeError from find() returning None, IndexError
                # from a short find_all result) replace the bare excepts.
                try:
                    org_title = soup.find("h3", class_="lister-item-header").find("a").text
                    title = re.sub(r'\d+\.\s*', '', org_title)  # strip "12. " rank prefix
                except AttributeError:
                    title = "NA"

                try:
                    year = soup.find("span", class_="lister-item-year").text
                except AttributeError:
                    year = "NA"

                try:
                    rating = soup.find("div", class_="ratings-bar").find("strong").text
                except AttributeError:
                    rating = "NA"

                try:
                    description = soup.find_all("p", class_="text-muted")[1].text.strip()
                except (AttributeError, IndexError):
                    description = "NA"

                all_movies.append({
                    'title': title,
                    'type': "Tv-Series",
                    'year': year,
                    'rating': rating,
                    'description': description
                })

            if all_movies:
                save_to_csv(all_movies)
                all_movies = []  # flush per page so a crash loses at most one batch
    finally:
        # Fix: always release the browser, even if scraping raises midway;
        # previously an exception leaked the headless Chrome process.
        driver.quit()
92+
93+
# Streamlit App
def main():
    """Streamlit entry point: a scrape button plus a preview of movies.csv."""
    st.title("IMDb Scraper")

    if st.button("Scrape IMDb Data"):
        with st.spinner("Scraping IMDb data..."):
            scrape_imdb_data()
        st.success("Data scraped successfully!")

    # Always show whatever has been scraped so far.
    st.subheader("Scraped IMDb Data:")
    csv_path = 'movies.csv'
    if not os.path.exists(csv_path):
        st.error("CSV file not found.")
        return
    with open(csv_path, 'r', encoding='utf-8') as csv_file:
        st.code(csv_file.read(), language='csv')


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)