Skip to content

Commit 6b84edf

Browse files
Create Updated_Scrapper.py
1 parent 4c83bc7 commit 6b84edf

File tree

1 file changed

+88
-0
lines changed

1 file changed

+88
-0
lines changed

Web_app/Updated_Scrapper.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
import os
import random
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
    TimeoutException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def load_more_results(driver):
    """Scroll to and click the 'see more' / next-page button on an IMDb list page.

    Returns True when the button was found and clicked (after a short random
    delay to mimic human pacing), False when no clickable button appears
    within 10 seconds -- the caller's signal to stop paginating.
    """
    try:
        # BUGFIX: the original XPath was malformed -- the "or" sat outside the
        # predicate bracket ("...')] or contains(...)]"), which raises
        # InvalidSelectorException. Both conditions belong in one predicate.
        load_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((
                By.XPATH,
                "//button[contains(@class, 'ipc-see-more__button') or contains(@class, 'next-page')]",
            ))
        )
        # Scroll the button into view first, then click via JS so an overlay
        # can't intercept the native click.
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
        driver.execute_script("arguments[0].click();", load_more_button)
        # Random pause so the request pacing looks less bot-like.
        time.sleep(random.uniform(1, 3))
        return True
    except (NoSuchElementException, ElementNotInteractableException, TimeoutException) as e:
        print(f"Error loading more results: {e}")
        return False
30+
31+
def scrape_movie_data(driver):
    """Parse the currently loaded IMDb search-results page into a list of dicts.

    Each dict has keys 'title', 'type' (hard-coded "Tv-Series"), 'year',
    'rating', 'description'; any field that cannot be located is "NA".
    Requires ``import re`` at module level (the original body used ``re``
    without importing it, which raised NameError at runtime).
    """
    all_movies = []
    # Absolute XPath down to the <ul> of result cards -- fragile; this will
    # break whenever IMDb changes its layout. TODO(review): prefer a
    # class-based selector if one is stable enough.
    movie_elements = driver.find_element(
        By.XPATH,
        "/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul",
    )
    soup = BeautifulSoup(movie_elements.get_attribute('outerHTML'), 'html.parser')
    for item in soup.find_all("li", class_="ipc-metadata-list-summary-item"):
        # soup.find(...) returns None on a miss, so .text raises
        # AttributeError; catch only that (plus IndexError for the rating
        # split) instead of a bare except that would also hide real bugs
        # and swallow KeyboardInterrupt.
        try:
            # Strip the leading "N. " ranking prefix from the title.
            title = re.sub(r'\d+\.\s*', '', item.find("h3", class_="ipc-title__text").text)
        except AttributeError:
            title = "NA"
        try:
            # Generated (sc-*) class names churn on redeploys -- expect "NA"s.
            year = item.find("span", class_="sc-b189961a-8 kLaxqf dli-title-metadata-item").text
        except AttributeError:
            year = "NA"
        try:
            rating = item.find(
                "span",
                class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating',
            ).text.split()[0]
        except (AttributeError, IndexError):
            rating = "NA"
        try:
            description = item.find("div", class_='ipc-html-content-inner-div').text
        except AttributeError:
            description = "NA"
        all_movies.append({
            'title': title,
            'type': "Tv-Series",
            'year': year,
            'rating': rating,
            'description': description,
        })
    return all_movies
64+
65+
def main():
    """Drive the scrape: open the IMDb search page, page through all results,
    and save them to 'imdb_movies.csv' in the working directory."""
    # Raw string: the original "C:\c..." literal contains invalid escape
    # sequences (SyntaxWarning on Python 3.12+, slated to become an error).
    # Replace with the actual path to your chromedriver executable.
    driver_path = r"C:\chromedriver-win64\chromedriver-win64\chromedriver.exe"
    user_data_dir = os.path.join(
        os.path.expanduser('~'), 'AppData/Local/Google/Chrome/User Data/Default'
    )
    options = Options()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--headless')
    # Reuse the local Chrome profile (cookies, etc.) for the session.
    options.add_argument(f'user-data-dir={user_data_dir}')
    service = Service(executable_path=driver_path)
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')
        all_movies_data = []
        # Keep scraping pages until the "load more" button stops appearing.
        while True:
            all_movies_data.extend(scrape_movie_data(driver))
            if not load_more_results(driver):
                break
        pd.DataFrame(all_movies_data).to_csv('imdb_movies.csv', index=False)
    finally:
        # Always release the browser, even if scraping raises mid-run
        # (the original leaked the Chrome process on any exception).
        driver.quit()
    print("Data scraped and saved to 'imdb_movies.csv'")
87+
# Script entry point: run the scraper only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)