Skip to content

Commit 557c7f8

Browse files
authored
Merge pull request #10 from Toby1219/patch-1
Update scrape.py
2 parents d27e4d7 + c3a7f81 commit 557c7f8

File tree

1 file changed

+51
-29
lines changed

1 file changed

+51
-29
lines changed

Movie Scraper/scrape.py

Lines changed: 51 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -1,65 +1,84 @@
11
from bs4 import BeautifulSoup
22
import requests
3+
from fake_useragent import UserAgent
34

5+
def requestUrl_and_bs4(url:str):
6+
# All request and parser goes through here
7+
agents = UserAgent().random
8+
user_agent={"User-Agent": agents}
49

5-
def getMovieDetails(movieName):
10+
# Fixed the requests (changed status code from 403 to 200)
11+
html = requests.get(url, headers=user_agent)
12+
soup = BeautifulSoup(html.text, 'html.parser')
13+
14+
return soup
15+
16+
17+
def getMovieDetails(movieName:str):
618
url = 'https://www.imdb.com'
719
query = '/search/title?title='
820
movieDetails = {}
921
movienamequery = query+'+'.join(movieName.strip().split(' '))
22+
website_url = url+movienamequery+'&title_type=feature'
1023

11-
html = requests.get(url+movienamequery+'&title_type=feature')
12-
bs = BeautifulSoup(html.text, 'html.parser')
13-
result = bs.find('h3', {'class': 'lister-item-header'})
24+
bs = requestUrl_and_bs4(website_url)
1425

26+
result = bs.find('a', {'class': 'ipc-title-link-wrapper'})
1527
if result is None:
1628
return None
1729

18-
movielink = url+result.a.attrs['href']
19-
movieDetails['name'] = result.a.text
30+
movielink = url+result.attrs['href']
2031

21-
html = requests.get(movielink)
22-
bs = BeautifulSoup(html.text, 'html.parser')
32+
bs = requestUrl_and_bs4(movielink)
33+
34+
# Fix the movie name
35+
movieDetails['name'] = bs.find('h1', {'data-testid': 'hero__pageTitle'}).text
36+
37+
# Fix year, runtime
38+
box_one = bs.find('div', {'class': 'sc-b7c53eda-0 dUpRPQ'}).ul
39+
box = box_one.find_all('li')
2340
try:
24-
movieDetails['year'] = bs.find('span', {'id': 'titleYear'}).a.text
41+
movieDetails['year'] = box[0].text
2542
except AttributeError:
2643
movieDetails['year'] = 'Not available'
27-
subtext = bs.find('div', {'class': 'subtext'})
2844

45+
# Fix genres
46+
box_two = bs.find('div', {'data-testid': "genres"})
2947
movieDetails['genres'] = [
30-
i.text for i in subtext.findAll('a', {'title': None})]
48+
i.text for i in box_two.select('div.ipc-chip-list__scroller>a>span')]
49+
50+
# Fix ratings
3151
try:
32-
movieDetails['rating'] = bs.find(
33-
'div', {'class': 'ratingValue'}).span.text
34-
movieDetails['runtime'] = subtext.time.text.strip()
52+
movieDetails['rating'] = f"{bs.find(
53+
'div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text}/10 ({bs.find('div', {'class': 'sc-bde20123-3 gPVQxL'}).text})"
54+
movieDetails['runtime'] = box[2].text.strip()
3555
except AttributeError:
3656
movieDetails['rating'] = 'Not yet rated'
3757
movieDetails['runtime'] = 'Not available'
38-
movieDetails['release_date'] = subtext.find(
39-
'a', {'title': 'See more release dates'}).text.strip()
4058

41-
creditSummary = bs.findAll('div', {'class': 'credit_summary_item'})
59+
# To get movie release date
60+
movie_release_dates_url= f"{url}{box[0].a.attrs['href']}"
61+
soup = requestUrl_and_bs4(movie_release_dates_url)
62+
63+
movieDetails['release_date'] = soup.select_one('#rel_1 > div > ul > li > span.ipc-metadata-list-item__list-content-item').text
4264

43-
movieDetails['directors'] = [i.text for i in creditSummary[0].findAll('a')]
44-
movieDetails['writers'] = [i.text for i in creditSummary[1].findAll(
45-
'a') if 'name' in i.attrs['href']]
65+
creditSummary = bs.select('div.ipc-metadata-list-item__content-container > ul')
66+
67+
movieDetails['directors'] = [i.text for i in creditSummary[0].select('li>a')]
68+
4669
try:
47-
movieDetails['cast'] = [i.text for i in creditSummary[2].findAll(
48-
'a') if 'name' in i.attrs['href']]
70+
movieDetails['cast'] = [i.text for i in creditSummary[2].select('li>a')]
71+
movieDetails['writers'] = [i.text for i in creditSummary[1].select('li>a')]
4972

5073
except IndexError:
5174
movieDetails['cast']=movieDetails['writers']
5275
movieDetails['writers']='Not found'
53-
html = requests.get(movielink+'plotsummary')
54-
bs = BeautifulSoup(html.text, 'html.parser')
55-
56-
movieDetails['plot'] = bs.find(
57-
'li', {'class': 'ipl-zebra-list__item'}).p.text.strip()
5876

77+
movieDetails['plot'] = bs.find('span', {'data-testid': 'plot-l'}).text.strip()
78+
5979
return movieDetails
6080

61-
62-
if __name__ == "__main__":
81+
def main():
6382
movieName = input('Enter the movie name : \n')
6483
movieDetails = getMovieDetails(movieName)
6584
if movieDetails is None:
@@ -75,3 +94,6 @@ def getMovieDetails(movieName):
7594
print('Writer:', ', '.join(movieDetails['writers']))
7695
print('Cast:', ', '.join(movieDetails['cast']))
7796
print('Plot Summary:\n', movieDetails['plot'])
97+
98+
if __name__ == "__main__":
99+
main()

0 commit comments

Comments
 (0)