Skip to content

Commit 15c677b

Browse files
committed
feat: Added web scraper script to scrape imdb and get movie details
1 parent 983f730 commit 15c677b

File tree

1 file changed

+77
-0
lines changed

1 file changed

+77
-0
lines changed

Movie Scraper/scrape.py

Lines changed: 77 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,77 @@
1+
from bs4 import BeautifulSoup
2+
import requests
3+
4+
5+
def getMovieDetails(movieName):
    """Search IMDb for a feature film and scrape details of the first hit.

    Three pages are fetched in turn: the title search results, the matched
    title's main page, and that title's plot-summary page.

    Args:
        movieName: Title to search for; surrounding whitespace is ignored.

    Returns:
        None when the search page has no usable result entry. Otherwise a
        dict with these keys:

        * 'name', 'year', 'rating', 'runtime', 'release_date', 'plot':
          strings. When the matching element is absent from the page the
          value is a placeholder ('Not available', or 'Not yet rated' for
          the rating).
        * 'genres', 'directors', 'writers', 'cast': lists of strings.
          'genres' is empty when the subtext section is missing; a missing
          credit row yields ['Not found'], so the value can always be
          passed to ', '.join().

    Raises:
        requests.RequestException: if a request fails or times out.
    """
    from urllib.parse import urljoin, urlsplit, urlunsplit

    url = 'https://www.imdb.com'
    movieDetails = {}

    # Let requests build the query string so that characters such as '&',
    # '#' or '+' in the title are percent-encoded instead of corrupting it.
    searchPage = _fetchSoup(
        url + '/search/title',
        params={'title': movieName.strip(), 'title_type': 'feature'})
    result = searchPage.find('h3', {'class': 'lister-item-header'})
    if result is None or result.a is None:
        return None
    href = result.a.get('href')
    if not href:
        return None

    # Keep only scheme, host and path of the result link. If the href carries
    # a query string or fragment, appending 'plotsummary' to it would modify
    # the query instead of addressing the sub-page.
    parts = urlsplit(urljoin(url, href))
    path = parts.path if parts.path.endswith('/') else parts.path + '/'
    movielink = urlunsplit((parts.scheme, parts.netloc, path, '', ''))
    movieDetails['name'] = result.a.text

    titlePage = _fetchSoup(movielink)
    movieDetails['year'] = _textOrDefault(
        lambda: titlePage.find('span', {'id': 'titleYear'}).a.text,
        'Not available')

    subtext = titlePage.find('div', {'class': 'subtext'})
    if subtext is None:
        movieDetails['genres'] = []
    else:
        # Anchors without a title attribute are taken as the genre links.
        movieDetails['genres'] = [
            a.text for a in subtext.find_all('a', {'title': None})]

    # Each field is looked up independently so that one missing element
    # does not wipe out another field that was found.
    movieDetails['rating'] = _textOrDefault(
        lambda: titlePage.find('div', {'class': 'ratingValue'}).span.text,
        'Not yet rated')
    movieDetails['runtime'] = _textOrDefault(
        lambda: subtext.time.text.strip(),
        'Not available')
    movieDetails['release_date'] = _textOrDefault(
        lambda: subtext.find(
            'a', {'title': 'See more release dates'}).text.strip(),
        'Not available')

    creditRows = titlePage.find_all('div', {'class': 'credit_summary_item'})
    if creditRows:
        movieDetails['directors'] = [
            a.text for a in creditRows[0].find_all('a')]
    else:
        movieDetails['directors'] = ['Not found']

    people = [_personNames(row) for row in creditRows[1:3]]
    if len(people) == 2:
        movieDetails['writers'], movieDetails['cast'] = people
    elif len(people) == 1:
        # With only two credit rows the second one is treated as the cast
        # and the writers are reported as missing.
        movieDetails['writers'] = ['Not found']
        movieDetails['cast'] = people[0]
    else:
        movieDetails['writers'] = ['Not found']
        movieDetails['cast'] = ['Not found']

    plotPage = _fetchSoup(movielink + 'plotsummary')
    movieDetails['plot'] = _textOrDefault(
        lambda: plotPage.find(
            'li', {'class': 'ipl-zebra-list__item'}).p.text.strip(),
        'Not available')

    return movieDetails


def _fetchSoup(link, params=None):
    """Download *link* and return its body parsed with 'html.parser'."""
    # Without a timeout requests may wait indefinitely on a stalled server.
    response = requests.get(link, params=params, timeout=10)
    return BeautifulSoup(response.text, 'html.parser')


def _textOrDefault(lookup, default):
    """Return lookup(), or *default* if a tag on the lookup path is missing.

    BeautifulSoup returns None for an absent tag, so chained attribute
    access on a missing element raises AttributeError.
    """
    try:
        return lookup()
    except AttributeError:
        return default


def _personNames(creditRow):
    """Return the text of the anchors in *creditRow* whose href has 'name'."""
    # NOTE(review): presumably this keeps person pages and drops links such
    # as "see full cast" -- confirm against the page markup.
    return [a.text for a in creditRow.find_all('a')
            if 'name' in a.get('href', '')]
60+
61+
62+
# Command-line entry point: ask for a title, look it up and print the result.
if __name__ == "__main__":
    movieName = input('Enter the movie name : \n')
    movieDetails = getMovieDetails(movieName)
    if movieDetails is None:
        # Nothing to print; falling through ends the script normally without
        # relying on quit(), which only exists when the site module is loaded.
        print('No movie found with given name!!!!!')
    else:
        print('\n{movie} ({year})'.format(
            movie=movieDetails['name'], year=movieDetails['year']))
        print('Rating:', movieDetails['rating'])
        print('Runtime:', movieDetails['runtime'])
        print('Release Date:', movieDetails['release_date'])

        listFields = (
            ('Genres:', 'genres'),
            ('Director:', 'directors'),
            ('Writer:', 'writers'),
            ('Cast:', 'cast'),
        )
        for label, key in listFields:
            value = movieDetails[key]
            # A field may hold a plain placeholder string instead of a list;
            # joining a string would insert ', ' between its characters.
            if isinstance(value, str):
                print(label, value)
            else:
                print(label, ', '.join(value))

        print('Plot Summary:\n', movieDetails['plot'])

0 commit comments

Comments
 (0)