1
1
from bs4 import BeautifulSoup
2
2
import requests
3
+ from fake_useragent import UserAgent
3
4
5
def requestUrl_and_bs4(url: str, timeout: float = 10.0):
    """Fetch *url* and return its body parsed with BeautifulSoup.

    Every request and parse in this script goes through here. A randomly
    chosen User-Agent header is sent with each request; per the original
    author's note the site answered 403 instead of 200 without it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The ``BeautifulSoup`` tree built from the response text with the
        ``'html.parser'`` backend.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the timeout expires.
    """
    headers = {"User-Agent": UserAgent().random}
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')
15
+
16
+
17
def _node_text(node, default='Not available'):
    """Return the raw text of a parsed tag, or *default* when the tag is None."""
    return node.text if node is not None else default


def _anchor_texts(container):
    """Return the text of every ``li > a`` link found inside *container*."""
    return [link.text for link in container.select('li>a')]


def getMovieDetails(movieName: str):
    """Search IMDb for a feature film and scrape details of the first hit.

    Up to three pages are requested through ``requestUrl_and_bs4``: the title
    search, the movie page of the first result, and the release-dates page
    linked from the movie page.

    Args:
        movieName: Free-text title to search for.

    Returns:
        ``None`` when the search page contains no result link. Otherwise a
        dict with the keys ``name``, ``year``, ``runtime``, ``genres``,
        ``rating``, ``release_date``, ``directors``, ``writers``, ``cast``
        and ``plot``. Fields that cannot be located on the page fall back to
        ``'Not available'`` (``'Not yet rated'`` for the rating, ``'Not found'``
        for the writers, an empty list for genres/directors/cast).
    """
    # Local import keeps this block self-contained; quote_plus turns spaces
    # into '+' like the previous hand-rolled join did, and additionally
    # escapes characters such as '&' or '#' that would corrupt the query.
    from urllib.parse import quote_plus

    url = 'https://www.imdb.com'
    query = '/search/title?title='
    website_url = url + query + quote_plus(movieName.strip()) + '&title_type=feature'

    bs = requestUrl_and_bs4(website_url)
    result = bs.find('a', {'class': 'ipc-title-link-wrapper'})
    if result is None:
        return None

    movielink = url + result.attrs['href']
    bs = requestUrl_and_bs4(movielink)

    movieDetails = {}
    movieDetails['name'] = _node_text(
        bs.find('h1', {'data-testid': 'hero__pageTitle'}))

    # The <ul> under the title: item 0 is read as the year and item 2 as the
    # runtime. NOTE(review): the 'sc-...' class names look auto-generated and
    # may change between site builds -- confirm; a miss now yields fallbacks
    # instead of an AttributeError.
    meta_block = bs.find('div', {'class': 'sc-b7c53eda-0 dUpRPQ'})
    meta_list = meta_block.ul if meta_block is not None else None
    box = meta_list.find_all('li') if meta_list is not None else []

    movieDetails['year'] = box[0].text if box else 'Not available'
    # Runtime is resolved independently of the rating so that a missing
    # rating no longer discards a runtime that is present (and vice versa).
    movieDetails['runtime'] = (
        box[2].text.strip() if len(box) > 2 else 'Not available')

    genre_block = bs.find('div', {'data-testid': 'genres'})
    if genre_block is None:
        movieDetails['genres'] = []
    else:
        movieDetails['genres'] = [
            chip.text
            for chip in genre_block.select('div.ipc-chip-list__scroller>a>span')
        ]

    # The two lookups are done before formatting: a replacement field that
    # spans several lines inside an f-string is a SyntaxError before 3.12.
    try:
        score = bs.find(
            'div',
            {'data-testid': 'hero-rating-bar__aggregate-rating__score'},
        ).span.text
        votes = bs.find('div', {'class': 'sc-bde20123-3 gPVQxL'}).text
        movieDetails['rating'] = f"{score}/10 ({votes})"
    except AttributeError:
        movieDetails['rating'] = 'Not yet rated'

    # The release date lives on a separate page linked from the year item;
    # skip the extra request when that link is absent.
    release_date = 'Not available'
    year_link = box[0].a if box else None
    if year_link is not None and 'href' in year_link.attrs:
        soup = requestUrl_and_bs4(f"{url}{year_link.attrs['href']}")
        release_date = _node_text(soup.select_one(
            '#rel_1 > div > ul > li > span.ipc-metadata-list-item__list-content-item'))
    movieDetails['release_date'] = release_date

    creditSummary = bs.select('div.ipc-metadata-list-item__content-container > ul')
    credit_rows = [_anchor_texts(row) for row in creditSummary[:3]]

    movieDetails['directors'] = credit_rows[0] if credit_rows else []
    if len(credit_rows) >= 3:
        movieDetails['writers'] = credit_rows[1]
        movieDetails['cast'] = credit_rows[2]
    else:
        # With fewer than three credit rows the second row (if any) is taken
        # as the cast. The old handler read movieDetails['writers'] before it
        # had been assigned and raised KeyError in this situation.
        movieDetails['cast'] = credit_rows[1] if len(credit_rows) == 2 else []
        # Kept as a plain string so the returned shape stays what callers
        # already receive.
        movieDetails['writers'] = 'Not found'

    movieDetails['plot'] = _node_text(
        bs.find('span', {'data-testid': 'plot-l'})).strip()

    return movieDetails
60
80
61
-
62
- if __name__ == "__main__" :
81
+ def main ():
63
82
movieName = input ('Enter the movie name : \n ' )
64
83
movieDetails = getMovieDetails (movieName )
65
84
if movieDetails is None :
@@ -75,3 +94,6 @@ def getMovieDetails(movieName):
75
94
print ('Writer:' , ', ' .join (movieDetails ['writers' ]))
76
95
print ('Cast:' , ', ' .join (movieDetails ['cast' ]))
77
96
print ('Plot Summary:\n ' , movieDetails ['plot' ])
97
+
98
# Start the interactive lookup only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()
0 commit comments