Skip to content

Commit 836bf60

Browse files
2 parents 6798b8c + 87d510d commit 836bf60

File tree

8 files changed

+688
-44
lines changed

8 files changed

+688
-44
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.e

Movie_review_rotten_tomatoes.ipynb

Lines changed: 338 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,338 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"## Importing the necessary libraries"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": null,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import requests\n",
17+
"from bs4 import BeautifulSoup\n",
18+
"\n",
19+
"import pandas as pd\n",
20+
"import numpy as np\n",
21+
"import itertools  # to create efficient looping to fetch more data in a go\n",
22+
"import re \n",
23+
"import random \n",
24+
"from textblob import TextBlob"
25+
]
26+
},
27+
{
28+
"cell_type": "markdown",
29+
"metadata": {},
30+
"source": [
31+
"## Movie Urls\n",
32+
"\n",
33+
"- https://www.rottentomatoes.com/browse/movies_at_home/audience:upright~critics:fresh?page=5\n",
34+
"\n",
35+
"- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled~critics:fresh?page=5\n",
36+
"\n",
37+
"- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled,upright~critics:fresh?page=5\n",
38+
"\n",
39+
"- https://www.rottentomatoes.com/browse/movies_at_home/audience:upright~critics:certified_fresh?page=5\n",
40+
"\n",
41+
"- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled~critics:certified_fresh?page=5\n",
42+
"\n",
43+
"- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled,upright~critics:certified_fresh?page=5\n",
44+
"\n",
45+
"- https://www.rottentomatoes.com/browse/movies_at_home/audience:upright~critics:rotten?page=5\n",
46+
"\n",
47+
"- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled~critics:rotten?page=5\n",
48+
"\n",
49+
"- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled,upright~critics:rotten?page=5\n",
50+
"\n",
51+
"Here we use page=5 as rottentomatoes will only allow us to check 140 movies at a time."
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": null,
57+
"metadata": {},
58+
"outputs": [],
59+
"source": [
60+
url = "https://www.rottentomatoes.com/browse/movies_at_home/audience:upright~critics:fresh?page=5"

def getSoup(url):
    """Fetch `url` and return the page parsed as a BeautifulSoup document.

    NOTE(review): 'Your User-Agent String' / 'Bearer Your_Authentication_Token'
    are placeholders -- replace them with real values before running, and read
    any real token from the environment rather than hardcoding it here.
    """
    headers = {
        'User-Agent': 'Your User-Agent String',
        'Authorization': 'Bearer Your_Authentication_Token'  # Include this if authentication is required
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

def getReviewText(review_url):
    '''Return the text of the first review on a reviews-page soup.

    FIX: returns '' when the page has no <p class="review-text"> tag;
    the original called .getText() on None and raised AttributeError
    for movies without reviews.
    '''
    tag = review_url.find('p', attrs={'class': 'review-text'})
    return tag.getText() if tag is not None else ''

def getMovieTitle(review_url):
    '''Return the movie title parsed from a reviews-page soup.

    FIX: guards against a missing <title> tag (returns '') instead of
    crashing; otherwise identical parsing of the title text.
    '''
    tag = review_url.find('title')
    if tag is None:
        return ''
    title_tag = tag.getText()

    # split the title and remove the " - Movie Reviews | Rotten Tomatoes" suffix
    movie_title = title_tag.split(' - Movie Reviews | Rotten Tomatoes')[0]
    return movie_title


def getNounChunks(user_review):
    '''Return the noun chunks of `user_review` as a list of strings.

    NOTE(review): depends on a spaCy pipeline bound to the name `nlp`,
    which is never defined or imported anywhere in this notebook --
    add e.g. `nlp = spacy.load("en_core_web_sm")` before calling this.
    '''
    # create the doc object
    doc = nlp(user_review)
    # convert noun_chunks from span objects to strings, otherwise it won't pick
    noun_chunks_strlist = [chunk.text for chunk in doc.noun_chunks]
    return noun_chunks_strlist
99+
]
100+
},
101+
{
102+
"cell_type": "markdown",
103+
"metadata": {},
104+
"source": [
105+
"# Filtering the movie tags"
106+
]
107+
},
108+
{
109+
"cell_type": "code",
110+
"execution_count": null,
111+
"metadata": {},
112+
"outputs": [],
113+
"source": [
114+
movies_soup = getSoup(url)

# Collect link tiles from both the current and the legacy listing markup.
selectors = [
    {'data-qa': "discovery-media-list-item-caption"},
    {'class': "js-tile-link"},
]
movie_tags = []
for attrs in selectors:
    movie_tags += movies_soup.find_all('a', attrs=attrs)

# Keep just the href of each anchor tag.
movie_links = [tag['href'] for tag in movie_tags]
# De-duplicate while preserving order.
unique_movie_links = list(dict.fromkeys(movie_links))

print("There are a total of " + str(len(unique_movie_links)) + " movie titles")
print("Displaying 10 titles")
unique_movie_links[:10]
125+
]
126+
},
127+
{
128+
"cell_type": "markdown",
129+
"metadata": {},
130+
"source": [
131+
"## Filtering the movie URL's"
132+
]
133+
},
134+
{
135+
"cell_type": "code",
136+
"execution_count": null,
137+
"metadata": {},
138+
"outputs": [],
139+
"source": [
140+
"\n",
141+
base_url = "https://www.rottentomatoes.com"
# Build the reviews-page URL for every movie.
# FIX: iterate over `unique_movie_links` (the de-duplicated hrefs computed
# above) instead of raw `movie_tags`, so movies that appear in more than one
# listing tile are not scraped twice.
movie_links = [base_url + href + '/reviews' for href in unique_movie_links]
print("There are a total of " + str(len(movie_links)) + " movie user reviews")
print("Displaying 20 user reviews links")
movie_links[:20]
146+
]
147+
},
148+
{
149+
"cell_type": "code",
150+
"execution_count": null,
151+
"metadata": {},
152+
"outputs": [],
153+
"source": [
154+
# Download and parse every reviews page (one soup per movie).
movie_soups = []
for link in movie_links:
    movie_soups.append(getSoup(link))

# Extract the (first) review text from each page.
movie_review_list = [getReviewText(soup) for soup in movie_soups]
157+
]
158+
},
159+
{
160+
"cell_type": "code",
161+
"execution_count": null,
162+
"metadata": {},
163+
"outputs": [],
164+
"source": [
165+
# Checking how many movie reviews we were able to collect.
# FIX: the original ran `list(itertools.chain(*movie_review_list))`, but each
# element is already a single review STRING, so chaining flattened the strings
# into individual CHARACTERS -- the printed total was a character count, and
# downstream consumers saw one-character "reviews". The list is already flat.
print("There are a total of " + str(len(movie_review_list)) + " individual movie reviews")
print("Displaying 10 reviews")
print(movie_review_list[:10])
171+
]
172+
},
173+
{
174+
"cell_type": "markdown",
175+
"metadata": {},
176+
"source": [
177+
"## Converting into the Pandas Data Frame"
178+
]
179+
},
180+
{
181+
"cell_type": "code",
182+
"execution_count": null,
183+
"metadata": {},
184+
"outputs": [],
185+
"source": [
186+
# Extract the review text and movie title from every reviews-page soup.
review_texts = []
movie_titles = []
for soup in movie_soups:
    review_texts.append(getReviewText(soup))
    movie_titles.append(getMovieTitle(soup))
print(movie_titles)

# Keep only the user reviews (no links or titles) in the dataframe.
df = pd.DataFrame({'user_review': review_texts})
196+
]
197+
},
198+
{
199+
"cell_type": "code",
200+
"execution_count": null,
201+
"metadata": {},
202+
"outputs": [],
203+
"source": [
204+
df.head(5)  # preview the resulting dataframe
205+
]
206+
},
207+
{
208+
"cell_type": "markdown",
209+
"metadata": {},
210+
"source": [
211+
"## Remove the index and limit reviews to fewer than 50 words"
212+
]
213+
},
214+
{
215+
"cell_type": "code",
216+
"execution_count": null,
217+
"metadata": {},
218+
"outputs": [],
219+
"source": [
220+
# Pull the review strings out of the dataframe for length filtering.
text_list = df['user_review'].tolist()
222+
]
223+
},
224+
{
225+
"cell_type": "code",
226+
"execution_count": null,
227+
"metadata": {},
228+
"outputs": [],
229+
"source": [
230+
# Word count of each review, stored as a helper column for filtering.
text_list_length = [len(review.split()) for review in text_list]
df['length'] = text_list_length
df
234+
]
235+
},
236+
{
237+
"cell_type": "code",
238+
"execution_count": null,
239+
"metadata": {},
240+
"outputs": [],
241+
"source": [
242+
# Keep only short reviews: fewer than 50 words.
short_mask = df['length'] < 50
df = df[short_mask]
df
244+
]
245+
},
246+
{
247+
"cell_type": "code",
248+
"execution_count": null,
249+
"metadata": {},
250+
"outputs": [],
251+
"source": [
252+
# Drop the helper column now that filtering is done.
# (assignment instead of inplace=True -- same result, no hidden mutation)
df = df.drop(columns='length')
df
255+
]
256+
},
257+
{
258+
"cell_type": "code",
259+
"execution_count": null,
260+
"metadata": {},
261+
"outputs": [],
262+
"source": [
263+
# Write only the reviews to CSV, without the index column.
# FIX: to_csv raises OSError when the target directory does not exist,
# so create 'data_scrapped/' first.
import os
os.makedirs('data_scrapped', exist_ok=True)
df.to_csv('data_scrapped/data_rotten_tomatoes.csv', index=False)
265+
]
266+
},
267+
{
268+
"cell_type": "markdown",
269+
"metadata": {},
270+
"source": [
271+
"## Splitting the CSV file into individual text files"
272+
]
273+
},
274+
{
275+
"cell_type": "code",
276+
"execution_count": null,
277+
"metadata": {},
278+
"outputs": [],
279+
"source": [
280+
import csv

# Split the CSV into one numbered .txt file per review.
# FIXES vs. the original:
#  - skip the 'user_review' header row (it used to be written out as a file)
#  - write the review text itself (row[0]) instead of str(row), which wrapped
#    every file's contents in a Python list repr like "['...']"
#  - open each output file with a context manager and utf-8 encoding, so
#    handles are closed even on error and non-ASCII reviews don't crash
with open("data_scrapped/data_rotten_tomatoes.csv", "r", encoding="utf-8") as f:
    reader = csv.reader(f)
    next(reader, None)  # discard the header row
    rownumber = 2639  # used to start the naming of the files, change accordingly
    for row in reader:
        with open(str(rownumber) + ".txt", "w", encoding="utf-8") as g:
            g.write(row[0] if row else "")
        rownumber = rownumber + 1
290+
]
291+
},
292+
{
293+
"cell_type": "code",
294+
"execution_count": null,
295+
"metadata": {},
296+
"outputs": [],
297+
"source": [
298+
def analyze_sentiment(text):
    """Classify `text` by TextBlob sentiment polarity.

    Returns:
    - 'positive' if sentiment polarity > 0
    - 'negative' if sentiment polarity < 0
    - 'neutral'  if sentiment polarity == 0
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'

# Label every review in the dataframe with its sentiment class.
df['sentiment'] = df['user_review'].apply(analyze_sentiment)
319+
]
320+
},
321+
{
322+
"cell_type": "code",
323+
"execution_count": null,
324+
"metadata": {},
325+
"outputs": [],
326+
"source": [
327+
"df"
328+
]
329+
}
330+
],
331+
"metadata": {
332+
"language_info": {
333+
"name": "python"
334+
}
335+
},
336+
"nbformat": 4,
337+
"nbformat_minor": 2
338+
}

Web_app/Movie_Animated.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
-13 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)