1+ {
2+ "metadata" : {
3+ "language_info" : {
4+ "codemirror_mode" : {
5+ "name" : " ipython" ,
6+ "version" : 3
7+ },
8+ "file_extension" : " .py" ,
9+ "mimetype" : " text/x-python" ,
10+ "name" : " python" ,
11+ "nbconvert_exporter" : " python" ,
12+ "pygments_lexer" : " ipython3" ,
13+ "version" : 3
14+ },
15+ "orig_nbformat" : 4 ,
16+ "coopTranslator" : {
17+ "original_hash" : " 2d05e7db439376aa824f4b387f8324ca" ,
18+ "translation_date" : " 2025-12-19T16:49:23+00:00" ,
19+ "source_file" : " 6-NLP/4-Hotel-Reviews-1/solution/notebook.ipynb" ,
20+ "language_code" : " te"
21+ }
22+ },
23+ "nbformat" : 4 ,
24+ "nbformat_minor" : 2 ,
25+ "cells" : [
26+ {
27+ "cell_type" : " code" ,
28+ "execution_count" : null ,
29+ "metadata" : {},
30+ "outputs" : [],
31+ "source" : [
32+ " # EDA\n " ,
33+ " import pandas as pd\n " ,
34+ " import time"
35+ ]
36+ },
37+ {
38+ "cell_type" : " code" ,
39+ "execution_count" : null ,
40+ "metadata" : {},
41+ "outputs" : [],
42+ "source" : [
43+ " def get_difference_review_avg(row):\n " ,
44+ " return row[\" Average_Score\" ] - row[\" Calc_Average_Score\" ]"
45+ ]
46+ },
47+ {
48+ "cell_type" : " code" ,
49+ "execution_count" : null ,
50+ "metadata" : {},
51+ "outputs" : [],
52+ "source" : [
53+ " # Load the hotel reviews from CSV\n " ,
54+ " print(\" Loading data file now, this could take a while depending on file size\" )\n " ,
55+ " start = time.time()\n " ,
56+ " df = pd.read_csv('../../data/Hotel_Reviews.csv')\n " ,
57+ " end = time.time()\n " ,
58+ " print(\" Loading took \" + str(round(end - start, 2)) + \" seconds\" )\n "
59+ ]
60+ },
61+ {
62+ "cell_type" : " code" ,
63+ "execution_count" : null ,
64+ "metadata" : {},
65+ "outputs" : [],
66+ "source" : [
67+ " # What shape is the data (rows, columns)?\n " ,
68+ " print(\" The shape of the data (rows, cols) is \" + str(df.shape))\n "
69+ ]
70+ },
71+ {
72+ "cell_type" : " code" ,
73+ "execution_count" : null ,
74+ "metadata" : {},
75+ "outputs" : [],
76+ "source" : [
77+ " # value_counts() creates a Series object that has index and values\n " ,
78+ " # in this case, the country and the frequency they occur in reviewer nationality\n " ,
79+ " nationality_freq = df[\" Reviewer_Nationality\" ].value_counts()\n "
80+ ]
81+ },
82+ {
83+ "cell_type" : " code" ,
84+ "execution_count" : null ,
85+ "metadata" : {},
86+ "outputs" : [],
87+ "source" : [
88+ " # What reviewer nationality is the most common in the dataset? (value_counts() sorts by frequency, so position 0 is the top entry; .iloc makes the positional lookup explicit)\n " ,
89+ " print(\" The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq.iloc[0]) + \" reviews.\" )\n "
90+ ]
91+ },
92+ {
93+ "cell_type" : " code" ,
94+ "execution_count" : null ,
95+ "metadata" : {},
96+ "outputs" : [],
97+ "source" : [
98+ " # What is the top 10 most common nationalities and their frequencies?\n " ,
99+ " print(\" The top 10 highest frequency reviewer nationalities are:\" )\n " ,
100+ " print(nationality_freq[0:10].to_string())\n "
101+ ]
102+ },
103+ {
104+ "cell_type" : " code" ,
105+ "execution_count" : null ,
106+ "metadata" : {},
107+ "outputs" : [],
108+ "source" : [
109+ " # How many unique nationalities are there?\n " ,
110+ " print(\" There are \" + str(nationality_freq.index.size) + \" unique nationalities in the dataset\" )\n "
111+ ]
112+ },
113+ {
114+ "cell_type" : " code" ,
115+ "execution_count" : null ,
116+ "metadata" : {},
117+ "outputs" : [],
118+ "source" : [
119+ " # What was the most frequently reviewed hotel for the top 10 nationalities - print the hotel and number of reviews\n " ,
120+ " for nat in nationality_freq[:10].index:\n " ,
121+ "     # First, extract all the rows that match the criteria into a new dataframe\n " ,
122+ "     nat_df = df[df[\" Reviewer_Nationality\" ] == nat] \n " ,
123+ "     # Now get the hotel freq (sorted by frequency, so position 0 is the most reviewed hotel; read it positionally with .iloc)\n " ,
124+ "     freq = nat_df[\" Hotel_Name\" ].value_counts()\n " ,
125+ "     print(\" The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq.iloc[0]) + \" reviews.\" ) \n "
126+ ]
127+ },
128+ {
129+ "cell_type" : " code" ,
130+ "execution_count" : null ,
131+ "metadata" : {},
132+ "outputs" : [],
133+ "source" : [
134+ " # How many reviews are there per hotel (frequency count of hotel) and do the results match the value in `Total_Number_of_Reviews`?\n " ,
135+ " # First create a new dataframe based on the old one, removing the unneeded columns\n " ,
136+ " hotel_freq_df = df.drop([\" Hotel_Address\" , \" Additional_Number_of_Scoring\" , \" Review_Date\" , \" Average_Score\" , \" Reviewer_Nationality\" , \" Negative_Review\" , \" Review_Total_Negative_Word_Counts\" , \" Positive_Review\" , \" Review_Total_Positive_Word_Counts\" , \" Total_Number_of_Reviews_Reviewer_Has_Given\" , \" Reviewer_Score\" , \" Tags\" , \" days_since_review\" , \" lat\" , \" lng\" ], axis = 1)\n " ,
137+ " # Group the rows by Hotel_Name, count them and put the result in a new column Total_Reviews_Found (one column is selected explicitly so transform yields a Series rather than a DataFrame)\n " ,
138+ " hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name')['Total_Number_of_Reviews'].transform('count')\n " ,
139+ " # Get rid of all the duplicated rows\n " ,
140+ " hotel_freq_df = hotel_freq_df.drop_duplicates(subset = [\" Hotel_Name\" ])\n " ,
141+ " print()\n " ,
142+ " print(hotel_freq_df.to_string())\n " ,
143+ " print(str(hotel_freq_df.shape))"
144+ ]
145+ },
146+ {
147+ "cell_type" : " code" ,
148+ "execution_count" : null ,
149+ "metadata" : {},
150+ "outputs" : [],
151+ "source" : [
152+ " # While there is an `Average_Score` for each hotel according to the dataset, \n " ,
153+ " # you can also calculate an average score (getting the average of all reviewer scores in the dataset for each hotel)\n " ,
154+ " # Add a new column to your dataframe with the column header `Calc_Average_Score` that contains that calculated average. \n " ,
155+ " df['Calc_Average_Score'] = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n " ,
156+ " # Add a new column with the difference between the two average scores (vectorized column subtraction; same values as applying get_difference_review_avg to every row, without the per-row Python call)\n " ,
157+ " df[\" Average_Score_Difference\" ] = df['Average_Score'] - df['Calc_Average_Score']\n " ,
158+ " # Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel)\n " ,
159+ " review_scores_df = df.drop_duplicates(subset = [\" Hotel_Name\" ])\n " ,
160+ " # Sort the dataframe to find the lowest and highest average score difference\n " ,
161+ " review_scores_df = review_scores_df.sort_values(by=[\" Average_Score_Difference\" ])\n " ,
162+ " print(review_scores_df[[\" Average_Score_Difference\" , \" Average_Score\" , \" Calc_Average_Score\" , \" Hotel_Name\" ]])\n " ,
163+ " # Do any hotels have the same (rounded to 1 decimal place) `Average_Score` and `Calc_Average_Score`?\n "
164+ ]
165+ },
166+ {
167+ "cell_type" : " markdown" ,
168+ "metadata" : {},
169+ "source" : [
170+ "---\n\n<!-- CO-OP TRANSLATOR DISCLAIMER START -->\n**అస్పష్టత**: \nఈ పత్రాన్ని AI అనువాద సేవ [Co-op Translator](https://github.com/Azure/co-op-translator) ఉపయోగించి అనువదించబడింది. మేము ఖచ్చితత్వానికి ప్రయత్నించినప్పటికీ, ఆటోమేటెడ్ అనువాదాల్లో పొరపాట్లు లేదా తప్పిదాలు ఉండవచ్చు. మూల పత్రం దాని స్వదేశీ భాషలోనే అధికారిక మూలంగా పరిగణించాలి. ముఖ్యమైన సమాచారానికి, ప్రొఫెషనల్ మానవ అనువాదం సిఫార్సు చేయబడుతుంది. ఈ అనువాదం వాడకం వల్ల కలిగే ఏవైనా అపార్థాలు లేదా తప్పుదారితీసే అర్థాలు కోసం మేము బాధ్యత వహించము.\n<!-- CO-OP TRANSLATOR DISCLAIMER END -->\n"
171+ ]
172+ }
173+ ]
174+ }
0 commit comments