-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path6_Advance_Data_Analysis.sql
More file actions
366 lines (325 loc) · 14.2 KB
/
6_Advance_Data_Analysis.sql
File metadata and controls
366 lines (325 loc) · 14.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
/*
====================================================================================================
ADVANCE DATA ANALYTICS
====================================================================================================
What we cover:
- Book Popularity vs Quality Comparation Analysis
- Author Productivity vs Quality Comparation Analysis
- Price Segmentation Analysis
- Language Part-to-Whole Analysis.
- Temporal Overtime Analysis
Problem Question:
1. Is there a significant discrepancy between audiobook ratings and total engagement? Do the most popular books also have the highest ratings?
2. How are ratings distributed across low- and high-engagement audiobooks?
3. What are the differences between popular and niche books in terms of key metrics?
4. Does the distribution of audiobook engagement follow the Pareto principle?
5. How does the top 1% market share compare with the remaining 99%?
6. What is the relationship between an author’s total released audiobooks and their total engagement?
7. How does an author’s productivity (number of books) correlate with their average ratings?
8. Are there authors who achieve high engagement despite releasing only a few books?
9. Do authors with massive engagement also maintain strong ratings?
10. Does price influence audiobook engagement?
11. Does price affect audiobook ratings?
12. What proportion of audiobooks are published in each language, and how does this compare across languages?
13. How have audiobook trends evolved over time? Are there signs of growth or decline?
14. How have audiobook quality trends changed over time?
*/
USE MyDatabase
GO
/*
================================================================================================================================================
1. Is there a significant discrepancy between audiobook ratings and total engagement? Do the most popular books also have the highest ratings?
================================================================================================================================================
*/
SELECT
[Book Name],
[Total Review],
Ratings
FROM cleaned_audible_book
ORDER BY [Total Review] DESC;
/*
================================================================================================================================================
2. How are ratings distributed across low- and high-engagement audiobooks?
================================================================================================================================================
*/
-- Checking ratings distribution for low-engagement audiobook (Total review < 150)
SELECT
Ratings,
COUNT([Book Name]) as count_of_book
FROM cleaned_audible_book
WHERE [Total Review] <> 0 AND [Total Review] < 150
GROUP BY Ratings
ORDER BY Ratings DESC;
-- Checking ratings distribution for High-engagement audiobook (Total review >= 150)
SELECT
Ratings,
COUNT([Book Name]) as count_of_book
FROM cleaned_audible_book
WHERE [Total Review] <> 0 AND [Total Review] >= 150
GROUP BY Ratings
ORDER BY Ratings DESC;
/*
================================================================================================================================================
3. What are the differences between popular and niche books in terms of key metrics?
================================================================================================================================================
*/
-- Popular books > 500 engagement,
-- Unpopular book <= 500 engagement
-- We filtered out audiobooks without ratings since it will turn down the average ratings.
SELECT
'Popular Books' AS category,
COUNT([Book Name]) AS number_of_books,
SUM([Total Review]) AS total_engagement,
ROUND(AVG(CAST([Total Review] AS float)), 2) AS engagement_per_book,
ROUND(AVG(Ratings), 2) AS avg_ratings,
ROUND(AVG(Price), 2) AS avg_price
FROM cleaned_audible_book
WHERE Ratings <> 0 AND [Total Review] > 500
UNION ALL
SELECT
'Niche Books' AS category,
COUNT([Book Name]) AS number_of_books,
SUM([Total Review]) AS total_engagement,
ROUND(AVG(CAST([Total Review] AS float)), 2) AS engagement_per_book,
ROUND(AVG(Ratings), 2) AS avg_ratings,
ROUND(AVG(Price), 2) AS avg_price
FROM cleaned_audible_book
WHERE Ratings <> 0 AND [Total Review] <= 500;
/*
================================================================================================================================================
4. Does the distribution of audiobook engagement follow the Pareto principle?
================================================================================================================================================
*/
-- Pareto principle: 80% of total engagement is generated only by 20% of audiobooks
-- Analyze is 80% of total engagement is generated only by 20% of audiobooks or not
-- Total audiobooks: 87489; 20% of audiobooks: 17497
WITH cumulative_analysis AS (
SELECT
[Book Name],
[Total Review],
SUM([Total Review]) OVER () AS total_engagement_generated_by_all_books,
ROUND(CAST(SUM([Total Review]) OVER (ORDER BY [Total Review] DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS FLOAT) / SUM([Total Review]) OVER () * 100, 2) AS total_engagement_cumulative_percentage,
ROW_NUMBER() OVER (ORDER BY [Total Review] DESC) AS row_number
FROM cleaned_audible_book
)
SELECT
row_number,
[Book Name],
[Total Review],
total_engagement_cumulative_percentage
FROM cumulative_analysis
WHERE total_engagement_cumulative_percentage <= 80;
/*
========================================================================
5. How does the top 1% market share compare with the remaining 99%?
========================================================================
*/
-- 1% of audiobooks: 875 audiobooks
WITH market_share AS (
SELECT
[Book Name],
[Total Review],
Price,
ROW_NUMBER() OVER (ORDER BY [Total Review] DESC) AS row_index
FROM cleaned_audible_book
)
SELECT
'Top 1% Market' AS market_category,
COUNT([Book Name]) AS number_of_book,
SUM([Total Review]) AS total_engagement,
ROUND(AVG(CAST([Total Review] AS float)), 2) AS engagement_per_book,
ROUND(AVG(Price), 2) AS avg_price
FROM market_share
WHERE row_index <= 875
UNION ALL
SELECT
'The Rest of Market' AS market_category,
COUNT([Book Name]) AS number_of_book,
SUM([Total Review]) AS total_engagement,
ROUND(AVG(CAST([Total Review] AS float)), 2) AS engagement_per_book,
ROUND(AVG(Price), 2) AS avg_price
FROM market_share
WHERE row_index > 875;
/*
========================================================================
6. What is the relationship between an author’s total released audiobooks and their total engagement?
========================================================================
*/
-- Since we only focused on single author, we filtered out audiobooks with multiple author
-- Order By highest total audibooks -> determine is author with many released audiobooks has high engagement too
SELECT
Author,
COUNT(Author) AS number_of_audible_book,
SUM([Total Review]) AS total_engagement
FROM cleaned_audible_book
WHERE Author NOT LIKE '%,%' AND Author NOT LIKE '%Production%' AND Author NOT LIKE '%Studio%' AND Author NOT LIKE '%Various%'
GROUP BY Author
ORDER BY number_of_audible_book DESC;
-- Order By highest total engagement -> determine is author with high engagement has many released audiobooks too
SELECT
Author,
COUNT(Author) AS number_of_audible_book,
SUM([Total Review]) AS total_engagement
FROM cleaned_audible_book
WHERE Author NOT LIKE '%,%' AND Author NOT LIKE '%Production%' AND Author NOT LIKE '%Studio%' AND Author NOT LIKE '%Various%'
GROUP BY Author
ORDER BY total_engagement DESC;
/*
========================================================================================================
7. How does an author’s productivity (number of books) correlate with their average ratings?
========================================================================================================
*/
-- Since we only focused on single author, we filtered out audiobooks with multiple author
-- We filtered out audiobooks without ratings since it will turn down the average ratings.
-- Order By highest total audibooks -> determine is author with many released audiobooks has high ratings too
SELECT
Author,
COUNT(Author) AS number_of_audible_book,
ROUND(AVG(Ratings), 2) AS avg_ratings
FROM cleaned_audible_book
WHERE Author NOT LIKE '%,%' AND Author NOT LIKE '%Production%' AND Author NOT LIKE '%Studio%' AND Author NoT LIKE '%Various%' AND [Total Review] <> 0
GROUP BY Author
ORDER BY number_of_audible_book DESC;
-- Order By highest avg ratings -> determine is author with high ratings has many released audiobooks too
SELECT
Author,
COUNT(Author) AS number_of_audible_book,
SUM([Total Review]) AS total_engagement,
AVG(Ratings) AS avg_ratings
FROM cleaned_audible_book
WHERE Author NOT LIKE '%,%' AND Author NOT LIKE '%Production%' AND Author NOT LIKE '%Studio%' AND Author NoT LIKE '%Various%' AND [Total Review] <> 0
GROUP BY Author
ORDER BY avg_ratings ASC;
/*
========================================================================================================
8. Are there authors who achieve high engagement despite releasing only a few books?
========================================================================================================
*/
SELECT
Author,
COUNT(Author) AS number_of_audible_book,
SUM([Total Review]) AS engagement
FROM cleaned_audible_book
WHERE Author NOT LIKE '%,%' AND Author NOT LIKE '%Production%' AND Author NOT LIKE '%Studio%' AND Author NOT LIKE '%Various%'
GROUP BY Author
ORDER BY engagement DESC;
/*
========================================================================================================
9. Do authors with massive engagement also maintain strong ratings?
========================================================================================================
*/
-- We will check is there any author with massive engagement (> 500 Total Review) has bad ratings (< 4.0 stars)
SELECT
Author,
SUM([Total Review]) AS total_engagement,
ROUND(AVG(Ratings), 2) AS avg_ratings
FROM cleaned_audible_book
WHERE Author NOT LIKE '%,%' AND Author NOT LIKE '%Production%' AND Author NOT LIKE '%Studio%' AND Ratings <> 0
GROUP BY Author
HAVING SUM([Total Review]) > 500 AND AVG(Ratings) < 4.0
ORDER BY total_engagement DESC;
/*
========================================================================================================
10. Does price influence audiobook engagement?
========================================================================================================
*/
-- We will segment the audiobooks by its price:
-- Free: price = $0;
-- Low Price: $0 > price <= $5;
-- Mid Price: $5 > price <= $15;
-- High Price: price > $15.
-- We will analyze engagement of each segment to determine the price influence.
WITH price_analysis AS (
SELECT
CASE
WHEN Price = 0 THEN 'Free'
WHEN Price > 0 AND Price <= 5 THEN 'Low Price'
WHEN Price > 5 AND Price <= 15 THEN 'Mid Price'
ELSE 'High Price'
END AS price_segmentation,
[Total Review]
FROM cleaned_audible_book)
SELECT
price_segmentation,
COUNT(price_segmentation) AS number_of_audible_book,
SUM([Total Review]) AS total_engagement,
ROUND(AVG(CAST([Total Review] AS FLOAT)), 2) AS engagement_per_book
FROM price_analysis
GROUP BY price_segmentation;
/*
========================================================================================================
11. Does price affect audiobook ratings?
========================================================================================================
*/
-- We will segment the audiobooks by its price:
-- Free: price = $0;
-- Low Price: $0 > price <= $5;
-- Mid Price: $5 > price <= $15;
-- High Price: price > $15.
-- We will analyze engagement of each segment to determine the price influence.
-- We filtered out the unratings audiobooks
WITH price_analysis AS (
SELECT
CASE
WHEN Price = 0 THEN 'Free'
WHEN Price > 0 AND Price <= 5 THEN 'Low Price'
WHEN Price > 5 AND Price <= 15 THEN 'Mid Price'
ELSE 'High Price'
END AS price_segmentation,
Ratings
FROM cleaned_audible_book
WHERE Ratings <> 0)
SELECT
price_segmentation,
COUNT(price_segmentation) AS number_of_audible_book,
ROUND(AVG(Ratings), 2) AS avg_ratings
FROM price_analysis
GROUP BY price_segmentation;
/*
========================================================================================================
12. What proportion of audiobooks are published in each language, and how does this compare across languages?
========================================================================================================
*/
WITH part_to_whole AS (
SELECT
language,
[Total Review],
Ratings,
COUNT(Language) OVER () AS number_of_book
FROM cleaned_audible_book
)
SELECT
Language,
COUNT(Language) AS number_of_books,
CAST(ROUND(CAST(COUNT(Language) AS FLOAT)/number_of_book * 100, 3) AS nvarchar) + '%' AS used_percentage,
SUM([Total Review]) AS total_engagement,
ROUND(AVG(CAST([Total Review] AS float)), 2) AS engagement_rate_per_book
FROM part_to_whole
GROUP BY Language, number_of_book
ORDER BY ROUND(CAST(COUNT(Language) AS FLOAT)/number_of_book * 100, 3) DESC
/*
========================================================================================================
13. How have audiobook trends evolved over time? Are there signs of growth or decline?
========================================================================================================
*/
SELECT
YEAR([Release Date]) AS year,
LEFT(DATENAME(MONTH,[Release Date]), 3) AS month,
COUNT([Book Name]) AS number_of_books,
SUM([Total Review]) AS total_engagement
FROM cleaned_audible_book
GROUP BY YEAR([Release Date]), LEFT(DATENAME(MONTH,[Release Date]), 3), DATEPART(MONTH,[Release Date])
ORDER BY YEAR([Release Date]),DATEPART(MONTH,[Release Date])
/*
========================================================================================================
14. How have audiobook quality trends changed over time?
========================================================================================================
*/
SELECT
YEAR([Release Date]) AS year,
LEFT(DATENAME(MONTH,[Release Date]), 3) AS month,
AVG(Ratings) AS avg_rating
FROM cleaned_audible_book
WHERE Ratings <> 0
GROUP BY YEAR([Release Date]), LEFT(DATENAME(MONTH,[Release Date]), 3), DATEPART(MONTH,[Release Date])
ORDER BY YEAR([Release Date]),DATEPART(MONTH,[Release Date])