# Source: spotify_data_project.py from the GitHub repository Edu-png/Spotify-Data-Project_ (branch main)
# -*- coding: utf-8 -*-
"""Spotify Data project

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/14lFzkB-evBObeLmcIAPiwXoQVT4PZOnE
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the track-level dataset and preview its first rows.
df_tracks = pd.read_csv('tracks.csv')
df_tracks.head()

# Count the missing values in every column.
# (The original notebook notes one null per column; confirm on your copy of the data.)
df_tracks.isnull().sum()

# Column names, dtypes and non-null counts.
df_tracks.info()

# The ten least popular tracks in the dataset (no genre filter is applied here).
sorted_df = df_tracks.sort_values(by = 'popularity').head(10)
sorted_df

# Summary statistics, transposed so that each described column becomes a row.
df_tracks.describe().T

# Tracks with a popularity score above 90, most popular first
# (again across the whole dataset, without any genre filter).
most_popular = df_tracks[df_tracks['popularity'] > 90].sort_values(by = 'popularity', ascending = False)
most_popular

# Index the tracks by release date so they can be analysed chronologically.
# NOTE(review): if release_date mixes year-only and full-date strings, newer
# pandas versions may need an explicit `format=` argument here - confirm on the data.
df_tracks.set_index('release_date', inplace = True)
df_tracks.index = pd.to_datetime(df_tracks.index)
# Call the method: a bare `df_tracks.head` only evaluates to the bound method
# object and never shows any rows.
df_tracks.head()

# Look up a specific row by position: the artists entry of the row at position 18.
df_tracks[['artists']].iloc[18]

# Convert the track length from milliseconds to whole seconds.
# Vectorised equivalent of applying round(x/1000) to every row (both round
# halves to the nearest even integer), without a Python-level call per track.
df_tracks['duration'] = (df_tracks['duration_ms'] / 1000).round().astype('int64')
df_tracks.drop('duration_ms', inplace = True, axis = 1)

df_tracks.duration.head()

df_tracks

# Correlation heatmap of the numeric track features.

# Drop the categorical-like columns, then keep only numeric columns before
# calling corr(): pandas >= 2.0 no longer discards non-numeric columns
# (e.g. text fields) silently and raises instead.
corr_df = (
    df_tracks.drop(['key', 'mode', 'explicit'], axis = 1)
    .select_dtypes(include = 'number')
    .corr(method = 'pearson')
)
plt.figure(figsize = (14,6))
heatmap = sns.heatmap(corr_df, annot = True, fmt = '.1g', vmin =-1, vmax = 1, center = 0, cmap = 'inferno', linewidths = 1, linecolor = 'Black' )
heatmap.set_title("Correlation HeatMap Between Variable")
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation = 90)

# Reading the map: +1 is a perfect positive linear correlation, -1 a perfect
# negative one, and values close to 0 mean little or no linear correlation.

# The original notebook's takeaway: energy and loudness are positively
# correlated, while energy and acousticness are negatively correlated.

# Work on a 0.4 % random sample of the tracks for the regression plots.
# No random_state is passed, so the sample differs from run to run.
sample_size = int(0.004 * len(df_tracks))
sample_df = df_tracks.sample(sample_size)

print(len(sample_df))

# One regression plot per feature pair: (x column, y column, point colour, title).
regression_specs = [
    # loudness against energy - expected: positive correlation
    ('energy', 'loudness', 'g', 'Loudness vs Energy -  Correlation'),
    # energy against acousticness - expected: negative correlation
    ('acousticness', 'energy', 'r', 'Acousticness vs Energy -  Correlation'),
    # duration against liveness - expected: no correlation
    ('liveness', 'duration', 'y', 'Duration vs Liveness -  Correlation'),
]

for x_col, y_col, point_color, plot_title in regression_specs:
    plt.figure(figsize = (10,6))
    reg_ax = sns.regplot(data = sample_df, x = x_col, y = y_col, color = point_color, line_kws = {'color': "Black"})
    reg_ax.set(title = plot_title)

# Distribution chart: number of songs per release year.

# Copy the release_date index into a regular datetime column and take its year.
release_dates = df_tracks.index.get_level_values('release_date')
df_tracks['dates'] = pd.to_datetime(release_dates)
years = df_tracks['dates'].dt.year

year_hist = sns.displot(years, discrete = True, aspect = 2, height = 5, kind = 'hist', color = "c")
year_hist.set(title = 'Number of songs per year')

# Observation from the original notebook: the data covers songs from 1920
# onwards, with the highest count around 2019.

# Bar plot: average track duration per release year
# (barplot aggregates each year with its default estimator, the mean).

total_dr = df_tracks.duration
fig_dims = (18,7)
fig, ax = plt.subplots(figsize = fig_dims)
# Error bars are switched off explicitly with errorbar=None (seaborn >= 0.12).
# The previous `errwidth = False` only drew them with zero line width, still
# paid for the bootstrap, and relies on a parameter deprecated in newer seaborn.
sns.barplot(x = years, y = total_dr, ax = ax, errorbar = None)
# Set the title on the axes directly so `fig` keeps referring to the Figure.
ax.set(title = "Year vs Duration")
plt.xticks(rotation = 90)

# Line plot: track duration against release year.

sns.set_style(style = 'whitegrid')
total_dr = df_tracks.duration
fig_dims = (10,5)
fig, ax = plt.subplots(figsize = fig_dims)
duration_ax = sns.lineplot(x = years, y = total_dr, ax = ax)
# NOTE: `fig` is rebound here to the return value of .set(), not the Figure.
fig = duration_ax.set(title = "Year vs Duration")
plt.xticks(rotation = 60)

# Additional analysis by genre, using a second dataset.

df_genre = pd.read_csv('SpotifyFeatures.csv')

df_genre.head()

# Open a fresh figure first: without it, running this file as a plain script
# puts the title and the bars on whatever axes are current, i.e. the previous
# plot, instead of on a chart of their own.
plt.figure()
plt.title("Duration of the songs in Different Genres")

# NOTE: this call only builds and returns a colormap object; the result is not
# used, so it does not change the colours of the bar plot below.
sns.color_palette('rocket', as_cmap = True)
sns.barplot(y='genre', x='duration_ms', data=df_genre)
plt.xlabel("Duration in milli seconds")
plt.ylabel('Genres')

# Top 5 genres ranked by mean track popularity.

sns.set_style(style = 'darkgrid')
plt.figure(figsize = (10,5))
# Aggregate per genre before ranking. Taking the 10 most popular individual
# tracks (as before) only shows the genres those few tracks happen to belong
# to, which does not match the "Top 5 Genres" title.
genre_popularity = df_genre.groupby('genre', as_index = False)['popularity'].mean()
famous = genre_popularity.sort_values('popularity', ascending = False).head(5)
sns.barplot(y='genre', x='popularity', data = famous).set(title = 'Top 5 Genres by Popularity')