Skip to content

Commit b999c13

Browse files
authored
Merge pull request #1417 from Shekhar-Raj/olympics_analysis
Successfully Added Olympics_analysis_web_App
2 parents 7277461 + 0db46b4 commit b999c13

File tree

6 files changed

+350
-0
lines changed

6 files changed

+350
-0
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<h2>olympics-data-analysis-web-app</h2>
2+
3+
A Streamlit web application for analyzing the Olympics dataset.
4+
5+
Dataset Link: https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results
6+
7+
Live Link: https://olympicdatanalysis.streamlit.app/
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
import streamlit as st
2+
import pandas as pd
3+
import preprocessor,helper
4+
import plotly.express as px
5+
import matplotlib.pyplot as plt
6+
import seaborn as sns
7+
import plotly.figure_factory as ff
8+
9+
##def main():
10+
11+
# Load the raw athlete results and the NOC -> region lookup, then let the
# preprocessor build the single frame that every page below reads from.
athlete_events = pd.read_csv('athlete_events.csv')
region_df = pd.read_csv('noc_regions.csv')
df = preprocessor.preprocess(athlete_events, region_df)

# Sidebar navigation: the chosen label decides which page section renders.
MENU_OPTIONS = (
    'Medal Tally',
    'Overall Analysis',
    'Country-wise Analysis',
    'Athlete wise Analysis',
)
user_menu = st.sidebar.radio('Select an Option', MENU_OPTIONS)

st.sidebar.title("Olympics Analysis")
st.sidebar.image('https://e7.pngegg.com/pngimages/1020/402/png-clipart-2024-summer-olympics-brand-circle-area-olympic-rings-olympics-logo-text-sport.png')
23+
if user_menu == 'Medal Tally':
    # Medal table filtered by an optional year and an optional country.
    st.sidebar.header("Medal Tally")
    years, country = helper.country_year_list(df)

    selected_year = st.sidebar.selectbox("Select Year", years)
    selected_country = st.sidebar.selectbox("Select Country", country)

    medal_tally = helper.fetch_medal_tally(df, selected_year, selected_country)

    # 'Overall' is the sentinel both select boxes use for "no filter".
    # The branches are mutually exclusive so exactly one title is rendered.
    all_years = selected_year == 'Overall'
    all_countries = selected_country == 'Overall'
    if all_years and all_countries:
        st.title("Overall Tally")
    elif all_countries:
        st.title("Medal Tally in " + str(selected_year) + " Olympics ")
    elif all_years:
        st.title(selected_country + " Overall Performance ")
    else:
        st.title(selected_country + " Performance in " + str(selected_year) + " Olympics ")

    st.table(medal_tally)
41+
42+
if user_menu == 'Overall Analysis':
    # Headline counts. nunique() ignores missing values, so athletes whose
    # NOC has no region mapping are not counted as an extra "nation".
    # NOTE(review): the -1 presumably discounts one non-standard edition
    # (e.g. the 1906 Intercalated Games) -- confirm against the dataset.
    editions = df['Year'].nunique() - 1
    cities = df['City'].nunique()
    sports = df['Sport'].nunique()
    events = df['Event'].nunique()
    athletes = df['Name'].nunique()
    nations = df['region'].nunique()

    st.title(" Top Statistics")
    stat_rows = (
        ((" Edition ", editions), (" Hosts ", cities), (" Sports ", sports)),
        ((" Events ", events), (" Nations ", nations), (" Athletes ", athletes)),
    )
    # Two rows of three columns; each cell is a header plus the figure.
    for stat_row in stat_rows:
        for column, (label, value) in zip(st.columns(3), stat_row):
            with column:
                st.header(label)
                st.title(value)

    # One line chart per dimension: distinct values of the column per edition.
    trend_charts = (
        ('region', " Participating Nations over the years "),
        ('Event', " Events over the years "),
        ('Name', " Athletes over the years "),
    )
    for column_name, heading in trend_charts:
        trend = helper.data_over_time(df, column_name)
        fig = px.line(trend, x='Edition', y=column_name)
        st.title(heading)
        st.plotly_chart(fig)

    # Heatmap of how many distinct events each sport had in each year.
    st.title(" No. of Events over time(Every Sport)")
    fig, ax = plt.subplots(figsize=(20, 20))
    unique_events = df.drop_duplicates(['Year', 'Sport', 'Event'])
    events_per_sport = (
        unique_events
        .pivot_table(index='Sport', columns='Year', values='Event', aggfunc='count')
        .fillna(0)
        .astype('int')
    )
    sns.heatmap(events_per_sport, annot=True, ax=ax)
    st.pyplot(fig)

    # Table of top medal winners, optionally restricted to one sport.
    st.title("Most Successful Athletes")
    sport_list = sorted(df['Sport'].unique().tolist())
    sport_list.insert(0, 'Overall')

    selected_sport = st.selectbox("Select a Sport ", sport_list)
    st.table(helper.most_successful(df, selected_sport))
107+
108+
if user_menu == 'Country-wise Analysis':
    # Per-country view: medals per year, sport/year heatmap, top athletes.
    st.sidebar.title('Country-wise Analysis')

    country_list = sorted(df['region'].dropna().unique().tolist())
    selected_country = st.sidebar.selectbox('Select a Country', country_list)

    country_df = helper.yearwise_medal_tally(df, selected_country)
    fig = px.line(country_df, x='Year', y='Medal')
    st.title(selected_country + " Medal Tally over the years ")
    st.plotly_chart(fig)

    st.title(selected_country + " excels in the following sports")
    pt = helper.country_event_heatmap(df, selected_country)
    if pt.empty:
        # The country list contains every region, including those that
        # never won a medal; an empty pivot cannot be drawn as a heatmap.
        st.info("No medals recorded for " + selected_country + ".")
    else:
        fig, ax = plt.subplots(figsize=(20, 20))
        sns.heatmap(pt, annot=True, ax=ax)
        st.pyplot(fig)

    st.title(" Top 10 athletes " + selected_country)
    top10_df = helper.most_successful_countrywise(df, selected_country)
    st.table(top10_df)
132+
133+
if user_menu == 'Athlete wise Analysis':
    # One row per (athlete, region) pair so repeat appearances count once.
    athlete_df = df.drop_duplicates(subset=['Name', 'region'])

    # Age density curves: all athletes, then one curve per medal colour.
    age_groups = [athlete_df['Age'].dropna()]
    for medal in ('Gold', 'Silver', 'Bronze'):
        age_groups.append(athlete_df[athlete_df['Medal'] == medal]['Age'].dropna())
    age_labels = ['Overall Age', 'Gold Medalist', 'Silver Medalist', 'Bronze Medalist']

    fig = ff.create_distplot(age_groups, age_labels, show_hist=False, show_rug=False)
    fig.update_layout(autosize=False, width=1000, height=600)
    st.plotly_chart(fig)

    # Hand-picked sports for the per-sport gold-medalist age curves.
    famous_sports = ['Basketball', 'Judo', 'Football', 'Tug-Of-War', 'Athletics',
                     'Swimming', 'Badminton', 'Sailing', 'Gymnastics',
                     'Art Competitions', 'Handball', 'Weightlifting', 'Wrestling',
                     'Water Polo', 'Hockey', 'Rowing', 'Fencing',
                     'Shooting', 'Boxing', 'Taekwondo', 'Cycling', 'Diving', 'Canoeing',
                     'Tennis', 'Golf', 'Softball', 'Archery',
                     'Volleyball', 'Synchronized Swimming', 'Table Tennis', 'Baseball',
                     'Rhythmic Gymnastics', 'Rugby Sevens',
                     'Beach Volleyball', 'Triathlon', 'Rugby', 'Polo', 'Ice Hockey']
    gold_medalists = athlete_df[athlete_df['Medal'] == 'Gold']
    gold_ages = [
        gold_medalists[gold_medalists['Sport'] == sport]['Age'].dropna()
        for sport in famous_sports
    ]
    sport_names = list(famous_sports)

    fig = ff.create_distplot(gold_ages, sport_names, show_hist=False, show_rug=False)
    fig.update_layout(autosize=False, width=1000, height=600)
    st.title("Distribution of Age wrt Sports(Gold Medalist)")
    st.plotly_chart(fig)

    # Height/weight scatter, optionally restricted to a single sport.
    sport_list = sorted(df['Sport'].unique().tolist())
    sport_list.insert(0, 'Overall')

    st.title("Height Vs Weight ")
    selected_sport = st.selectbox(' Select a Sport ', sport_list)
    temp_df = helper.weight_v_height(df, selected_sport)
    fig, ax = plt.subplots()
    sns.scatterplot(
        x='Weight',
        y='Height',
        data=temp_df,
        hue=temp_df['Medal'],
        style=temp_df['Sex'],
        s=60,
        ax=ax,
    )
    st.pyplot(fig)

    # Participation by sex, one line each, across the years.
    st.title(' Men VS Women Participation over the years')
    final = helper.men_vs_women(df)
    fig = px.line(final, x='Year', y=["Male", "Female"])
    fig.update_layout(autosize=False, width=1000, height=600)
    st.plotly_chart(fig)
185+
186+
187+
188+
189+
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
import numpy as np
2+
3+
4+
def fetch_medal_tally(df, year, country):
    """Return a medal table filtered by year and/or country.

    Args:
        df: Preprocessed frame with the identifying columns used for
            de-duplication, a ``region`` column, and ``Gold`` / ``Silver`` /
            ``Bronze`` indicator columns.
        year: ``'Overall'`` for every edition, otherwise a value convertible
            with ``int()``.
        country: ``'Overall'`` for every region, otherwise a ``region`` value.

    Returns:
        A frame with ``Gold``, ``Silver``, ``Bronze`` and ``total`` as ints.
        When a single country is requested across all years the rows are
        keyed by ``Year`` (ascending); in every other case they are keyed by
        ``region`` and sorted by ``Gold`` descending.
    """
    # A team medal appears once per team member; collapse those rows so each
    # medal is counted once per team/event.
    medal_df = df.drop_duplicates(
        subset=['Team', 'NOC', 'Games', 'Year', 'City', 'Sport', 'Event', 'Medal']
    )
    medal_cols = ['Gold', 'Silver', 'Bronze']

    # The two filters are independent, so apply each only when requested.
    temp_df = medal_df
    if year != 'Overall':
        temp_df = temp_df[temp_df['Year'] == int(year)]
    if country != 'Overall':
        temp_df = temp_df[temp_df['region'] == country]

    # Select the medal columns *before* aggregating so unrelated (string)
    # columns are never summed.
    if year == 'Overall' and country != 'Overall':
        x = temp_df.groupby('Year')[medal_cols].sum().sort_index().reset_index()
    else:
        x = (
            temp_df.groupby('region')[medal_cols]
            .sum()
            .sort_values('Gold', ascending=False)
            .reset_index()
        )

    x['total'] = x['Gold'] + x['Silver'] + x['Bronze']
    for col in medal_cols + ['total']:
        x[col] = x[col].astype('int')

    return x
31+
32+
def medal_tally(df):
    """Return the all-time medal table, one row per region.

    Args:
        df: Preprocessed frame with the identifying columns used for
            de-duplication, a ``region`` column, and ``Gold`` / ``Silver`` /
            ``Bronze`` indicator columns.

    Returns:
        A frame with ``region``, ``Gold``, ``Silver``, ``Bronze`` and
        ``total`` (all counts as ints), sorted by ``Gold`` descending.
    """
    medal_cols = ['Gold', 'Silver', 'Bronze']

    # Collapse the per-athlete rows of a team medal into a single row.
    unique_medals = df.drop_duplicates(
        subset=['Team', 'NOC', 'Games', 'Year', 'City', 'Sport', 'Event', 'Medal']
    )

    # Aggregate only the medal columns; unrelated columns are never summed.
    tally = (
        unique_medals.groupby('region')[medal_cols]
        .sum()
        .sort_values('Gold', ascending=False)
        .reset_index()
    )

    tally['total'] = tally['Gold'] + tally['Silver'] + tally['Bronze']
    for col in medal_cols + ['total']:
        tally[col] = tally[col].astype('int')

    return tally
44+
45+
def country_year_list(df):
    """Build the choices for the year and country select boxes.

    Returns:
        A ``(years, country)`` tuple of lists. Each list starts with the
        ``'Overall'`` sentinel followed by the sorted distinct values of
        ``df['Year']`` and of the non-missing ``df['region']`` respectively.
    """
    years = ['Overall'] + sorted(df['Year'].unique().tolist())
    country = ['Overall'] + sorted(df['region'].dropna().unique().tolist())
    return years, country
56+
57+
def data_over_time(df, col):
    """Count the distinct values of ``col`` seen in each year.

    Args:
        df: Frame containing a ``Year`` column and the column named ``col``.
        col: Name of the column whose distinct values are counted per year.

    Returns:
        A frame with two columns, ``Edition`` (the year, ascending) and
        ``col`` (the number of distinct ``col`` values in that year), with a
        fresh 0..n-1 index.
    """
    # One row per (Year, col) pair, then count the rows per year. Naming the
    # count column explicitly avoids depending on the column names that
    # value_counts().reset_index() produces, which differ across pandas
    # versions.
    per_year = (
        df.drop_duplicates(['Year', col])
        .groupby('Year')
        .size()
        .reset_index(name=col)
    )
    return per_year.rename(columns={'Year': 'Edition'})
61+
62+
def most_successful(df, sport):
    """Return the athletes with the most medal rows, optionally for one sport.

    Args:
        df: Frame with ``Name``, ``Sport``, ``region`` and ``Medal`` columns.
        sport: ``'Overall'`` for every sport, otherwise a ``Sport`` value.

    Returns:
        A frame with columns ``Name``, ``Medals``, ``Sport`` and ``region``
        holding at most 15 athletes, most medals first. ``Sport`` and
        ``region`` come from the athlete's first row in ``df``.
    """
    medalists = df.dropna(subset=['Medal'])
    if sport != 'Overall':
        medalists = medalists[medalists['Sport'] == sport]

    # Name both output columns explicitly so the result does not depend on
    # the version-specific names value_counts().reset_index() would assign.
    top = (
        medalists['Name']
        .value_counts()
        .head(15)
        .rename_axis('Name')
        .reset_index(name='Medals')
    )

    # Join back on the athlete name to pick up sport/region, keeping one row
    # per athlete (the inner join preserves the ranking order of ``top``).
    x = top.merge(df, on='Name')[['Name', 'Medals', 'Sport', 'region']]
    return x.drop_duplicates('Name')
71+
72+
def yearwise_medal_tally(df, country):
    """Count the medals won by ``country`` in each year.

    Rows without a medal are discarded and team medals are collapsed to one
    row per team/event before counting.

    Returns:
        A frame with columns ``Year`` and ``Medal`` (the medal count).
    """
    dedupe_keys = ['Team', 'NOC', 'Games', 'Year', 'City', 'Sport', 'Event', 'Medal']
    medals = df.dropna(subset=['Medal']).drop_duplicates(subset=dedupe_keys)
    country_medals = medals[medals['region'] == country]
    return country_medals.groupby('Year')['Medal'].count().reset_index()
80+
81+
82+
def country_event_heatmap(df, country):
    """Pivot the medals of ``country`` into a Sport x Year count table.

    Rows without a medal are discarded and team medals are collapsed to one
    row per team/event before counting.

    Returns:
        A frame indexed by ``Sport`` with one column per ``Year``; missing
        sport/year combinations are filled with 0.
    """
    dedupe_keys = ['Team', 'NOC', 'Games', 'Year', 'City', 'Sport', 'Event', 'Medal']
    medals = df.dropna(subset=['Medal']).drop_duplicates(subset=dedupe_keys)
    country_medals = medals[medals['region'] == country]
    counts = country_medals.pivot_table(
        index='Sport', columns='Year', values='Medal', aggfunc='count'
    )
    return counts.fillna(0)
89+
90+
91+
def most_successful_countrywise(df, country):
    """Return the athletes of ``country`` with the most medal rows.

    Args:
        df: Frame with ``Name``, ``Sport``, ``region`` and ``Medal`` columns.
        country: The ``region`` value to restrict the ranking to.

    Returns:
        A frame with columns ``Name``, ``Medals`` and ``Sport`` holding at
        most 10 athletes, most medals first. ``Sport`` comes from the
        athlete's first row in ``df``.
    """
    medalists = df.dropna(subset=['Medal'])
    medalists = medalists[medalists['region'] == country]

    # Name both output columns explicitly so the result does not depend on
    # the version-specific names value_counts().reset_index() would assign.
    top = (
        medalists['Name']
        .value_counts()
        .head(10)
        .rename_axis('Name')
        .reset_index(name='Medals')
    )

    # Join back on the athlete name to pick up the sport, one row each.
    x = top.merge(df, on='Name')[['Name', 'Medals', 'Sport']]
    return x.drop_duplicates('Name')
100+
101+
def weight_v_height(df, sport):
    """Return one row per athlete for the height/weight scatter plot.

    Args:
        df: Frame with ``Name``, ``region``, ``Sport`` and ``Medal`` columns.
        sport: ``'Overall'`` (surrounding whitespace is ignored) for every
            sport, otherwise a ``Sport`` value to filter on.

    Returns:
        A new frame, de-duplicated on ``Name``/``region``, whose missing
        ``Medal`` values are replaced by ``'No Medal'``. The input frame is
        left untouched.
    """
    athlete_df = df.drop_duplicates(subset=['Name', 'region'])
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection, which is not guaranteed to update the frame.
    athlete_df = athlete_df.assign(Medal=athlete_df['Medal'].fillna('No Medal'))

    # The select box passes 'Overall'; strip() also accepts the padded
    # ' Overall ' spelling this function originally compared against.
    if sport.strip() == 'Overall':
        return athlete_df

    return athlete_df[athlete_df['Sport'] == sport]
111+
112+
113+
def men_vs_women(df):
    """Count male and female athletes per year.

    Athletes are de-duplicated on ``Name``/``region`` first. The result is
    built from the years present in the male counts (left join), and years
    with no female count get 0.

    Returns:
        A frame with columns ``Year``, ``Male`` and ``Female``.
    """
    athletes = df.drop_duplicates(subset=['Name', 'region'])

    male_counts = (
        athletes[athletes['Sex'] == 'M'].groupby('Year')['Name'].count().reset_index()
    )
    female_counts = (
        athletes[athletes['Sex'] == 'F'].groupby('Year')['Name'].count().reset_index()
    )

    # The shared 'Name' column gets the default _x/_y suffixes on merge.
    combined = male_counts.merge(female_counts, on='Year', how='left')
    combined = combined.rename(columns={'Name_x': 'Male', 'Name_y': 'Female'})
    return combined.fillna(0)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import pandas as pd
2+
#df=pd.read_csv('athlete_events.csv')
3+
#region_df=pd.read_csv('noc_regions.csv')
4+
5+
def preprocess(df, region_df):
    """Prepare the raw athlete events for the analysis pages.

    Args:
        df: Raw athlete events with at least ``Season``, ``NOC`` and
            ``Medal`` columns.
        region_df: Lookup frame keyed by ``NOC``.

    Returns:
        The Summer-season rows, left-joined with ``region_df`` on ``NOC``,
        with exact duplicate rows removed and one indicator column appended
        per distinct ``Medal`` value.
    """
    # Keep only the Summer Olympics.
    summer = df[df['Season'] == 'Summer']

    # Attach the region columns; athletes with an unmapped NOC are kept.
    merged = summer.merge(region_df, on='NOC', how='left').drop_duplicates()

    # One-hot encode the medal column.
    medal_flags = pd.get_dummies(merged['Medal'])
    return pd.concat([merged, medal_flags], axis=1)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
streamlit
2+
plotly
3+
seaborn
4+
matplotlib
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Write the Streamlit server configuration used at deploy time.
# $PORT is expected to be provided by the hosting environment.
mkdir -p ~/.streamlit/

# A here-document emits real newlines in every POSIX shell; echo "\n" is only
# expanded by some shells (e.g. not by bash's builtin echo without -e).
cat > ~/.streamlit/config.toml <<EOF
[server]
port = $PORT
enableCORS = false
headless = true

EOF

0 commit comments

Comments
 (0)