The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago’s Billings Hospital on the survival of patients who had undergone surgery for breast cancer.
The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago’s Billings Hospital on the survival of patients who had undergone surgery for breast cancer.
Column 1: Age of patient at the time of operation (numerical)
Column 2: Patient's year of operation (year - 1900, numerical)
Column 3: Number of positive axillary nodes detected (numerical)
Column 4: Survival status (class attribute)
1 = the patient survived 5 years or longer
2 = the patient died within 5 years
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Raise matplotlib's default font size for every figure drawn below.
plt.rcParams.update({'font.size': 16})
# IMPORTANT: download haberman.csv from the classroom and upload the dataset
# to the "content" folder in the left pane of Colab before running this file.

# Load haberman.csv into a pandas DataFrame. The call must sit on its own
# line: placed after the "#" it would be part of the comment and never run.
data = pd.read_csv("haberman.csv")
"""## PART 1: Understanding the dataset
"""
# Preview the first five rows. print() keeps the output visible when the file
# is run as a plain script, where a bare expression's value is discarded.
print(data.head(5))
"""### Ques 1.2: How many data-points and features does the dataset have?
"""
# DataFrame.shape is a (rows, columns) tuple; printed so the answer is shown
# even outside a notebook cell.
print(data.shape)
"""### Ques 1.3: What are the column names in our dataset?
"""
# Printed explicitly so the column index is shown when run as a script.
print(data.columns)
"""### Ques 1.4: How many data-points for each class of column 'status' are present?
"""
# dropna=False also counts rows whose status is missing, so the per-class
# counts always add up to the total number of rows.
print(data['status'].value_counts(dropna=False))
"""## PART 2: Scatter Plot (2-D)
"""
# One point per row: 'age' on the x-axis against 'year' on the y-axis.
data.plot.scatter(x='age', y='year')
plt.show()
"""### Ques 2.1: Color the points by their class-label/status-type
"""
# HINT: Use hue dimension of the Grid
sns.set_style("whitegrid")
# The chain is wrapped in parentheses so it can legally span several lines;
# a line that starts with ".map(...)" on its own is a SyntaxError.
(
    sns.FacetGrid(data, hue='status', height=4)
    .map(plt.scatter, 'age', 'year')
    .add_legend()
)
plt.show()
"""### Ques 2.2: Make 2 separate plots each of only 1 status-type in column dimension of the grid
"""
sns.set_style("whitegrid")
# col='status' draws one facet per distinct status value, side by side.
# The chain is parenthesised so it can legally span several lines.
(
    sns.FacetGrid(data, col='status', height=4)
    .map(plt.scatter, 'age', 'year')
    .add_legend()
)
plt.show()
"""## PART 3: Pair Plots
"""
# Close the current figure before seaborn builds the pair-plot grid.
plt.close()
sns.set_style('whitegrid')
# Pairwise plots of the columns, coloured by survival status.
sns.pairplot(data, hue='status', height=5)
plt.show()
"""## PART 4: PDF and CDF
"""
sns.set_style("whitegrid")
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement (density-normalised histogram plus a KDE curve).
# The chain is parenthesised so it can legally span several lines.
(
    sns.FacetGrid(data, col='status', height=5)
    .map(sns.histplot, 'year', kde=True, stat='density')
    .add_legend()
)
plt.show()
"""### Ques 4.2: Plot the CDF and PDF of 'age' of patients who died within 5 years.
"""
# status == 2 marks the patients who died within 5 years.
df = data[data['status'] == 2]
counts, bin_edges = np.histogram(df['age'], bins=10, density=True)
# Renormalise the bin heights so they sum to 1; their running total is the CDF.
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)
# Both curves are drawn against the right-hand edge of each bin and labelled
# so they can be told apart in the figure.
plt.plot(bin_edges[1:], pdf, label='PDF')
plt.plot(bin_edges[1:], cdf, label='CDF')
plt.xlabel('age')
plt.legend()
plt.show()
"""### Ques 4.3: What percentage of people who died were less than 55 years old.
"""
# Patients who died within 5 years (status == 2) ...
df = data[data['status'] == 2]
# ... and the subset of those who were younger than 55 at the operation.
df1 = df[df['age'] < 55]
a = df.shape[0]   # number of patients who died
b = df1.shape[0]  # number of those under 55
# Share of under-55s among the patients who died, as a whole-number percentage.
c = round(b / a * 100)
print(f'percentage_dead_under_55 = {c}')