Skip to content

Commit 18e007a

Browse files
committed
Applying best practices to our notebook report.
1 parent e5aa364 commit 18e007a

File tree

7 files changed

+1637
-443
lines changed

7 files changed

+1637
-443
lines changed
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"""
2+
Video Game Sales Analysis Package
3+
4+
This package provides comprehensive tools for analyzing video game sales data.
5+
6+
Modules:
7+
--------
8+
- data_loader: Functions for loading and configuring data analysis environment
9+
- data_analysis: Core analysis functions for sales data exploration
10+
- utils: Utility functions for suggestions and reporting
11+
12+
Example Usage:
13+
--------------
14+
from data_loader import load_game_sales_data, setup_display_options
15+
from data_analysis import analyze_missing_data, get_top_performers
16+
17+
# Set up environment
18+
setup_display_options()
19+
20+
# Load and analyze data
21+
df = load_game_sales_data()
22+
missing_info = analyze_missing_data(df)
23+
top_games = get_top_performers(df, 'Global_Sales', 10)
24+
"""
25+
26+
# Package metadata exposed as module-level attributes.
__version__ = "1.0.0"
__author__ = "Video Game Sales Research Team"
28+
29+
# Import main functions for easy access
30+
from .data_loader import setup_display_options, load_game_sales_data
31+
from .data_analysis import (
32+
analyze_missing_data,
33+
get_top_performers,
34+
analyze_distribution,
35+
calculate_regional_breakdown,
36+
analyze_publishers,
37+
analyze_year_trends,
38+
get_regional_market_share,
39+
generate_summary_statistics,
40+
)
41+
from .utils import suggest_next_analysis_steps, print_analysis_complete_message
42+
43+
# Names exported by `from <package> import *`; one entry per function
# imported above, grouped by the submodule it comes from.
__all__ = [
    # .data_loader
    "setup_display_options",
    "load_game_sales_data",
    # .data_analysis
    "analyze_missing_data",
    "get_top_performers",
    "analyze_distribution",
    "calculate_regional_breakdown",
    "analyze_publishers",
    "analyze_year_trends",
    "get_regional_market_share",
    "generate_summary_statistics",
    # .utils
    "suggest_next_analysis_steps",
    "print_analysis_complete_message",
]
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
"""
2+
Data analysis functions for video game sales research.
3+
4+
This module provides comprehensive analysis functions for exploring video game sales data,
5+
including missing data analysis, distribution analysis, and market insights.
6+
"""
7+
8+
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
10+
11+
12+
def analyze_missing_data(df: pd.DataFrame) -> Dict[str, int]:
    """
    Report which columns contain missing values.

    Prints one line per affected column (count and share of all rows),
    or a single confirmation line when nothing is missing.

    Args:
        df: DataFrame to inspect

    Returns:
        Mapping of column name to number of missing values; columns
        without missing values are left out
    """
    null_counts = df.isnull().sum()
    gaps = null_counts[null_counts > 0].to_dict()

    print('=== Missing Values Analysis ===')
    if not gaps:
        print('✅ No missing values found!')
        return gaps

    row_total = len(df)
    for name, missing in gaps.items():
        print(f'📊 {name}: {missing:,} missing values ({missing / row_total * 100:.2f}%)')

    return gaps
33+
34+
35+
def get_top_performers(df: pd.DataFrame, column: str, n: int = 10) -> pd.DataFrame:
    """
    Return the rows holding the N largest values of a column.

    Args:
        df: DataFrame to rank
        column: Name of the column whose values decide the ranking
        n: How many rows to keep (defaults to 10)

    Returns:
        DataFrame with the selected rows, largest value first
    """
    top_rows = df.nlargest(n=n, columns=column)
    return top_rows
48+
49+
50+
def analyze_distribution(df: pd.DataFrame, column: str, top_n: int = 15) -> pd.Series:
    """
    Summarize how often each value occurs in a column.

    Prints the most frequent values followed by the number of distinct
    values in the column.

    Args:
        df: DataFrame to inspect
        column: Name of the column to count
        top_n: How many of the most frequent values to keep

    Returns:
        Series of occurrence counts for the top_n most frequent values
    """
    values = df[column]
    most_common = values.value_counts().head(top_n)

    print(f'=== {column} Distribution (Top {top_n}) ===')
    print(most_common)
    print(f'\nTotal unique values: {values.nunique():,}')

    return most_common
69+
70+
71+
def calculate_regional_breakdown(
    df: pd.DataFrame,
    game_names: Union[List[str], str, int],
) -> pd.DataFrame:
    """
    Calculate regional sales breakdown for specific games.

    Args:
        df: DataFrame containing sales data; must provide the columns
            'Name', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'
            and 'Global_Sales'
        game_names: One of
            - an integer N: the first N rows of df are returned (these are
              the top N games only if df is already sorted by sales)
            - a list of game names: rows whose 'Name' is in the list
            - a single game name: treated as a one-element list

    Returns:
        DataFrame restricted to the name and regional sales columns
    """
    regional_cols = ['Name', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

    if isinstance(game_names, int):
        # Integer: take the leading rows in their existing order
        return df.head(game_names)[regional_cols]

    if isinstance(game_names, str):
        # Series.isin() rejects a bare string, so wrap a single name in a list
        game_names = [game_names]

    # Names: keep every row whose 'Name' matches one of the requested games
    return df.loc[df['Name'].isin(game_names), regional_cols]
91+
92+
93+
def analyze_publishers(df: pd.DataFrame, top_n: int = 10) -> Tuple[pd.Series, pd.Series]:
    """
    Rank publishers two ways: by catalogue size and by total global sales.

    Both rankings are printed (sales rounded to two decimals for display)
    and returned unrounded.

    Args:
        df: DataFrame with 'Publisher' and 'Global_Sales' columns
        top_n: How many publishers to keep in each ranking

    Returns:
        Tuple of (games_count, total_sales) Series, each limited to top_n
        publishers
    """
    print(f'=== Top {top_n} Publishers by Number of Games ===')
    titles_per_publisher = df['Publisher'].value_counts()
    by_title_count = titles_per_publisher.head(top_n)
    print(by_title_count)

    print(f'\n=== Top {top_n} Publishers by Total Global Sales ===')
    sales_per_publisher = df.groupby('Publisher')['Global_Sales'].sum()
    ranked_by_sales = sales_per_publisher.sort_values(ascending=False)
    by_total_sales = ranked_by_sales.head(top_n)
    print(by_total_sales.round(2))

    return by_title_count, by_total_sales
113+
114+
115+
def analyze_year_trends(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Analyze gaming trends by year.

    Rows with a missing 'Year' are ignored. Prints the covered year range,
    the median release year and the year with the most games.

    Args:
        df: DataFrame to analyze; must contain a 'Year' column

    Returns:
        Dictionary with year analysis results:
            - 'earliest_year', 'latest_year', 'median_year': min, max and
              median of the non-missing years (NaN for a numeric 'Year'
              column in which every value is missing)
            - 'games_per_year': describe() summary of the number of games
              per year
    """
    # Filter out missing years
    df_with_years = df.dropna(subset=['Year'])
    years = df_with_years['Year']

    # Count games per year once; used for both the summary and the peak year
    games_by_year = df_with_years.groupby('Year').size()

    # Calculate basic year statistics
    year_stats = {
        'earliest_year': years.min(),
        'latest_year': years.max(),
        'median_year': years.median(),
        'games_per_year': games_by_year.describe(),
    }

    print('=== Gaming Industry Timeline ===')
    if games_by_year.empty:
        # idxmax() raises on an empty Series, so report the gap instead of crashing
        print('📅 No release year data available')
        return year_stats

    print(f'📅 Dataset covers: {year_stats["earliest_year"]:.0f} - {year_stats["latest_year"]:.0f}')
    print(f'📊 Median release year: {year_stats["median_year"]:.0f}')
    peak_year = games_by_year.idxmax()
    peak_count = games_by_year.max()
    print(f'🎮 Peak gaming year: {peak_year:.0f} ({peak_count} games)')

    return year_stats
145+
146+
147+
def get_regional_market_share(df: pd.DataFrame) -> Dict[str, float]:
    """
    Work out each region's percentage of combined regional sales.

    The denominator is the sum of the four regional columns, not the
    'Global_Sales' column. Each share is printed with one decimal.

    Args:
        df: DataFrame with 'NA_Sales', 'EU_Sales', 'JP_Sales' and
            'Other_Sales' columns

    Returns:
        Mapping of region label to its share in percent
    """
    region_labels = ('North America', 'Europe', 'Japan', 'Other')
    sales_columns = ('NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales')

    # Column totals, in the same order as the labels
    na_sum, eu_sum, jp_sum, other_sum = (df[col].sum() for col in sales_columns)
    combined = na_sum + eu_sum + jp_sum + other_sum

    market_share = {
        label: (subtotal / combined) * 100
        for label, subtotal in zip(region_labels, (na_sum, eu_sum, jp_sum, other_sum))
    }

    print('=== Global Market Share by Region ===')
    for label in region_labels:
        print(f'🌍 {label}: {market_share[label]:.1f}%')

    return market_share
175+
176+
177+
def generate_summary_statistics(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Generate comprehensive summary statistics for the dataset.

    Prints a short report and returns the underlying numbers.

    Args:
        df: DataFrame to analyze; must contain the columns 'Name',
            'Platform', 'Year', 'Genre', 'Publisher' and 'Global_Sales'

    Returns:
        Dictionary with summary statistics:
            - 'total_games': number of rows
            - 'total_sales', 'average_sales': sum and mean of 'Global_Sales'
            - 'unique_platforms', 'unique_genres', 'unique_publishers':
              distinct-value counts
            - 'year_range': (min, max) of 'Year'
            - 'top_selling_game': 'Name' of the row with the highest
              'Global_Sales', or None when there is no sales value at all
            - 'nintendo_dominance': percentage of rows whose 'Publisher' is
              'Nintendo' (0.0 for an empty DataFrame)
    """
    total_games = len(df)
    global_sales = df['Global_Sales']

    # Look the best seller up by position: idxmax() + .loc yields a Series
    # instead of a single name when index labels repeat, and idxmax() raises
    # when there is nothing to compare.
    if global_sales.notna().any():
        top_selling_game = df['Name'].iloc[global_sales.argmax()]
    else:
        top_selling_game = None

    # Guard the percentage against an empty DataFrame
    if total_games:
        nintendo_dominance = (df['Publisher'] == 'Nintendo').sum() / total_games * 100
    else:
        nintendo_dominance = 0.0

    summary = {
        'total_games': total_games,
        'total_sales': global_sales.sum(),
        'average_sales': global_sales.mean(),
        'unique_platforms': df['Platform'].nunique(),
        'unique_genres': df['Genre'].nunique(),
        'unique_publishers': df['Publisher'].nunique(),
        'year_range': (df['Year'].min(), df['Year'].max()),
        'top_selling_game': top_selling_game,
        'nintendo_dominance': nintendo_dominance,
    }

    top_label = 'N/A' if top_selling_game is None else top_selling_game

    print('=== Dataset Summary Statistics ===')
    print(f'🎮 Total games analyzed: {summary["total_games"]:,}')
    print(f'💰 Total global sales: {summary["total_sales"]:.2f}M copies')
    print(f'📊 Average sales per game: {summary["average_sales"]:.2f}M copies')
    print(f'🕹️ Unique platforms: {summary["unique_platforms"]}')
    print(f'🎯 Unique genres: {summary["unique_genres"]}')
    print(f'🏢 Unique publishers: {summary["unique_publishers"]}')
    print(f'👑 Top selling game: {top_label}')
    print(f'🎯 Nintendo market presence: {summary["nintendo_dominance"]:.1f}% of all games')

    return summary
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""
2+
Data loading and configuration utilities for video game sales analysis.
3+
4+
This module provides functions for loading data and setting up the analysis environment.
5+
"""
6+
7+
import pandas as pd
8+
import matplotlib.pyplot as plt
9+
import seaborn as sns
10+
11+
12+
13+
def setup_display_options() -> None:
    """Apply the pandas, matplotlib and seaborn defaults used by the analysis.

    Sets three pandas display options (all columns, at most 20 rows,
    two-decimal floats), selects the 'default' matplotlib style and the
    'husl' seaborn palette, then sets the default figure size and DPI.
    """
    # Pandas display options
    pandas_options = {
        'display.max_columns': None,
        'display.max_rows': 20,
        'display.float_format': '{:.2f}'.format,
    }
    for option_name, option_value in pandas_options.items():
        pd.set_option(option_name, option_value)

    # Matplotlib and seaborn styling
    plt.style.use('default')
    sns.set_palette('husl')

    # Figure defaults, set after the style has been applied
    figure_defaults = {
        'figure.figsize': [12, 8],
        'figure.dpi': 100,
    }
    for rc_key, rc_value in figure_defaults.items():
        plt.rcParams[rc_key] = rc_value
27+
28+
29+
def load_game_sales_data(filepath: str = 'vgsales.csv') -> pd.DataFrame:
    """
    Read the video game sales CSV into a DataFrame.

    On success a short summary (source, shape, column names) is printed.
    On failure an error line is printed and the original exception is
    re-raised unchanged.

    Args:
        filepath: Location of the CSV file (defaults to 'vgsales.csv')

    Returns:
        DataFrame with the contents of the CSV file

    Raises:
        FileNotFoundError: If the CSV file doesn't exist
        pd.errors.EmptyDataError: If the CSV file is empty
    """
    try:
        sales_frame = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"❌ Error: File '{filepath}' not found")
        raise
    except pd.errors.EmptyDataError:
        print(f"❌ Error: File '{filepath}' is empty")
        raise
    else:
        print(f'✅ Dataset loaded successfully from {filepath}')
        print(f'📊 Shape: {sales_frame.shape}')
        print(f'📋 Columns: {list(sales_frame.columns)}')
        return sales_frame

0 commit comments

Comments
 (0)