|
| 1 | +""" |
| 2 | +Data analysis functions for video game sales research. |
| 3 | +
|
| 4 | +This module provides comprehensive analysis functions for exploring video game sales data, |
| 5 | +including missing data analysis, distribution analysis, and market insights. |
| 6 | +""" |
| 7 | + |
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
| 10 | + |
| 11 | + |
def analyze_missing_data(df: pd.DataFrame) -> Dict[str, int]:
    """
    Summarize missing values per column and print a short report.

    Args:
        df: DataFrame to analyze

    Returns:
        Dictionary mapping column name to missing-value count, restricted
        to columns that actually contain missing entries.
    """
    null_counts = df.isnull().sum()
    report = {col: n for col, n in null_counts.items() if n > 0}

    print('=== Missing Values Analysis ===')
    if not report:
        print('✅ No missing values found!')
    else:
        row_total = len(df)
        for col, n in report.items():
            print(f'📊 {col}: {n:,} missing values ({n / row_total * 100:.2f}%)')

    return report
| 33 | + |
| 34 | + |
def get_top_performers(df: pd.DataFrame, column: str, n: int = 10) -> pd.DataFrame:
    """
    Return the n rows with the largest values in the given column.

    Args:
        df: DataFrame to analyze
        column: Column to rank by
        n: Number of rows to return

    Returns:
        DataFrame containing the top n rows (ties broken by first occurrence,
        per pandas ``nlargest`` semantics).
    """
    top_rows = df.nlargest(n, column)
    return top_rows
| 48 | + |
| 49 | + |
def analyze_distribution(df: pd.DataFrame, column: str, top_n: int = 15) -> pd.Series:
    """
    Print and return the most frequent values of a column.

    Args:
        df: DataFrame to analyze
        column: Column whose value frequencies are reported
        top_n: How many of the most frequent values to include

    Returns:
        Series of the top_n value counts, most frequent first.
    """
    counts = df[column].value_counts()
    top_counts = counts.head(top_n)

    print(f'=== {column} Distribution (Top {top_n}) ===')
    print(top_counts)
    unique_total = df[column].nunique()
    print(f'\nTotal unique values: {unique_total:,}')

    return top_counts
| 69 | + |
| 70 | + |
def calculate_regional_breakdown(df: pd.DataFrame, game_names: Union[List[str], int]) -> pd.DataFrame:
    """
    Extract the regional sales columns for selected games.

    Args:
        df: DataFrame containing sales data
        game_names: Either a list of game names to filter on, or an
            integer N meaning "take the first N rows".

    Returns:
        DataFrame restricted to name + regional sales columns.
    """
    regional_cols = ['Name', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

    if isinstance(game_names, int):
        # NOTE(review): head(N) is "top N" only if df is already sorted by
        # sales — confirm callers pass a pre-sorted frame.
        subset = df.head(game_names)
    else:
        subset = df[df['Name'].isin(game_names)]

    return subset[regional_cols]
| 91 | + |
| 92 | + |
def analyze_publishers(df: pd.DataFrame, top_n: int = 10) -> Tuple[pd.Series, pd.Series]:
    """
    Rank publishers by game count and by total global sales.

    Args:
        df: DataFrame to analyze
        top_n: Number of top publishers to report

    Returns:
        Tuple of (games per publisher, total sales per publisher), each a
        Series limited to the top_n entries.
    """
    print(f'=== Top {top_n} Publishers by Number of Games ===')
    games_per_publisher = df['Publisher'].value_counts().head(top_n)
    print(games_per_publisher)

    print(f'\n=== Top {top_n} Publishers by Total Global Sales ===')
    sales_per_publisher = (
        df.groupby('Publisher')['Global_Sales']
        .sum()
        .sort_values(ascending=False)
        .head(top_n)
    )
    print(sales_per_publisher.round(2))

    return games_per_publisher, sales_per_publisher
| 113 | + |
| 114 | + |
def analyze_year_trends(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Analyze gaming trends by year and print a short timeline report.

    Args:
        df: DataFrame with a 'Year' column (rows with NaN years are ignored).

    Returns:
        Dictionary with keys:
            'earliest_year': minimum release year
            'latest_year': maximum release year
            'median_year': median release year
            'games_per_year': describe() summary of per-year game counts
    """
    # Rows without a release year cannot contribute to the timeline.
    df_with_years = df.dropna(subset=['Year'])

    # Compute per-year counts once and reuse them for both the summary
    # statistics and the peak-year report (previously the groupby ran twice).
    games_by_year = df_with_years.groupby('Year').size()

    year_stats = {
        'earliest_year': df_with_years['Year'].min(),
        'latest_year': df_with_years['Year'].max(),
        'median_year': df_with_years['Year'].median(),
        'games_per_year': games_by_year.describe(),
    }

    print('=== Gaming Industry Timeline ===')
    print(f'📅 Dataset covers: {year_stats["earliest_year"]:.0f} - {year_stats["latest_year"]:.0f}')
    print(f'📊 Median release year: {year_stats["median_year"]:.0f}')
    peak_year = games_by_year.idxmax()
    peak_count = games_by_year.max()
    print(f'🎮 Peak gaming year: {peak_year:.0f} ({peak_count} games)')

    return year_stats
| 145 | + |
| 146 | + |
def get_regional_market_share(df: pd.DataFrame) -> Dict[str, float]:
    """
    Compute each region's percentage share of combined global sales.

    Args:
        df: DataFrame with NA_Sales, EU_Sales, JP_Sales and Other_Sales columns

    Returns:
        Dictionary mapping region label to its market-share percentage.
    """
    regional_totals = {
        'North America': df['NA_Sales'].sum(),
        'Europe': df['EU_Sales'].sum(),
        'Japan': df['JP_Sales'].sum(),
        'Other': df['Other_Sales'].sum(),
    }
    combined = sum(regional_totals.values())

    market_share = {
        region: (total / combined) * 100
        for region, total in regional_totals.items()
    }

    print('=== Global Market Share by Region ===')
    for region, share in market_share.items():
        print(f'🌍 {region}: {share:.1f}%')

    return market_share
| 175 | + |
| 176 | + |
def generate_summary_statistics(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Generate comprehensive summary statistics for the dataset.

    Args:
        df: DataFrame with Name, Platform, Genre, Publisher, Year and
            Global_Sales columns.

    Returns:
        Dictionary with keys: total_games, total_sales, average_sales,
        unique_platforms, unique_genres, unique_publishers,
        year_range (min, max tuple), top_selling_game,
        nintendo_dominance (percentage of games published by Nintendo).
    """
    # NOTE: fixed annotation — the original used the builtin `any` instead
    # of `typing.Any` in the return type.
    summary = {
        'total_games': len(df),
        'total_sales': df['Global_Sales'].sum(),
        'average_sales': df['Global_Sales'].mean(),
        'unique_platforms': df['Platform'].nunique(),
        'unique_genres': df['Genre'].nunique(),
        'unique_publishers': df['Publisher'].nunique(),
        'year_range': (df['Year'].min(), df['Year'].max()),
        # idxmax locates the row of the single best-selling title.
        'top_selling_game': df.loc[df['Global_Sales'].idxmax(), 'Name'],
        'nintendo_dominance': (df['Publisher'] == 'Nintendo').sum() / len(df) * 100,
    }

    print('=== Dataset Summary Statistics ===')
    print(f'🎮 Total games analyzed: {summary["total_games"]:,}')
    print(f'💰 Total global sales: {summary["total_sales"]:.2f}M copies')
    print(f'📊 Average sales per game: {summary["average_sales"]:.2f}M copies')
    print(f'🕹️ Unique platforms: {summary["unique_platforms"]}')
    print(f'🎯 Unique genres: {summary["unique_genres"]}')
    print(f'🏢 Unique publishers: {summary["unique_publishers"]}')
    print(f'👑 Top selling game: {summary["top_selling_game"]}')
    print(f'🎯 Nintendo market presence: {summary["nintendo_dominance"]:.1f}% of all games')

    return summary
0 commit comments