|
339 | 339 | } |
340 | 340 | ], |
341 | 341 | "source": [ |
342 | | - "with open(os.path.join(Path.sql_queries, 'get_all_game_data.sql'), \"r\") as f:\n", |
| 342 | + "with open(os.path.join(Path.sql_queries, \"get_all_game_data.sql\"), \"r\") as f:\n", |
343 | 343 | " query = text(f.read())\n", |
344 | | - " \n", |
| 344 | + "\n", |
345 | 345 | "\n", |
346 | 346 | "with get_db() as db:\n", |
347 | 347 | " result = db.execute(query)\n", |
|
371 | 371 | } |
372 | 372 | ], |
373 | 373 | "source": [ |
374 | | - "game_data['description'].iloc[10000-4-1]" |
| 374 | + "game_data[\"description\"].iloc[10000 - 4 - 1]" |
375 | 375 | ] |
376 | 376 | }, |
377 | 377 | { |
|
396 | 396 | "source": [ |
397 | 397 | "from fuzzywuzzy import process\n", |
398 | 398 | "\n", |
| 399 | + "\n", |
399 | 400 | "def get_unique(series):\n", |
400 | 401 | " \"\"\"\n", |
401 | 402 | " Returns a set of unique values from a series of strings.\n", |
|
407 | 408 | " set: A set of unique values extracted from the series.\n", |
408 | 409 | "\n", |
409 | 410 | " \"\"\"\n", |
410 | | - " return set(list(itertools.chain(*series.apply(lambda x: [c for c in x.split(';')]))))" |
| 411 | + " return set(list(itertools.chain(*series.apply(lambda x: [c for c in x.split(\";\")]))))" |
411 | 412 | ] |
412 | 413 | }, |
413 | 414 | { |
|
461 | 462 | } |
462 | 463 | ], |
463 | 464 | "source": [ |
464 | | - "geners = get_unique(game_data['genres'])\n", |
| 465 | + "geners = get_unique(game_data[\"genres\"])\n", |
465 | 466 | "geners" |
466 | 467 | ] |
467 | 468 | }, |
|
494 | 495 | "def standardize_genre(value, genre_list):\n", |
495 | 496 | " # Convert to lowercase for consistent comparison\n", |
496 | 497 | " value_lower = value.lower()\n", |
497 | | - " \n", |
| 498 | + "\n", |
498 | 499 | " # Define common patterns\n", |
499 | | - " if 'rpg' in value_lower or 'role playing' in value_lower or 'role' in value_lower:\n", |
500 | | - " return 'RPG'\n", |
501 | | - " if 'simulation' in value_lower or 'simulators' in value_lower:\n", |
502 | | - " return 'Simulation'\n", |
503 | | - " if 'adventure' in value_lower:\n", |
504 | | - " return 'Adventure'\n", |
| 500 | + " if \"rpg\" in value_lower or \"role playing\" in value_lower or \"role\" in value_lower:\n", |
| 501 | + " return \"RPG\"\n", |
| 502 | + " if \"simulation\" in value_lower or \"simulators\" in value_lower:\n", |
| 503 | + " return \"Simulation\"\n", |
| 504 | + " if \"adventure\" in value_lower:\n", |
| 505 | + " return \"Adventure\"\n", |
| 506 | + "\n", |
505 | 507 | "\n", |
506 | 508 | "# Function to standardize multiple genres\n", |
507 | 509 | "def standardize_multiple_genres(genres_str, genre_list):\n", |
508 | | - " genres = genres_str.split(';')\n", |
| 510 | + " genres = genres_str.split(\";\")\n", |
509 | 511 | " standardized_genres = [standardize_genre(genre.strip(), genre_list) for genre in genres]\n", |
510 | | - " return ';'.join(sorted(set(standardized_genres))) # Use sorted(set()) to remove duplicates and sort\n", |
511 | | - " \n", |
| 512 | + " return \";\".join(sorted(set(standardized_genres))) # Use sorted(set()) to remove duplicates and sort\n", |
| 513 | + "\n", |
512 | 514 | " # Find the best match from the list of unique genres\n", |
513 | 515 | " match, score = process.extractOne(value, genre_list)\n", |
514 | 516 | " return match\n", |
515 | 517 | "\n", |
| 518 | + "\n", |
516 | 519 | "# Apply the standardization function to the Genres column\n", |
517 | | - "game_data['genres'] = game_data['genres'].apply(lambda x: standardize_multiple_genres(x, geners))\n", |
518 | | - "geners = get_unique(game_data['genres'])\n", |
| 520 | + "game_data[\"genres\"] = game_data[\"genres\"].apply(lambda x: standardize_multiple_genres(x, geners))\n", |
| 521 | + "geners = get_unique(game_data[\"genres\"])\n", |
519 | 522 | "geners" |
520 | 523 | ] |
521 | 524 | }, |
|
615 | 618 | } |
616 | 619 | ], |
617 | 620 | "source": [ |
618 | | - "categories = get_unique(game_data['categories'])\n", |
| 621 | + "categories = get_unique(game_data[\"categories\"])\n", |
619 | 622 | "categories" |
620 | 623 | ] |
621 | 624 | }, |
|
643 | 646 | " - score: The calculated rating score as a percentage.\n", |
644 | 647 | "\n", |
645 | 648 | " \"\"\"\n", |
646 | | - " pos = row['positive_ratings']\n", |
647 | | - " neg = row['negative_ratings']\n", |
| 649 | + " pos = row[\"positive_ratings\"]\n", |
| 650 | + " neg = row[\"negative_ratings\"]\n", |
648 | 651 | "\n", |
649 | 652 | " total_reviews = pos + neg\n", |
650 | | - " \n", |
| 653 | + "\n", |
651 | 654 | " if total_reviews > 0:\n", |
652 | 655 | " average = pos / total_reviews\n", |
653 | | - " score = average - (average * 0.5) * 2**(-math.log10(total_reviews + 1))\n", |
| 656 | + " score = average - (average * 0.5) * 2 ** (-math.log10(total_reviews + 1))\n", |
654 | 657 | " return score * 100\n", |
655 | 658 | " else:\n", |
656 | 659 | " return 0.0\n", |
657 | 660 | "\n", |
658 | | - "game_data['total_ratings'] = game_data['positive_ratings'] + game_data['negative_ratings']\n", |
659 | | - "game_data['review_score'] = game_data['positive_ratings'] / game_data['total_ratings']\n", |
660 | | - "game_data['rating'] = game_data.apply(calc_rating, axis=1)" |
| 661 | + "\n", |
| 662 | + "game_data[\"total_ratings\"] = game_data[\"positive_ratings\"] + game_data[\"negative_ratings\"]\n", |
| 663 | + "game_data[\"review_score\"] = game_data[\"positive_ratings\"] / game_data[\"total_ratings\"]\n", |
| 664 | + "game_data[\"rating\"] = game_data.apply(calc_rating, axis=1)" |
661 | 665 | ] |
662 | 666 | }, |
663 | 667 | { |
|
996 | 1000 | "source": [ |
997 | 1001 | "def categorize_year(year):\n", |
998 | 1002 | " if year < 2020:\n", |
999 | | - " return 'Before 2020'\n", |
| 1003 | + " return \"Before 2020\"\n", |
1000 | 1004 | " elif 2020 <= year <= 2022:\n", |
1001 | | - " return '2020-2022'\n", |
| 1005 | + " return \"2020-2022\"\n", |
1002 | 1006 | " else:\n", |
1003 | | - " return 'After 2022'\n", |
| 1007 | + " return \"After 2022\"\n", |
| 1008 | + "\n", |
1004 | 1009 | "\n", |
1005 | | - "game_data['year'] = game_data['year'].fillna(0).astype(int) \n", |
1006 | | - "game_data['Region'] = game_data['year'].apply(categorize_year)\n", |
| 1010 | + "game_data[\"year\"] = game_data[\"year\"].fillna(0).astype(int)\n", |
| 1011 | + "game_data[\"Region\"] = game_data[\"year\"].apply(categorize_year)\n", |
1007 | 1012 | "\n", |
1008 | 1013 | "# Calculate the frequency of each year\n", |
1009 | | - "yearly_counts = game_data.groupby(['Region', 'year']).size().reset_index(name='Frequency')\n", |
| 1014 | + "yearly_counts = game_data.groupby([\"Region\", \"year\"]).size().reset_index(name=\"Frequency\")\n", |
1010 | 1015 | "\n", |
1011 | 1016 | "# Plotting using Seaborn\n", |
1012 | 1017 | "plt.figure(figsize=(12, 6))\n", |
1013 | | - "sns.barplot(data=yearly_counts, x='year', y='Frequency', hue='Region')\n", |
1014 | | - "plt.title('Game Release by Year')\n", |
1015 | | - "plt.xlabel('Year')\n", |
1016 | | - "plt.ylabel('Frequency')\n", |
| 1018 | + "sns.barplot(data=yearly_counts, x=\"year\", y=\"Frequency\", hue=\"Region\")\n", |
| 1019 | + "plt.title(\"Game Release by Year\")\n", |
| 1020 | + "plt.xlabel(\"Year\")\n", |
| 1021 | + "plt.ylabel(\"Frequency\")\n", |
1017 | 1022 | "plt.xticks(rotation=45)\n", |
1018 | 1023 | "plt.show()" |
1019 | 1024 | ] |
|
1031 | 1036 | "metadata": {}, |
1032 | 1037 | "outputs": [], |
1033 | 1038 | "source": [ |
1034 | | - "tags = col_row_df['tags']\n", |
| 1039 | + "tags = col_row_df[\"tags\"]\n", |
1035 | 1040 | "parsed_tags = tags.apply(lambda x: literal_eval(x) if x else {})\n", |
1036 | 1041 | "\n", |
1037 | 1042 | "unique_tags = set(itertools.chain(*parsed_tags))\n", |
1038 | 1043 | "\n", |
1039 | | - "print('Number of unique tags:', len(unique_tags))\n", |
| 1044 | + "print(\"Number of unique tags:\", len(unique_tags))\n", |
1040 | 1045 | "\n", |
1041 | 1046 | "# Create a DataFrame with 15 columns and 30 rows\n", |
1042 | 1047 | "num_columns = 15\n", |
|
1045 | 1050 | "unique_tags = sorted(list(unique_tags))\n", |
1046 | 1051 | "\n", |
1047 | 1052 | "# Reshape the list into the desired DataFrame shape\n", |
1048 | | - "ut = [unique_tags[i * num_columns:(i + 1) * num_columns] for i in range(num_rows)]\n", |
| 1053 | + "ut = [unique_tags[i * num_columns : (i + 1) * num_columns] for i in range(num_rows)]\n", |
1049 | 1054 | "\n", |
1050 | 1055 | "# Create the DataFrame\n", |
1051 | 1056 | "utdf = pd.DataFrame(ut)\n", |
|
1079 | 1084 | "metadata": {}, |
1080 | 1085 | "outputs": [], |
1081 | 1086 | "source": [ |
1082 | | - "langs = col_row_df['languages']\n", |
1083 | | - "langs = langs.apply(lambda x: x.split(', ') if x else [])\n", |
| 1087 | + "langs = col_row_df[\"languages\"]\n", |
| 1088 | + "langs = langs.apply(lambda x: x.split(\", \") if x else [])\n", |
1084 | 1089 | "\n", |
1085 | 1090 | "langc = Counter()\n", |
1086 | 1091 | "\n", |
|
0 commit comments