|
7 | 7 | import numpy as np |
8 | 8 | import pandas as pd |
9 | 9 | from bs4 import BeautifulSoup |
| 10 | +from deep_translator import GoogleTranslator |
10 | 11 | from tqdm import tqdm |
11 | 12 |
|
12 | 13 | from steam_sales.steam_etl.crud import bulk_ingest_clean_data |
@@ -194,6 +195,7 @@ def process(self, df: pd.DataFrame) -> pd.DataFrame: |
194 | 195 |
|
195 | 196 | def run(self): |
196 | 197 | steamspy_df = self.fetch_data("get_new_steamspy_data.sql") |
| 198 | + # steamspy_df = self.fetch_data("get_all_steamspy_data.sql") |
197 | 199 | self.logger.info(f"{steamspy_df.shape[0]} new records found") |
198 | 200 | cleaned_steamspy_df = self.process(steamspy_df) |
199 | 201 | cleaned_steamspy_df.drop(columns=["name"], inplace=True) |
@@ -282,10 +284,20 @@ def process_price(self, df: pd.DataFrame) -> pd.DataFrame: |
282 | 284 | df.drop(columns=["is_free", "currency", "price_overview"], inplace=True) |
283 | 285 | return df |
284 | 286 |
|
@staticmethod
def translate(text):
    """Best-effort translation of ``text`` into English.

    The source language is auto-detected by ``GoogleTranslator``. This is a
    deliberate best-effort helper: any failure raised by the translator is
    logged and the original text is returned unchanged, so a single bad
    value cannot abort the surrounding cleaning step.

    Args:
        text: The string to translate.

    Returns:
        The translated string, or ``text`` itself when translation fails or
        the translator yields no result.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    try:
        translated = GoogleTranslator(source="auto", target="en").translate(text)
    except Exception as exc:
        # Keep the fallback behaviour, but leave a trace instead of
        # swallowing the failure silently.
        logging.getLogger(__name__).warning(
            "Translation failed for %r, keeping original text: %s", text, exc
        )
        return text

    # Defensive: never hand a missing value back to the caller.
    return text if translated is None else translated
| 294 | + |
def process_categories_and_genres(self, df: pd.DataFrame) -> pd.DataFrame:
    """Flatten the ``categories``/``genres`` columns and translate non-English rows.

    Rows where either ``categories`` or ``genres`` is missing are dropped.
    Each remaining cell is parsed with ``literal_eval`` and the
    ``"description"`` value of every item is joined with ``";"``. For rows
    whose ``english`` column equals ``0``, the flattened strings are then
    passed through ``self.translate``.

    Args:
        df: Frame containing ``categories``, ``genres`` and ``english``
            columns. It is not modified.

    Returns:
        A new, filtered DataFrame with flattened (and, where applicable,
        translated) ``categories`` and ``genres`` columns.
    """
    # Explicit copy: the column assignments below must not target a view of
    # the caller's frame (avoids SettingWithCopyWarning).
    df = df[df["categories"].notna() & df["genres"].notna()].copy()

    def _flatten(raw):
        # Each cell is parsed into an iterable of items exposing "description".
        return ";".join(item["description"] for item in literal_eval(raw))

    for col in ("categories", "genres"):
        df[col] = df[col].apply(_flatten)

    # Translation runs only after both columns have been flattened.
    non_english = df["english"] == 0
    if non_english.any():
        for col in ("genres", "categories"):
            values = df.loc[non_english, col]
            # These strings repeat heavily across rows; translate each
            # distinct value once instead of once per row.
            translations = {value: self.translate(value) for value in values.unique()}
            df.loc[non_english, col] = values.map(translations)
    return df
290 | 302 |
|
291 | 303 | def process_controller(self, df: pd.DataFrame) -> pd.DataFrame: |
@@ -374,6 +386,7 @@ def process(self, df: pd.DataFrame) -> pd.DataFrame: |
374 | 386 |
|
375 | 387 | def run(self): |
376 | 388 | steam_df = self.fetch_data("get_new_steam_data.sql") |
| 389 | + # steam_df = self.fetch_data("get_all_steam_data.sql") |
377 | 390 | self.logger.info(f"{steam_df.shape[0]} new records found") |
378 | 391 | cleaned_steam_df = self.process(steam_df) |
379 | 392 | self.logger.info(f"Clean steam data shape: {cleaned_steam_df.shape}") |
|
0 commit comments