Skip to content

Commit 164e6ba

Browse files
committed
chore: Update SteamStoreCleaner to translate non-English categories and genres
1 parent 109f290 commit 164e6ba

File tree

1 file changed

+13
-0
lines changed

1 file changed

+13
-0
lines changed

steam_sales/steam_etl/cleaner.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88
import pandas as pd
99
from bs4 import BeautifulSoup
10+
from deep_translator import GoogleTranslator
1011
from tqdm import tqdm
1112

1213
from steam_sales.steam_etl.crud import bulk_ingest_clean_data
@@ -194,6 +195,7 @@ def process(self, df: pd.DataFrame) -> pd.DataFrame:
194195

195196
def run(self):
196197
steamspy_df = self.fetch_data("get_new_steamspy_data.sql")
198+
# steamspy_df = self.fetch_data("get_all_steamspy_data.sql")
197199
self.logger.info(f"{steamspy_df.shape[0]} new records found")
198200
cleaned_steamspy_df = self.process(steamspy_df)
199201
cleaned_steamspy_df.drop(columns=["name"], inplace=True)
@@ -282,10 +284,20 @@ def process_price(self, df: pd.DataFrame) -> pd.DataFrame:
282284
df.drop(columns=["is_free", "currency", "price_overview"], inplace=True)
283285
return df
284286

287+
@staticmethod
def translate(text):
    """Translate ``text`` to English on a best-effort basis.

    The source language is auto-detected by ``GoogleTranslator``.

    Args:
        text: The string to translate.

    Returns:
        The translator's result, or ``text`` unchanged when the input is
        not a non-blank string or when the translation attempt raises.
    """
    # Nothing to translate: skip building a translator for blank or
    # non-string values and hand the input straight back.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        translated = GoogleTranslator(source="auto", target="en").translate(text)
    except Exception as exc:
        # Deliberate best-effort: a failed translation must not abort the
        # cleaning run, but it should leave a trace instead of vanishing.
        import logging

        logging.getLogger(__name__).warning(
            "Translation failed for %r (%s); keeping original text", text, exc
        )
        return text
    return translated
294+
285295
def process_categories_and_genres(self, df: pd.DataFrame) -> pd.DataFrame:
    """Flatten the ``categories`` and ``genres`` columns and translate them.

    Rows where either column is missing are dropped. Each remaining cell is
    parsed with ``literal_eval`` as a sequence of mappings, and the
    ``"description"`` values are joined with ``";"``. For rows whose
    ``english`` column equals 0, the joined strings are then passed through
    ``self.translate``.

    Args:
        df: Frame containing ``categories``, ``genres`` and ``english``
            columns.

    Returns:
        A new, filtered frame with both columns flattened (and translated
        where ``english == 0``). The input frame is not modified.
    """
    columns = ["categories", "genres"]

    # Copy after filtering so the column assignments below write to an
    # independent frame instead of a slice of the caller's data.
    df = df[df["categories"].notna() & df["genres"].notna()].copy()

    for col in columns:
        df[col] = df[col].apply(
            lambda raw: ";".join(item["description"] for item in literal_eval(raw))
        )

    non_english = df["english"] == 0
    if non_english.any():
        for col in columns:
            # The same joined string recurs across many rows; translate each
            # distinct value once and map the result back onto the rows.
            distinct = df.loc[non_english, col].unique()
            translations = {value: self.translate(value) for value in distinct}
            df.loc[non_english, col] = df.loc[non_english, col].map(translations)
    return df
290302

291303
def process_controller(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -374,6 +386,7 @@ def process(self, df: pd.DataFrame) -> pd.DataFrame:
374386

375387
def run(self):
376388
steam_df = self.fetch_data("get_new_steam_data.sql")
389+
# steam_df = self.fetch_data("get_all_steam_data.sql")
377390
self.logger.info(f"{steam_df.shape[0]} new records found")
378391
cleaned_steam_df = self.process(steam_df)
379392
self.logger.info(f"Clean steam data shape: {cleaned_steam_df.shape}")

0 commit comments

Comments
 (0)