diff --git a/analysis/age/main.py b/analysis/age/main.py index a766556..7c25bdc 100644 --- a/analysis/age/main.py +++ b/analysis/age/main.py @@ -30,11 +30,15 @@ def run_by_excel(cluster_by, filenames, N=5, folder_name="To_be_filled"): assert cluster_by in ["sdName", "wiwName"] level = 1 if cluster_by == "sdName" else 2 - datadir = os.path.join(BASE_DIR, "_data", folder_name) + # datadir = os.path.join(BASE_DIR, "_data", folder_name) + datadir = os.path.join(BASE_DIR, "_data") df = pd.DataFrame() for d in filenames: df_new = pd.read_excel(os.path.join(datadir, d)) df = pd.concat([df, df_new]) + df["sdName"] = df[["sdName", "wiwName"]].apply( + lambda x: "대구광역시" if x[1] == "군위군" else x[0], axis=1 + ) if level == 1: df = df[["sgId", "sdName", "name", "age", "gender"]] else: diff --git a/analysis/diversity_db.py b/analysis/diversity_db.py index a9934e6..578df56 100644 --- a/analysis/diversity_db.py +++ b/analysis/diversity_db.py @@ -29,12 +29,28 @@ def gini_simpson(data, stair=0, opts=True): total = sum(counts.values()) gs_idx = 1 - sum((n / total) * ((n - 1) / (total - 1)) for n in counts.values()) + bins = None + if isinstance(data[0], int): + bins = [0 for _ in range(4)] # 39세 이하, 40대, 50대, 60세 이상 + for age in data: + if age < 40: + bins[0] = 1 + elif age < 50: + bins[1] = 1 + elif age < 60: + bins[2] = 1 + else: + bins[3] = 1 + bins = sum(bins) + if opts: num_cats = len([c for c in counts.values() if c > 0]) if num_cats <= 1: return 0.0 max_gs_idx = (num_cats - 1) / num_cats * total / (total - 1) gs_idx /= max_gs_idx + if gs_idx > 0.8 and bins and bins < 4: + return 0.8 return gs_idx @@ -170,10 +186,12 @@ def save_to_mongo_metro(metroId: int, factor: str, stair=0, opts=True) -> None: factor_field = {"age": "age", "gender": "gender", "party": "jdName"} data = [ councilor[factor_field[factor]] - for councilor in client["council"]["metro_councilor"].find({"metroId": metroId}) + for councilor in client["council"]["metropolitan_councilor"].find( + {"metroId": metroId} + ) ] - # print(f"{metroId} {factor}") - # print(data) + print(f"{metroId} {factor}") + print(data) client["stats"].get_collection("diversity_index").update_one( {"metroId": metroId}, {"$set": {f"{factor}DiversityIndex": gini_simpson(data, stair, opts)}}, @@ -371,10 +389,10 @@ def main(): calculate_rank_metro("party") calculate_age_diversity_rank_history_metro() - save_to_mongo_national("age", stair=10) - save_to_mongo_national("gender") - save_to_mongo_national("party") - calculate_age_diversity_rank_history_national() + # save_to_mongo_national("age", stair=10) + # save_to_mongo_national("gender") + # save_to_mongo_national("party") + # calculate_age_diversity_rank_history_national() if __name__ == "__main__": diff --git a/analysis/gender_party_hist.py b/analysis/gender_party_hist.py index 6b813e5..60ba566 100644 --- a/analysis/gender_party_hist.py +++ b/analysis/gender_party_hist.py @@ -45,6 +45,9 @@ def gender_hist( df["wiwName"] = df[["sdName", "wiwName"]].apply( lambda x: change_local_name(*x), axis=1 ) + df["sdName"] = df[["sdName", "wiwName"]].apply( + lambda x: "대구광역시" if x[1] == "군위군" else x[0], axis=1 + ) if level == 0: df = df[["sgId", "name", "gender"]].groupby(by=["sgId", "gender"]).count() @@ -153,6 +156,9 @@ def party_hist(councilor_type: str, level: int, is_elected: bool, filenames: lis df["wiwName"] = df[["sdName", "wiwName"]].apply( lambda x: change_local_name(*x), axis=1 ) + df["sdName"] = df[["sdName", "wiwName"]].apply( + lambda x: "대구광역시" if x[1] == "군위군" else x[0], axis=1 + ) if level == 0: df = df[["sgId", "name", "jdName"]].groupby(by=["sgId", "jdName"]).count()