"""
Clean the data pulled by the photos_detail.py script: delete the useless
columns and reorganize the dataset into this form:

| locations | amount | time | license | content_categories | highest_comment | total_view | # noqa: E501
| -------------------------------- | -----: | ---------- | ------: | ------------------ | --------------: | ---------: | # noqa: E501
| Minneapolis, United States | 20 | 2022-10-22 | 4 | football, life | 105 | 100000 | # noqa: E501
| São José do Rio Preto SP, Brasil | 30 | 2022-10-22 | 4 | football, life | 50 | 300000 | # noqa: E501
...

Note:
content_categories will be derived from basic NLP on the tags column.
"""

# Standard library
import sys
import traceback

# Third-party
import pandas as pd


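# The module docstring notes that content_categories should come from basic
# NLP on the tags column. A minimal sketch of that step is given below,
# assuming a simple category vocabulary; the helper name and the vocabulary
# are illustrative only and are not yet wired into main().
def extract_content_categories(tags):
    """Keep only the tags that belong to an assumed category vocabulary.

    e.g. "Football, life, sunset" -> "football, life"
    """
    category_vocab = {"football", "life", "travel", "nature", "music"}  # assumed
    cleaned = [tag.strip().lower() for tag in str(tags).split(",")]
    return ", ".join(tag for tag in cleaned if tag in category_vocab)

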
def drop_empty_column(csv_path, new_csv_path):  # both arguments are path strings
    df = pd.read_csv(csv_path)
    for col in df.columns:  # scan the column list
        if "Unnamed" in col:
            # "Unnamed" columns are index columns left over from earlier to_csv calls
            df = df.drop(col, axis=1)
            print("Dropping column", col)
    df.to_csv(new_csv_path, index=False)
    print("Dropping empty columns")
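    # Design note: assuming a recent pandas, the same cleanup can be written as
    # df.loc[:, ~df.columns.str.contains("Unnamed")]; the explicit loop is kept
    # here for the per-column progress messages.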


def drop_duplicate_id(csv_path, new_csv_path):  # both arguments are path strings
    df = pd.read_csv(csv_path)
    data = df.drop_duplicates(subset=["id"])  # keep one row per photo id
    data.to_csv(new_csv_path, index=False)
    print("Dropping duplicates")


def save_new_data(
    csv_path, column_name_list, new_csv_path
):  # csv_path and new_csv_path are path strings
    """
    Copy only the useful columns into a new dataframe and save it.

    csv_path is the path of the original csv; every name in
    column_name_list must be an existing column of that csv.
    """
    df = pd.read_csv(csv_path)
    new_df = pd.DataFrame()
    for col in column_name_list:
        new_df[col] = df[col]
        print("Saving column", col)
    new_df.to_csv(new_csv_path, index=False)
    print("Saving new data to new csv")


def main():
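    # All three steps operate in place on dataset/cleaned_license10.csv:
    # hs.csv is cleaned into that file, and each later step rereads and
    # overwrites it.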
    drop_empty_column("hs.csv", "dataset/cleaned_license10.csv")
    drop_duplicate_id(
        "dataset/cleaned_license10.csv", "dataset/cleaned_license10.csv"
    )
    save_new_data(
        "dataset/cleaned_license10.csv",
        [
            "location",
            "dates",
            "license",
            "description",
            "tags",
            "views",
            "comments",
        ],
        "dataset/cleaned_license10.csv",
    )


if __name__ == "__main__":
    try:
        main()
    except SystemExit as e:
        sys.exit(e.code)
    except KeyboardInterrupt:
        print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr)
        sys.exit(130)
    except Exception:
        print("ERROR (1) Unhandled exception:", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        sys.exit(1)