"""
Clean the data pulled by the photos_detail.py script: delete the useless
columns and reorganize the dataset into this form:

| locations | amount | time | license | content_categories | highest_comment | total_view | # noqa: E501
| -------------------------------- | -----: | ---------- | ------: | ------------------ | --------------: | ---------: | # noqa: E501
| Minneapolis, United States | 20 | 2022-10-22 | 4 | football, life | 105 | 100000 | # noqa: E501
| São José do Rio Preto SP, Brasil | 30 | 2022-10-22 | 4 | football, life | 50 | 300000 | # noqa: E501
...

Note:
content_categories will be derived from basic NLP on the tags column.
"""

# Standard library
import sys
import traceback

# Third-party
import pandas as pd


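# The module docstring notes that content_categories should come from basic
# NLP on the tags column. A minimal sketch of that step is given below,
# assuming a simple category vocabulary; the helper name and the vocabulary
# are illustrative only and are not yet wired into main().
def extract_content_categories(tags):
    """Keep only the tags that belong to an assumed category vocabulary.

    e.g. "Football, life, sunset" -> "football, life"
    """
    category_vocab = {"football", "life", "travel", "nature", "music"}  # assumed
    cleaned = [tag.strip().lower() for tag in str(tags).split(",")]
    return ", ".join(tag for tag in cleaned if tag in category_vocab)

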
def drop_empty_column(csv_path, new_csv_path):  # both arguments are path strings
    df = pd.read_csv(csv_path)
    for col in df.columns:  # scan the column list
        if "Unnamed" in col:
            # "Unnamed" columns are index columns left over from earlier to_csv calls
            df = df.drop(col, axis=1)
            print("Dropping column", col)
    df.to_csv(new_csv_path, index=False)
    print("Dropping empty columns")
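    # Design note: assuming a recent pandas, the same cleanup can be written as
    # df.loc[:, ~df.columns.str.contains("Unnamed")]; the explicit loop is kept
    # here for the per-column progress messages.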


def drop_duplicate_id(csv_path, new_csv_path):  # both arguments are path strings
    df = pd.read_csv(csv_path)
    data = df.drop_duplicates(subset=["id"])  # keep one row per photo id
    data.to_csv(new_csv_path, index=False)
    print("Dropping duplicates")


def save_new_data(
    csv_path, column_name_list, new_csv_path
):  # csv_path and new_csv_path are path strings
    """
    Copy only the useful columns into a new dataframe and save it.

    csv_path is the path of the original csv; every name in
    column_name_list must be an existing column of that csv.
    """
    df = pd.read_csv(csv_path)
    new_df = pd.DataFrame()
    for col in column_name_list:
        new_df[col] = df[col]
        print("Saving column", col)
    new_df.to_csv(new_csv_path, index=False)
    print("Saving new data to new csv")


def main():
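    # All three steps operate in place on dataset/cleaned_license10.csv:
    # hs.csv is cleaned into that file, and each later step rereads and
    # overwrites it.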
    drop_empty_column("hs.csv", "dataset/cleaned_license10.csv")
    drop_duplicate_id(
        "dataset/cleaned_license10.csv", "dataset/cleaned_license10.csv"
    )
    save_new_data(
        "dataset/cleaned_license10.csv",
        [
            "location",
            "dates",
            "license",
            "description",
            "tags",
            "views",
            "comments",
        ],
        "dataset/cleaned_license10.csv",
    )


if __name__ == "__main__":
    try:
        main()
    except SystemExit as e:
        sys.exit(e.code)
    except KeyboardInterrupt:
        print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr)
        sys.exit(130)
    except Exception:
        print("ERROR (1) Unhandled exception:", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        sys.exit(1)