Skip to content

Commit 5e197ba

Browse files
committed
Merge branch 'MeghanaNalla/main' into issue60
2 parents 7462cca + a21a71b commit 5e197ba

File tree

3 files changed

+12238
-155
lines changed

3 files changed

+12238
-155
lines changed

README.md

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,20 +39,19 @@ modules:
3939
- macOS:
4040
1. Install [Homebrew][homebrew]
4141
2. Install pipenv:
42-
```
42+
```shell
4343
brew install pipenv
4444
```
45-
3. Create the Python virtual environment and install prerequisites using:
46-
`pipenv`:
47-
```
48-
shell pipenv sync --dev
49-
```
50-
4. Then you can run the static analysis tools:
45+
3. Create the Python virtual environment and install prerequisites using
46+
`pipenv`:
47+
```shell
48+
pipenv sync --dev
5149
```
52-
shell ./tools.sh
50+
4. Then you can run the static analysis tools:
51+
```shell
52+
./tools.sh
5353
```
5454

55-
5655
[pipenvdocs]: https://pipenv.pypa.io/en/latest/
5756
[homebrew]: https://brew.sh/
5857
[pipenvinstall]: https://pipenv.pypa.io/en/latest/install/#installing-pipenv
@@ -82,7 +81,7 @@ To successfully run scripts that require client credentials, you will need to fo
8281
- [flake8][flake8]: a python tool that glues together pep8, pyflakes, mccabe,
8382
and third-party plugins to check the style and quality of some python code.
8483
- [isort][isort]: A Python utility / library to sort imports
85-
- It doesn't import any libraries, it only sorts and formats them.
84+
- (It doesn't import any libraries, it only sorts and formats them.)
8685

8786
[ccospyguide]: https://opensource.creativecommons.org/contributing-code/python-guidelines/
8887
[black]: https://github.com/psf/black

flickr/data_cleaning.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
"""
2+
This is to clean the data pulled by the photos_detail.py script so as to
3+
remove unneeded columns and reorganize the dataset into this form:
4+
5+
| locations | amount | time | license | content_categories | highest_comment | total_view | # noqa: E501
6+
| -------------------------------- | -----: | ---------- | ------: | ------------------ | --------------: | ---------: | # noqa: E501
7+
| Minneapolis, United States | 20 | 2022-10-22 | 4 | football, life | 105 | 100000 | # noqa: E501
8+
| São José do Rio Preto SP, Brasil | 30 | 2022-10-22 | 4 | football, life | 50 | 300000 | # noqa: E501
9+
...
10+
11+
Note:
12+
content_categories will be derived from basic NLP on the tags column
13+
"""
14+
15+
# Standard library
16+
import sys
17+
import traceback
18+
19+
# Third-party
20+
import pandas as pd
21+
22+
23+
def drop_empty_column(csv_path, new_csv_path):
    """
    Remove every "Unnamed" column from a CSV file and save the result.

    csv_path is the path of the CSV file to read.
    new_csv_path is the path the cleaned CSV is written to (it may be the
    same as csv_path).

    Columns whose name contains "Unnamed" are the ones pandas creates for
    header cells that are empty, e.g. an index column that was written
    without a label.
    """
    df = pd.read_csv(csv_path)
    unnamed_columns = [col for col in df.columns if "Unnamed" in col]
    for col in unnamed_columns:
        print("Dropping column", col)
    # Drop all matches in a single call so that none of them survives, and so
    # that the result is still defined when there is nothing to drop.
    data = df.drop(columns=unnamed_columns)
    # index=False: writing the index would add a new unlabeled column that
    # shows up as "Unnamed: 0" the next time the file is read.
    data.to_csv(new_csv_path, index=False)
    print("Dropping empty columns")
31+
32+
33+
def drop_duplicate_id(csv_path, new_csv_path):
    """
    Remove rows with a repeated "id" value from a CSV file and save the result.

    csv_path is the path of the CSV file to read; it must have an "id"
    column.
    new_csv_path is the path the de-duplicated CSV is written to (it may be
    the same as csv_path).

    The first row seen for each id is the one that is kept.
    """
    df = pd.read_csv(csv_path)
    data = df.drop_duplicates(subset=["id"])
    # index=False: writing the index would add an unlabeled column that is
    # read back as "Unnamed: 0".
    data.to_csv(new_csv_path, index=False)
    print("Dropping duplicates")
38+
39+
40+
def save_new_data(csv_path, column_name_list, new_csv_path):
    """
    Save a copy of a CSV file that keeps only the requested columns.

    csv_path is the path of the original CSV file.
    column_name_list lists the columns to keep, in output order; every name
    must be an existing column of the original CSV.
    new_csv_path is the path the reduced CSV is written to (it may be the
    same as csv_path).

    Raises KeyError if any requested column is missing from the original CSV.
    """
    df = pd.read_csv(csv_path)
    # A name listed twice still yields a single output column.
    wanted_columns = list(dict.fromkeys(column_name_list))
    missing_columns = [col for col in wanted_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"Columns not found in {csv_path}: {missing_columns}"
        )
    for col in wanted_columns:
        print("Saving column", col)
    new_df = df[wanted_columns]
    # index=False: keep the output limited to the requested columns instead
    # of adding an unlabeled index column.
    new_df.to_csv(new_csv_path, index=False)
    print("Saving new data to new csv")
57+
58+
59+
def main():
    """
    Run the cleaning steps in order: drop unnamed columns from hs.csv, drop
    rows with duplicate ids, then keep only the columns of interest. Each
    step after the first reads and overwrites the same cleaned CSV file.
    """
    cleaned_csv = "dataset/cleaned_license10.csv"
    kept_columns = [
        "location",
        "dates",
        "license",
        "description",
        "tags",
        "views",
        "comments",
    ]

    drop_empty_column("hs.csv", cleaned_csv)
    drop_duplicate_id(cleaned_csv, cleaned_csv)
    save_new_data(cleaned_csv, kept_columns, cleaned_csv)
77+
78+
79+
if __name__ == "__main__":
    # Script entry point: run main() and translate the outcome into a process
    # exit status.
    try:
        main()
    except SystemExit as e:
        # Propagate an explicit exit request with its original code.
        sys.exit(e.code)
    except KeyboardInterrupt:
        print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr)
        sys.exit(130)
    except Exception:
        print("ERROR (1) Unhandled exception:", file=sys.stderr)
        # format_exc() returns the traceback text; print_exc() writes it
        # itself and returns None, which would print a stray "None" line.
        print(traceback.format_exc(), file=sys.stderr)
        sys.exit(1)

0 commit comments

Comments
 (0)