# DataScience/remove_outlier.py at main · jimin61445/DataScience · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pandas as pd
import numpy as np

# Time-slot columns screened for outliers, in chronological order.
# NOTE: the earlier version of this list skipped '11-12시간대', so that hour
# was never checked; it is included here so every slot is screened.
time_columns = np.array([
    '06시이전', '06-07시간대', '07-08시간대', '08-09시간대', '09-10시간대',
    '10-11시간대', '11-12시간대', '12-13시간대', '13-14시간대', '14-15시간대',
    '15-16시간대', '16-17시간대', '17-18시간대', '18-19시간대', '19-20시간대',
    '20-21시간대', '21-22시간대', '22-23시간대', '23-24시간대', '24시이후',
])


def remove_outliers(df, columns=None, group_keys=('역번호', '승하차구분'),
                    threshold=1.96):
    """Drop rows whose z-score exceeds ``threshold`` in any time-slot column.

    Rows are grouped by ``group_keys`` and the z-score of every column in
    ``columns`` is computed inside each group (population standard
    deviation, ``ddof=0``).  A row is removed when ``|z| > threshold`` for at
    least one column.  Per-column diagnostics are printed while processing.

    Args:
        df: Input frame containing the grouping keys and the time-slot columns.
        columns: Column names to screen; defaults to module-level
            ``time_columns``.
        group_keys: Columns to group by.  With the defaults, a group key of
            (150, 0) is station 150 boarding data and (150, 1) is station 150
            alighting data.
        threshold: Absolute z-score above which a value counts as an outlier
            (1.96 corresponds to a two-sided 95% interval).

    Returns:
        Tuple ``(cleaned_df, removed_count)``: the surviving rows concatenated
        in group order, and the number of rows dropped.

    Raises:
        KeyError: If any requested column is absent from ``df``.
    """
    if columns is None:
        columns = time_columns

    # Fail early with a readable message instead of deep inside the loop.
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise KeyError(f"time-slot columns not found in data: {missing}")

    kept_groups = []
    total_removed_num = 0

    for group_key, group_df in df.groupby(list(group_keys)):
        print(f"[{group_key}번째 그룹 값]")
        # Positional mask of rows flagged in at least one time slot; a mask
        # (rather than a set of labels) stays correct with duplicate labels.
        outlier_mask = np.zeros(len(group_df), dtype=bool)

        for t in columns:
            print(f"{t} 시간의 이상치 제거")
            values = group_df[t]
            mean = values.mean()
            std = values.std(ddof=0)
            # A zero std yields NaN z-scores, which compare False, so groups
            # with constant values never produce outliers.
            is_outlier = (((values - mean) / std).abs() > threshold).to_numpy()
            idx = group_df.index[is_outlier]
            # For checking value
            print(f"[max] {values.max()}")
            print(f"[min] {values.min()}")
            print(f"[mean] {mean}")
            print(f"[std] {std}")
            print(f"[outlier index] {idx}")
            print(f"[removed num] {len(idx)}")
            outlier_mask |= is_outlier
            print()

        removed_idx = list(group_df.index[outlier_mask])
        print(removed_idx)
        total_removed_num += len(removed_idx)
        kept_groups.append(group_df.loc[~outlier_mask])

    if not kept_groups:
        # Nothing to group (e.g. empty input): return an empty frame that
        # still carries the input's columns.
        return df.iloc[0:0].copy(), 0

    # Concatenate once: avoids re-copying the accumulated frame on every
    # iteration and avoids concatenating onto an empty DataFrame.
    return pd.concat(kept_groups), total_removed_num


def main(input_path="test_dataset/after_handling_nan.csv",
         output_path="test_dataset/remove_outlier_final.csv"):
    """Read the dataset, remove per-group outliers and write the result."""
    # Read dataset
    df = pd.read_csv(input_path, encoding='cp949')
    # Remove unused columns (the first three columns of the file)
    df = df.drop(df.columns[:3], axis=1)

    final_df, total_removed_num = remove_outliers(df, time_columns)

    # Save csv file
    final_df.to_csv(output_path, encoding='cp949')
    # Print how many outliers removed
    print(f"[total removed num] {total_removed_num}개 삭제됨")


if __name__ == "__main__":
    main()