# Source: train_delay_pred/data_helper.py at main · jihunkeom/train_delay_pred (GitHub)
import pandas as pd
import numpy as np

# Station names in a fixed order (53 entries). A station's position in this
# list is the integer index returned by getStationNum() and accepted by
# getStationName(); the 'station' one-hot vector built by oneHot() has the
# same length.
# NOTE(review): some names look abbreviated or prefixed (e.g. '효창공',
# '1양정') -- presumably they match the spelling used in the data table; confirm.
station_list = ['지평', '용문', '원덕', '양평', '오빈', '아신', '국수', '신원',
       '양수', '운길산', '팔당', '도심', '덕소', '1양정', '도농', '구리', '1양원', '망우', '상봉',
       '중랑', '회기', '청량리', '왕십리', '응봉', '옥수', '한남', '서빙고', '이촌', '용산', '효창공',
       '공덕', '서강대', '홍대입', '가좌', '디엠시', '수색', '화전', '강매', '행신', '능곡', '대곡',
       '곡산', '백마', '풍산', '일산', '탄현', '야당', '운정', '금릉', '금촌', '월롱', '파주', '문산']
# All known train identifiers (189 entries). A train's position in this list
# is the index returned by getTrainNum(name) when no direction is given, and
# the 'train' one-hot vector built by oneHot() has the same length.
# The numbering is not contiguous (e.g. there is no 'K5169').
train_list = ['K5001', 'K5002', 'K5003', 'K5004', 'K5005', 'K5006', 'K5007', 'K5008', 'K5009', 'K5010', 'K5011', 'K5012'
    ,'K5013', 'K5014', 'K5015', 'K5016', 'K5017', 'K5018', 'K5019', 'K5020', 'K5021', 'K5022', 'K5023', 'K5024', 'K5025', 'K5026'
    , 'K5027', 'K5028','K5029', 'K5030','K5031', 'K5032', 'K5033', 'K5034', 'K5035', 'K5036', 'K5037', 'K5038', 'K5039', 'K5040', 'K5041'
    , 'K5042', 'K5043', 'K5044', 'K5045', 'K5046', 'K5047', 'K5048', 'K5049', 'K5050', 'K5051', 'K5052', 'K5053', 'K5054'
    , 'K5055', 'K5056', 'K5057', 'K5058', 'K5059', 'K5060', 'K5061', 'K5062', 'K5063', 'K5064', 'K5065', 'K5066', 'K5067'
    , 'K5068', 'K5069', 'K5070', 'K5071', 'K5072', 'K5073', 'K5074', 'K5075', 'K5076', 'K5077', 'K5078', 'K5079', 'K5080'
    , 'K5081', 'K5082', 'K5083', 'K5084', 'K5085', 'K5086', 'K5087', 'K5088', 'K5089', 'K5090', 'K5091', 'K5092', 'K5093'
    , 'K5094', 'K5095', 'K5096', 'K5097', 'K5098', 'K5099', 'K5100', 'K5101', 'K5102', 'K5103', 'K5104', 'K5105', 'K5106'
    , 'K5107', 'K5108', 'K5109', 'K5110', 'K5111', 'K5112', 'K5113', 'K5114', 'K5115', 'K5116', 'K5117', 'K5118', 'K5119'
    , 'K5120', 'K5121','K5122', 'K5123', 'K5124', 'K5125', 'K5126', 'K5127', 'K5128', 'K5129', 'K5130','K5131', 'K5132', 'K5133', 'K5134'
    , 'K5135', 'K5136', 'K5137', 'K5138', 'K5139', 'K5140', 'K5141', 'K5142', 'K5143', 'K5144', 'K5145', 'K5146', 'K5147'
    , 'K5148', 'K5149', 'K5150', 'K5151', 'K5152', 'K5153', 'K5154', 'K5155', 'K5156', 'K5157', 'K5158', 'K5159', 'K5160'
    , 'K5161', 'K5162', 'K5163', 'K5164', 'K5165', 'K5166', 'K5167', 'K5168', 'K5170', 'K5171', 'K5231', 'K5246', 'K5272'
    , 'K5301', 'K5302', 'K5303', 'K5304', 'K5701', 'K5702', 'K5703', 'K5704', 'K5705', 'K5706', 'K5707', 'K5708', 'K5851'
    , 'K5852', 'K5853', 'K5854']

# Train identifiers looked up by getTrainNum(name, 0) (95 entries). The
# position in this list is the index fed to the 'train_ud' one-hot vector,
# whose length (95) equals the size of this list.
up_train = ['K5002', 'K5004', 'K5006', 'K5008', 'K5010', 'K5012', 'K5014', 'K5016', 'K5018', 'K5020', 'K5022', 'K5024', 'K5026', 'K5028', 'K5030', 'K5032', 'K5034', 'K5036', 'K5038', 'K5040', 'K5042', 'K5044', 'K5046', 'K5048', 'K5050', 'K5052', 'K5054', 'K5056', 'K5058', 'K5060', 'K5062', 'K5064', 'K5066', 'K5068', 'K5070', 'K5072', 'K5074', 'K5076', 'K5078', 'K5080', 'K5082', 'K5084', 'K5086', 'K5088', 'K5090', 'K5092', 'K5094', 'K5096', 'K5098', 'K5100', 'K5102', 'K5104', 'K5106', 'K5108', 'K5110', 'K5112', 'K5114', 'K5116', 'K5118', 'K5120', 'K5122', 'K5124', 'K5126', 'K5128', 'K5130', 'K5132', 'K5134', 'K5136', 'K5138', 'K5140', 'K5142', 'K5144', 'K5146', 'K5148', 'K5150', 'K5152', 'K5154', 'K5156', 'K5158', 'K5160', 'K5162', 'K5164', 'K5166', 'K5168', 'K5170', 'K5246', 'K5272', 'K5302', 'K5304', 'K5702', 'K5704', 'K5706', 'K5708', 'K5852', 'K5854']
# Train identifiers looked up by getTrainNum(name, 1) (94 entries), indexed
# the same way; these share the 95-wide 'train_ud' one-hot vector, so the
# last slot is never set for them.
down_train = ['K5001', 'K5003', 'K5005', 'K5007', 'K5009', 'K5011', 'K5013', 'K5015', 'K5017', 'K5019', 'K5021', 'K5023', 'K5025', 'K5027', 'K5029', 'K5031', 'K5033', 'K5035', 'K5037', 'K5039', 'K5041', 'K5043', 'K5045', 'K5047', 'K5049', 'K5051', 'K5053', 'K5055', 'K5057', 'K5059', 'K5061', 'K5063', 'K5065', 'K5067', 'K5069', 'K5071', 'K5073', 'K5075', 'K5077', 'K5079', 'K5081', 'K5083', 'K5085', 'K5087', 'K5089', 'K5091', 'K5093', 'K5095', 'K5097', 'K5099', 'K5101', 'K5103', 'K5105', 'K5107', 'K5109', 'K5111', 'K5113', 'K5115', 'K5117', 'K5119', 'K5121', 'K5123', 'K5125', 'K5127', 'K5129', 'K5131', 'K5133', 'K5135', 'K5137', 'K5139', 'K5141', 'K5143', 'K5145', 'K5147', 'K5149', 'K5151', 'K5153', 'K5155', 'K5157', 'K5159', 'K5161', 'K5163', 'K5165', 'K5167', 'K5171', 'K5231', 'K5301', 'K5303', 'K5701', 'K5703', 'K5705', 'K5707', 'K5851', 'K5853']
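# Each data row holds, in order: train number, day of week, holiday flag,
# origin station, terminal station, up/down direction, and the per-station delay values.

# X is the input: every feature one-hot encoded and concatenated, in order:
# train, day of week, holiday, origin station, terminal station, up/down direction, stop station.
# Y is the delay information.
# For RNN training the inputs are grouped per train run: each sample is 2-D,
# one feature vector of length 357 for each of the 53 stations.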

# DATA EXAMPLE
# A single x vector: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..... 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
# X input fed to the RNN placeholder: [[0,1,0,0,0,0,0...],[0,0,1,0,0,0,0,0,...],[0,0,0,1,0,0,0....],...]
# A single y value: 13 (seconds)
# Y input fed to the RNN placeholder: [[14],[2],[53],[3],[1],[0],...] (factorize wraps each delay in its own list)


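# path is the path to the train table. count defaults to 'all', which loads every row;
# when set to a number, only the first `count` rows are used -- handy for a quick look at the data.
# Loading everything (the default) can make training slow, so it is recommended to start
# with a small count and increase it gradually. The full data set has roughly 37,000 rows.

# ver = 0 (default): full feature set; each per-station X vector has length 357.
# ver = 1: origin/terminal station features removed; each per-station X vector has length 251.
# ver = 2: origin/terminal station features removed, and the train is encoded only by its
#          position within its up/down list; each per-station X vector has length 157.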
def readTraindata(path,ver=0,count='all'):
    """Load the train table and build RNN inputs for the requested feature version.

    Args:
        path: Location of the train table CSV, passed through to the
            version-specific reader.
        ver: Feature-set version. 0 selects readTraindata0, 1 selects
            readTraindata1 and 2 selects readTraindata2.
        count: 'all' to use every row, otherwise the number of leading rows
            to use; passed through unchanged.

    Returns:
        Whatever the selected reader returns (an ``(x, y)`` pair).

    Raises:
        ValueError: If *ver* is not 0, 1 or 2. Previously this case printed
            'input error' and returned None, which only failed later when
            the caller tried to unpack the result.
    """
    if ver == 0:
        return readTraindata0(path, count)
    if ver == 1:
        return readTraindata1(path, count)
    if ver == 2:
        return readTraindata2(path, count)
    raise ValueError('input error: ver must be 0, 1 or 2, got {!r}'.format(ver))


def _read_train_table(path, count='all'):
    """Read the train table at *path* and return its rows as an array.

    The 'Unnamed: 0' column is dropped. When *count* is not 'all', only the
    rows selected by ``[:count]`` are kept.
    """
    df = pd.read_csv(path).drop(['Unnamed: 0'], axis=1)
    if count != 'all':
        df = df[:count]
    return df.values


def _build_sequences(rows, make_prefix):
    """Turn table rows into per-train input/target sequences.

    Args:
        rows: Iterable of rows where ``row[5]`` is the direction flag (0 or 1)
            and ``row[6:]`` are the per-station delay values.
        make_prefix: Callable mapping a row to the list of features shared by
            every station step of that row.

    Returns:
        ``(x, y)``: for each row, ``x`` holds one feature vector per delay
        column (prefix + station one-hot) and ``y`` holds the delays, each
        wrapped in a one-element list. For direction 1 the station one-hot is
        inverted and the delays are reversed.

    Raises:
        ValueError: If a row's direction flag is neither 0 nor 1. Such rows
            used to add an empty entry to ``x`` and nothing to ``y``, leaving
            the two lists silently misaligned.
    """
    x = []
    y = []
    for item in rows:
        direction = item[5]
        # Validate first so a bad row fails loudly before any features are built.
        if direction not in (0, 1):
            raise ValueError(
                'direction flag (column 5) must be 0 or 1, got {!r}'.format(direction))
        prefix = make_prefix(item)
        delays = list(item[6:])
        reverse = bool(direction == 1)
        x.append([prefix + oneHot('station', i, inverse=reverse)
                  for i in range(len(delays))])
        if reverse:
            delays.reverse()
        y.append(factorize(delays))
    return x, y


def readTraindata0(path,count='all'):
    """Build sequences with the full feature set (version 0).

    Per-step features, in order: train one-hot, weekday one-hot, holiday
    flag, start-station one-hot, end-station one-hot, direction flag, and the
    one-hot of the current station.

    Args:
        path: Location of the train table CSV.
        count: 'all' for every row, otherwise the number of leading rows.

    Returns:
        ``(x, y)`` lists as produced by ``_build_sequences``.

    Raises:
        ValueError: If a row's direction flag is neither 0 nor 1.
    """
    def prefix(item):
        return (oneHot('train', getTrainNum(item[0]))
                + oneHot('weekdays', item[1])
                + [item[2]]
                + oneHot('station', getStationNum(item[3]))
                + oneHot('station', getStationNum(item[4]))
                + [item[5]])

    return _build_sequences(_read_train_table(path, count), prefix)


def readTraindata1(path,count='all'):
    """Build sequences without start/end station features (version 1).

    Per-step features, in order: train one-hot, weekday one-hot, holiday
    flag, direction flag, and the one-hot of the current station.

    Args:
        path: Location of the train table CSV.
        count: 'all' for every row, otherwise the number of leading rows.

    Returns:
        ``(x, y)`` lists as produced by ``_build_sequences``.

    Raises:
        ValueError: If a row's direction flag is neither 0 nor 1.
    """
    def prefix(item):
        return (oneHot('train', getTrainNum(item[0]))
                + oneHot('weekdays', item[1])
                + [item[2]]
                + [item[5]])

    return _build_sequences(_read_train_table(path, count), prefix)


def readTraindata2(path,count='all'):
    """Build sequences with a direction-specific train encoding (version 2).

    Like version 1, but the train is encoded with the 'train_ud' one-hot
    using its index from getTrainNum(name, 0) when the direction flag is 0
    and getTrainNum(name, 1) when it is 1.

    Args:
        path: Location of the train table CSV.
        count: 'all' for every row, otherwise the number of leading rows.

    Returns:
        ``(x, y)`` lists as produced by ``_build_sequences``.

    Raises:
        ValueError: If a row's direction flag is neither 0 nor 1.
    """
    def prefix(item):
        ud = 0 if item[5] == 0 else 1
        return (oneHot('train_ud', getTrainNum(item[0], ud))
                + oneHot('weekdays', item[1])
                + [item[2]]
                + [item[5]])

    return _build_sequences(_read_train_table(path, count), prefix)

def read2(path,count='all'):
    """Read a two-step table and return feature/target arrays.

    Each row yields two feature vectors that share the same prefix (train,
    weekday, holiday flag, start station, end station, direction flag) and
    differ only in the trailing station one-hot, taken from columns 6 and 7.
    The matching targets come from columns 8 and 9.

    Args:
        path: Location of the CSV to read.
        count: 'all' for every row, otherwise the number of leading rows.

    Returns:
        ``(x, y)`` as numpy arrays built from the per-row lists.
    """
    table = pd.read_csv(path)
    if count != 'all':
        table = table[:count]

    features = []
    targets = []
    for row in table.values:
        shared = (
            oneHot('train', getTrainNum(row[0]))
            + oneHot('weekdays', row[1])
            + [row[2]]
            + oneHot("station", getStationNum(row[3]))
            + oneHot('station', getStationNum(row[4]))
            + [row[5]]
        )
        first_step = shared + oneHot('station', row[6])
        second_step = shared + oneHot('station', row[7])
        features.append([first_step, second_step])
        targets.append([[row[8]], [row[9]]])
    return np.array(features), np.array(targets)


def factorize(y):
    """Wrap every element of *y* in its own one-element list."""
    wrapped = []
    for value in y:
        wrapped.append([value])
    return wrapped


# Length of the one-hot vector for each supported category. The sizes match
# the module's lookup lists: 189 trains, 53 stations, 95 entries in the
# larger of the two direction-specific train lists, and 7 weekdays.
_ONE_HOT_SIZES = {
    'train': 189,
    'station': 53,
    'weekdays': 7,
    'train_ud': 95,
}


def oneHot(type,val,inverse=False):
    """Return a one-hot list for *val* within the given category.

    Args:
        type: Category name: 'train', 'station', 'weekdays' or 'train_ud'.
            (The parameter name shadows the builtin but is kept so existing
            keyword callers continue to work.)
        val: Zero-based index to mark; must satisfy ``0 <= val < size``.
        inverse: When true, the mirrored position ``size - val - 1`` is
            marked instead of ``val``.

    Returns:
        A list of ``size`` integers, all 0 except a single 1.

    Raises:
        ValueError: If *type* is not a known category (previously this
            surfaced as an UnboundLocalError).
        IndexError: If *val* is outside ``0 <= val < size``. Negative values,
            and ``val == size`` with ``inverse=True``, used to wrap around via
            negative list indexing and silently mark the wrong slot.
    """
    size = _ONE_HOT_SIZES.get(type)
    if size is None:
        raise ValueError('unknown one-hot type: {!r}'.format(type))
    if not 0 <= val < size:
        raise IndexError(
            'one-hot index {!r} out of range for {!r} (size {})'.format(val, type, size))
    vector = [0] * size
    vector[size - val - 1 if inverse else val] = 1
    return vector


def getStationNum(name):
    """Return the position of station *name* in ``station_list``.

    Raises ValueError (from ``list.index``) when the name is not listed.
    """
    position = station_list.index(name)
    return position

def getTrainNum(name,ud=None):
    """Return the position of train *name* in the list selected by *ud*.

    ``ud == 0`` searches ``up_train``, ``ud == 1`` searches ``down_train`` and
    any other value (including the default None) searches ``train_list``.
    Raises ValueError (from ``list.index``) when the name is not in that list.
    """
    if ud == 0:
        candidates = up_train
    elif ud == 1:
        candidates = down_train
    else:
        candidates = train_list
    return candidates.index(name)

def getStationName(num):
    """Return the station name stored at index *num* of ``station_list``."""
    station = station_list[num]
    return station

def getTrainName(num):
    """Return the train identifier stored at index *num* of ``train_list``."""
    train = train_list[num]
    return train