-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_analysis.py
More file actions
61 lines (44 loc) · 2.1 KB
/
data_analysis.py
File metadata and controls
61 lines (44 loc) · 2.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import pandas as pd
import numpy as np
def analysis_data(path):
    """Print an exploratory summary of the train and test CSV files.

    Reads ``data/train.csv`` and ``data/test.csv`` located under *path* and
    prints, in order: the first rows, the per-(listing_id, repay_date) sum of
    ``due_amt``, column names, shapes, ``describe()`` summaries, unique
    ``user_id`` / ``listing_id`` counts, the min/max of ``auditing_date`` and
    ``due_date``, the number of ``user_id`` values shared by both files, and
    the distributions of two derived train columns:

    * ``label`` -- days between ``due_date`` and ``repay_date``; rows whose
      ``repay_date`` is the literal ``\\N`` keep the default ``-1``.
    * ``mm`` -- ``due_amt - repay_amt`` for rows that have a repay date,
      ``0`` otherwise.

    Args:
        path: Directory containing the ``data`` folder. A trailing path
            separator is optional.

    Returns:
        None. Everything is written to stdout.
    """
    import os  # local import so the block does not depend on file-level imports

    print('-------analysis train and test--------')
    # os.path.join tolerates a `path` with or without a trailing separator.
    train = pd.read_csv(os.path.join(path, "data", "train.csv"))
    test = pd.read_csv(os.path.join(path, "data", "test.csv"))
    print(train.head())
    print(test.head())

    # Sum of due_amt per (listing_id, repay_date), exposed as 'repay_amt'.
    # Dict-based renaming inside SeriesGroupBy.agg was removed from pandas,
    # so aggregate first and rename afterwards.
    a = (
        train.groupby(['listing_id', 'repay_date'], as_index=False)['due_amt']
        .sum()
        .rename(columns={'due_amt': 'repay_amt'})
    )
    print(a.shape)
    print(a.head())

    print("train columns: \n", list(train.columns))
    print("test columns: \n", list(test.columns))
    print("train shape: ", train.shape)
    print("test shape: ", test.shape)
    print("train summary: \n", train.describe())
    print("test summary: \n", test.describe())
    print("train user_id nunique: ", train['user_id'].nunique())
    print("test user_id nunique: ", test['user_id'].nunique())
    print("train listing_id nunique: ", train['listing_id'].nunique())
    print("test listing_id nunique: ", test['listing_id'].nunique())
    print("train auditing_date: ", train['auditing_date'].min(), train['auditing_date'].max())
    print("test auditing_date: ", test['auditing_date'].min(), test['auditing_date'].max())
    print("train due_date: ", train['due_date'].min(), train['due_date'].max())
    print("test due_date: ", test['due_date'].min(), test['due_date'].max())
    # Number of users that appear in both files.
    print(len(set(train.user_id) & set(test.user_id)))

    train['label'] = -1
    train['due_date'] = pd.to_datetime(train['due_date'])
    # Rows that carry a real repay date ('\N' marks the ones that do not).
    has_repay = train['repay_date'] != '\\N'
    # `.ix` no longer exists in pandas; `.loc` with a boolean mask selects the
    # same rows.
    days_early = (
        train.loc[has_repay, 'due_date']
        - pd.to_datetime(train.loc[has_repay, 'repay_date'])
    ).dt.days
    train.loc[has_repay, 'label'] = days_early

    # Float from the start: the column receives float differences below, and
    # assigning floats into an int column is rejected by newer pandas.
    train['mm'] = 0.0
    train.loc[has_repay, 'mm'] = (
        train.loc[has_repay, 'due_amt'].astype(float)
        - train.loc[has_repay, 'repay_amt'].astype(float)
    )

    # Distribution of label values, with each value's share of all train rows.
    a = (
        train.groupby(['label'], as_index=False)['user_id']
        .count()
        .rename(columns={'user_id': 'count'})
    )
    a['r'] = a['count'] / len(train)
    print(a)

    # Distribution of the due/repaid amount difference.
    a = (
        train.groupby(['mm'], as_index=False)['user_id']
        .count()
        .rename(columns={'user_id': 'count'})
    )
    print(a)
def get_label(due_date, repay_date):
    """Return how many whole days before the due date a repayment was made.

    Args:
        due_date: Due date, as a ``pd.Timestamp`` or anything
            ``pd.to_datetime`` can parse (e.g. ``"2018-02-05"``).
        repay_date: Repayment date in the same formats, or the literal
            string ``\\N`` / a missing value (``None``, NaN, NaT) when there
            is no repayment date.

    Returns:
        int: ``-1`` when there is no repayment date, otherwise
        ``(due_date - repay_date)`` in days (negative when the repayment
        came after the due date).

    NOTE(review): a repayment exactly one day late also yields ``-1`` and is
    therefore indistinguishable from "no repayment date"; confirm that this
    is acceptable for the consumers of the label.
    """
    # '\N' is the missing-value marker compared against in the raw data;
    # None/NaN/NaT are treated the same way instead of propagating NaT.
    if repay_date == '\\N' or pd.isna(repay_date):
        return -1
    # Return a plain day count so both branches yield the same type.
    return (pd.to_datetime(due_date) - pd.to_datetime(repay_date)).days