-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbase.py
More file actions
39 lines (33 loc) · 1.35 KB
/
base.py
File metadata and controls
39 lines (33 loc) · 1.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import pandas as pd
import numpy as np
# Read sheet 'Лист1' of the source workbook, store time_index as text,
# derive the grouping key and shorten the name of the value column.
data = (
    pd.read_excel('source/task_sample.xlsx', sheet_name='Лист1')
    .astype({'time_index': str})
    .assign(
        # iter_str = first four characters of time_index followed by task_mark
        iter_str=lambda frame: (
            frame['time_index'].str[:4] + frame['task_mark'].astype(str)
        )
    )
    .rename(columns={'values': 'vals'})
)
# Distinct grouping keys in ascending order; the statistics loop walks them.
sort_list = sorted(data['iter_str'].unique())
# Lower outlier fence for every iter_str group, in the same order as sort_list.
occup = []
# Group once up front instead of re-filtering the whole frame for every key.
vals_by_key = data.groupby('iter_str')['vals']
for key in sort_list:
    vals = np.asarray(vals_by_key.get_group(key))
    q1 = np.percentile(vals, 25)
    q3 = np.percentile(vals, 75)
    # Symmetric distribution: Tukey lower fence Q1 - 1.5 * (Q3 - Q1).
    # NOTE(review): the integer cast truncates toward zero, while the skewed
    # branch below keeps a float - confirm that the truncation is intended.
    out_bottom = (q1 - (q3 - q1) * 1.5).astype(int)
    # No value lies at or below that fence (handled as positively skewed):
    # retry with the narrower spread Q3 - median, i.e. Q1 - 1.5 * (Q3 - Q2).
    if out_bottom <= np.min(vals):
        out_bottom = (q1 - (q3 - np.median(vals)) * 1.5).astype(float)
    occup.append(out_bottom)
# Two-column lookup table: column 0 holds the iter_str key, column 1 its fence.
# The columns are named explicitly so that column 0 exists even when there are
# no groups; without that the merge below raises KeyError on empty input.
data_distr = pd.DataFrame(list(zip(sort_list, occup)), columns=[0, 1])
# Attach each row's fence by matching iter_str against column 0. The original
# index is moved into an 'index' column before the merge and restored after
# it, because a column-on-column merge ignores the frames' indexes.
data = (
    data.reset_index()
    .merge(data_distr, left_on='iter_str', right_on=0)
    .set_index('index')
)