-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathshuffle_csv.py
More file actions
60 lines (50 loc) · 1.89 KB
/
shuffle_csv.py
File metadata and controls
60 lines (50 loc) · 1.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import json
import os
import datetime as dt
from tqdm import tqdm
import pandas as pd
import numpy as np
def f2cat(filename: str) -> str:
    """Return the category name encoded in a training CSV file name.

    Only the final extension is stripped, so a name that itself contains
    dots (``'a.b.csv'`` -> ``'a.b'``) still maps back to its file when
    ``'.csv'`` is appended again. Names without an extension are returned
    unchanged.
    """
    return os.path.splitext(filename)[0]
class Simplified:
    """Reader for the per-category CSV files in ``<input_path>/train_simplified``."""

    def __init__(self, input_path='/mnt/raid1/kaggle'):
        """Remember the directory that contains the ``train_simplified`` folder."""
        self.input_path = input_path

    def list_all_categories(self):
        """Return one category name per entry of ``train_simplified``.

        Names are derived from the file names with ``f2cat`` and sorted
        case-insensitively.
        """
        files = os.listdir(os.path.join(self.input_path, 'train_simplified'))
        return sorted([f2cat(f) for f in files], key=str.lower)

    def read_training_csv(self, category, nrows=None, usecols=None, drawing_transform=False):
        """Load ``<category>.csv`` from ``train_simplified`` into a DataFrame.

        Args:
            category: Category name; ``'.csv'`` is appended to build the file name.
            nrows: Forwarded to ``pandas.read_csv`` (``None`` reads everything).
            usecols: Forwarded to ``pandas.read_csv`` (``None`` keeps all columns).
            drawing_transform: When true, the ``drawing`` column is decoded
                with ``json.loads``.

        Returns:
            The loaded DataFrame. ``timestamp`` is parsed as a date whenever
            that column is part of the selection.
        """
        csv_path = os.path.join(self.input_path, 'train_simplified', category + '.csv')
        # Asking pandas to date-parse a column that ``usecols`` filtered out
        # raises, so only request it when the column is actually loaded.
        parse_dates = ['timestamp'] if self._selects_timestamp(usecols) else False
        df = pd.read_csv(csv_path, nrows=nrows, parse_dates=parse_dates, usecols=usecols)
        if drawing_transform:
            df['drawing'] = df['drawing'].apply(json.loads)
        return df

    @staticmethod
    def _selects_timestamp(usecols):
        """Tell whether a ``usecols`` selection keeps the ``timestamp`` column."""
        if usecols is None:
            return True
        if callable(usecols):
            return bool(usecols('timestamp'))
        labels = [col for col in usecols if isinstance(col, str)]
        if not labels:
            # Positional (or empty) selection: the column names are unknown
            # here, so keep the original behaviour of requesting the parse.
            return True
        return 'timestamp' in labels
# Script body: split the first 100000 rows of every category CSV into NCSVS
# buckets keyed on ``key_id``, then shuffle each bucket and gzip it.
start = dt.datetime.now()
s = Simplified('./input/')
NCSVS = 100
categories = s.list_all_categories()
print(len(categories))  # the source listing shows 340 as the recorded output

# Pass 1: append each category's rows to the per-bucket files train_k<k>.csv
# in the current directory. ``y`` is the category's index in the sorted list.
for y, cat in enumerate(tqdm(categories)):  # tqdm inside enumerate so the bar knows the total
    df = s.read_training_csv(cat, nrows=100000)
    df['y'] = y
    df['cv'] = (df.key_id // 10 ** 7) % NCSVS
    for k in range(NCSVS):
        filename = 'train_k{}.csv'.format(k)
        chunk = df[df.cv == k]
        chunk = chunk.drop(['key_id'], axis=1)
        if y == 0:
            # The first category (re)creates every bucket file with a header,
            # even when its chunk is empty; later categories only append.
            chunk.to_csv(filename, index=False)
        else:
            chunk.to_csv(filename, mode='a', header=False, index=False)

# Pass 2: shuffle every bucket, write it gzip-compressed under output_path
# and delete the uncompressed intermediate file.
output_path = './input/shuffle-csvs/'
os.makedirs(output_path, exist_ok=True)  # to_csv does not create missing directories
for k in tqdm(range(NCSVS)):
    filename = 'train_k{}.csv'.format(k)
    if os.path.exists(filename):
        df = pd.read_csv(filename)
        # Sorting by a fresh uniform random column yields a random row order.
        df['rnd'] = np.random.rand(len(df))
        df = df.sort_values(by='rnd').drop('rnd', axis=1)
        df.to_csv(os.path.join(output_path, filename + '.gz'), compression='gzip', index=False)
        os.remove(filename)
# NOTE(review): placed after the loop, so this prints the shape of the last
# frame that was loaded — confirm against the un-flattened original.
print(df.shape)

end = dt.datetime.now()
# total_seconds() keeps whole days; timedelta.seconds would silently drop them.
print('Latest run {}.\nTotal time {}s'.format(end, int((end - start).total_seconds())))