-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdataset_maker.py
More file actions
38 lines (33 loc) · 1.08 KB
/
dataset_maker.py
File metadata and controls
38 lines (33 loc) · 1.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import random
import subprocess
# Build a reduced, paired dataset from two CSV files:
#   <clean>.csv  and  <raw>.csv
# The rows that differ between the two files are kept, together with a random
# sample of rows that do not differ, and the header row.  The SAME line
# positions are extracted from both files and written to <clean>_new.csv and
# <raw>_new.csv.
# NOTE(review): using one set of positions for both files presumes the two
# CSVs are line-aligned (row i of one corresponds to row i of the other) --
# confirm against the data.

clean = "ac"  # base name (without ".csv") of the clean file
raw = "ar"  # base name (without ".csv") of the raw file
random_samples = 3000  # how many non-differing rows to keep


def parse_diff_line_numbers(diff_output):
    """Turn diff's ``"N,N,...,"`` output into a set of 0-based line indices.

    diff numbers lines starting at 1, while the rest of this script addresses
    lines by their 0-based ``enumerate`` position, hence the ``- 1``.
    Empty tokens (e.g. when diff printed nothing) are ignored.

    Raises:
        ValueError: if a token is not an integer.
    """
    return {int(tok) - 1 for tok in diff_output.split(",") if tok.strip()}


def changed_line_indices(clean_path, raw_path):
    """Return the 0-based indices of lines of *raw_path* that diff reports as new.

    Runs the external ``diff`` program with line formats that print nothing
    except the line number of every new/changed line, comma-terminated.

    Raises:
        RuntimeError: if diff exits with a status other than 0 or 1.
        OSError: if the ``diff`` executable cannot be started.
    """
    # An argument list (no shell) keeps file names with spaces or shell
    # metacharacters from being re-interpreted.
    proc = subprocess.Popen(
        [
            "diff",
            "--unchanged-line-format=",
            "--old-line-format=",
            "--new-line-format=%dn,",
            "--",
            clean_path,
            raw_path,
        ],
        stdout=subprocess.PIPE,
        # Keep stderr separate so error text never ends up parsed as line numbers.
        stderr=subprocess.PIPE,
    )
    out, err = proc.communicate()
    # diff exits 0 when the files are identical and 1 when they differ;
    # any other status signals trouble (missing file, bad option, ...).
    if proc.returncode not in (0, 1):
        raise RuntimeError(
            "diff failed with exit status %d: %s"
            % (proc.returncode, err.decode("utf-8", "replace").strip())
        )
    return parse_diff_line_numbers(out.decode("utf-8"))


def choose_line_indices(changed, num_lines, sample_size):
    """Return the set of 0-based line indices to extract.

    The result contains index 0 (the header line), every index in *changed*,
    and up to *sample_size* randomly chosen indices from ``1 .. num_lines - 1``
    that are not in *changed*.  When fewer than *sample_size* such lines
    exist, all of them are taken instead of raising.
    """
    changed = set(changed)
    unchanged = [k for k in range(1, num_lines) if k not in changed]
    sampled = random.sample(unchanged, min(sample_size, len(unchanged)))
    return {0} | changed | set(sampled)


def copy_selected_lines(src_path, dst_path, selected):
    """Write the lines of *src_path* whose 0-based index is in *selected* to *dst_path*.

    Lines are written in their original file order.
    """
    with open(src_path) as src, open(dst_path, "w") as dst:
        dst.writelines(
            line for pos, line in enumerate(src) if pos in selected
        )


def main():
    """Create ``<clean>_new.csv`` and ``<raw>_new.csv`` from the configured inputs."""
    clean_path = clean + ".csv"
    raw_path = raw + ".csv"

    changed = changed_line_indices(clean_path, raw_path)
    with open(clean_path) as handle:
        num_lines = sum(1 for _ in handle)

    selected = choose_line_indices(changed, num_lines, random_samples)

    # The output order is the input file order, so no shuffling is needed here.
    copy_selected_lines(clean_path, clean + "_new.csv", selected)
    copy_selected_lines(raw_path, raw + "_new.csv", selected)


if __name__ == "__main__":
    main()