-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsample.py
More file actions
executable file
·75 lines (64 loc) · 2.2 KB
/
sample.py
File metadata and controls
executable file
·75 lines (64 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python3
# Given a file of line-segmented records, sample n of them.
# Implements reservoir sampling, which is efficient in memory and runtime (one pass through the data, O(sample size) memory).
# alg implementation borrowed from http://stackoverflow.com/questions/2612648/reservoir-sampling
import argparse
import sys
import os
import random
import codecs
if sys.version_info[0] == 2:
from itertools import izip
else:
izip = zip
from collections import defaultdict as dd
import re
import os.path
import gzip
# Absolute path of the directory that contains this script.
scriptdir = os.path.dirname(os.path.abspath(__file__))
# UTF-8 stream reader/writer factories; prepfile() uses them to wrap streams
# when running under Python 2.
reader = codecs.getreader('utf8')
writer = codecs.getwriter('utf8')
def prepfile(fh, code):
    """Return a text stream for *fh*, handling gzip files and Python 2 text I/O.

    Args:
        fh: An already-open file object with a string ``name`` attribute (for
            example one created by ``argparse.FileType``).
        code: The mode the stream will be used in, such as ``'r'`` or ``'w'``.
            Under Python 2 it must start with ``'r'`` or ``'w'``.

    Returns:
        *fh* itself when its name does not end in ``.gz``; otherwise a gzip
        stream opened on the same path in text mode, in which case *fh* is
        closed because it is no longer needed. Under Python 2 the result is
        additionally wrapped in the module-level UTF-8 ``reader``/``writer``.

    Exits the process with status 1 under Python 2 if *code* starts with
    neither ``'r'`` nor ``'w'``.
    """
    if fh.name.endswith(".gz"):
        # gzip.open defaults to binary; force text mode so callers get str lines.
        mode = code if code.endswith("t") else code + "t"
        ret = gzip.open(fh.name, mode)
        # The plain handle on the compressed file is superseded; do not leak it.
        fh.close()
    else:
        ret = fh
    if sys.version_info[0] == 2:
        # Wrap the stream actually selected above (ret), not the raw handle,
        # so that gzip input/output is still decompressed/compressed.
        if code.startswith('r'):
            ret = reader(ret)
        elif code.startswith('w'):
            ret = writer(ret)
        else:
            sys.stderr.write("I didn't understand code "+code+"\n")
            sys.exit(1)
    return ret
def _reservoir_sample(items, k, discard):
    """Return up to *k* items drawn uniformly at random from *items*.

    Makes a single pass over *items* and keeps at most *k* of them in memory.
    Each item that does not end up in the returned list is passed to
    *discard* once: either immediately, or later when it is evicted from the
    reservoir by a newer item.
    """
    reservoir = []
    for seen, item in enumerate(items, start=1):
        if len(reservoir) < k:
            # Still filling the reservoir: keep everything.
            reservoir.append(item)
            continue
        # Pick a slot uniformly in [0, seen); the new item is kept only if
        # the slot falls inside the reservoir.
        slot = random.randrange(seen)
        if slot < k:
            discard(reservoir[slot])
            reservoir[slot] = item
        else:
            discard(item)
    return reservoir


def _finish_stream(stream, opened):
    """Close *stream* and the handle it came from; only flush standard streams.

    *opened* is the handle argparse created and *stream* is what ``prepfile``
    returned for it; they may be the same object.
    """
    if opened is sys.stdin:
        return
    if opened is sys.stdout or opened is sys.stderr:
        stream.flush()
        return
    stream.close()
    # No-op when prepfile returned the same object or already closed it.
    opened.close()


def main():
    """Sample ``--size`` lines from ``--infile`` using reservoir sampling.

    The sampled lines are written to ``--outfile``; every line that was not
    sampled is written to ``--remainderfile`` (``os.devnull`` by default).
    A warning is printed to stderr when the input has fewer lines than
    requested. Exits through ``parser.error`` on a negative ``--size``.
    """
    parser = argparse.ArgumentParser(description="sample k records from file")
    parser.add_argument("--infile", "-i", type=argparse.FileType('r'), default=sys.stdin, help="input file")
    parser.add_argument("--outfile", "-o", type=argparse.FileType('w'), default=sys.stdout, help="output file")
    parser.add_argument("--remainderfile", "-r", type=argparse.FileType('w'), default=os.devnull, help="remainder (lines not sampled) file")
    parser.add_argument("--size", "-s", type=int, default=100, help="number of samples")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    if args.size < 0:
        # A negative sample size is meaningless; fail loudly instead of
        # silently routing the whole input to the remainder file.
        parser.error("--size must be non-negative, got %d" % args.size)

    infile = prepfile(args.infile, 'r')
    outfile = prepfile(args.outfile, 'w')
    remainderfile = prepfile(args.remainderfile, 'w')

    try:
        result = _reservoir_sample(infile, args.size, remainderfile.write)
        if len(result) < args.size:
            sys.stderr.write("Warning: only %d items in input; you requested %d\n" % (len(result), args.size))
        outfile.writelines(result)
    finally:
        # Close explicitly so buffered (and gzip-compressed) output is fully
        # written instead of relying on interpreter shutdown.
        _finish_stream(outfile, args.outfile)
        _finish_stream(remainderfile, args.remainderfile)
        _finish_stream(infile, args.infile)
# Run the sampler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()