forked from aldolipani/TABME
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsample_folders.py
More file actions
executable file
·55 lines (45 loc) · 1.22 KB
/
sample_folders.py
File metadata and controls
executable file
·55 lines (45 loc) · 1.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python
import random
from collections import defaultdict
from copy import deepcopy
from pathlib import Path
import click
import numpy as np
# Directory whose direct children are sampled: the `data` folder located one
# level above the directory that contains this script.
DATA_PATH = Path(__file__).parents[1].joinpath('data/')
@click.command()
@click.option(
    '--lambda_val',
    '-l',
    type=int,
    default=11,
    # NOTE: a default is supplied, so omitting this option falls back to 11
    # rather than triggering a "missing option" error.
    required=True,
    help='lambda value for Poisson sampling distribution',
)
@click.option(
    '--num_samples',
    '-n',
    type=int,
    required=True,
    help='Number of total documents to sample when building bundles',
)
@click.argument('docs')
def cli(docs, num_samples, lambda_val):
    """Build random bundles of the entries under DATA_PATH and print them.

    \f
    The text after the form feed above is kept out of the ``--help`` output.

    Steps:

    1. Collect the name of every direct child of ``DATA_PATH``.
    2. Put each name into one bundle whose id is drawn uniformly from
       ``1..num_samples``, so every entry appears at least once.
    3. For each bundle id ``1..num_samples`` draw a Poisson(``lambda_val``)
       count and append that many names, chosen with replacement, to the
       bundle.
    4. Print one ``<name> <bundle number>`` line per bundle member. Bundles
       are numbered consecutively from 1 in the order they were created.

    Args:
        docs: Positional CLI argument. It is accepted for interface
            compatibility but is not used by this command.
        num_samples: Number of bundle ids / Poisson draws; must be >= 1.
        lambda_val: Mean of the Poisson distribution used in step 3.

    Raises:
        click.BadParameter: If ``num_samples`` is smaller than 1.
        click.ClickException: If ``DATA_PATH`` has no entries to sample.
    """
    if num_samples < 1:
        # random.randint(1, num_samples) and rng.poisson(size=num_samples)
        # would otherwise fail with an unhelpful low-level error.
        raise click.BadParameter('must be at least 1', param_hint='--num_samples')

    # Path.name is the last path component regardless of the OS separator.
    paths = [path.name for path in DATA_PATH.glob('*')]
    if not paths:
        # rng.choice cannot draw from an empty population.
        raise click.ClickException('no entries found in {}'.format(DATA_PATH))

    bundles = defaultdict(list)

    # Step 2: one shuffle plus independent uniform bundle ids gives the same
    # distribution as reshuffling before every pop, without the O(n^2) cost.
    shuffled = list(paths)
    random.shuffle(shuffled)
    for name in shuffled:
        bundles[random.randint(1, num_samples)].append(name)

    # Step 3: convert the population to an array once instead of per draw.
    rng = np.random.default_rng()
    pool = np.asarray(paths)
    sizes = rng.poisson(lambda_val, num_samples)
    for bundle_id, size in enumerate(sizes, start=1):
        if size == 0:
            # Do not touch the defaultdict: that would create an empty bundle
            # and leave a gap in the printed numbering.
            continue
        bundles[bundle_id].extend(rng.choice(pool, size).tolist())

    # Step 4: renumber bundles 1..k in creation order while printing.
    for bundle_number, names in enumerate(bundles.values(), start=1):
        for name in names:
            print(name, bundle_number)
# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    cli()