-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsampling.py
More file actions
executable file
·135 lines (109 loc) · 4.97 KB
/
sampling.py
File metadata and controls
executable file
·135 lines (109 loc) · 4.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 25 16:21:42 2020
@author: hsjomaa
"""
import tensorflow as tf
import pandas as pd
import random
import numpy as np
# Seed every RNG used in this module (numpy, stdlib random, tensorflow) at
# import time so batch/dataset sampling below is reproducible across runs.
np.random.seed(318)
random.seed(3718)
tf.random.set_seed(0)
class Batch(object):
    """Accumulates sampled instance tuples and assembles them into model input.

    Each appended instance is a 4-tuple
    ``(x, instances, features, classes)`` whose elements are concatenated
    along axis 0 by :meth:`collect` to form the network input.
    """
    def __init__(self, batch_size, fixed_shape=True):
        # Number of instances a complete batch must hold.
        self.batch_size = batch_size
        # When True, collect() refuses to build a partially filled batch.
        self.fixed_shape = fixed_shape
        self.clear()

    def clear(self):
        """Reset all accumulators to an empty state."""
        # flattened triplets
        self.x = []
        # number of instances per item in triplets
        self.instances = []
        # number of features per item in triplets
        self.features = []
        # number of classes per item in triplets
        self.classes = []
        # model input
        self.input = None
        # model target (set by collect); initialized here so the attribute
        # always exists even before the first collect() call
        self.output = None

    def append(self, instance):
        """Add one instance tuple; a full batch is discarded and restarted first."""
        if len(self.x) == self.batch_size:
            # Batch already full: start a fresh one rather than overflowing.
            self.clear()
        self.x.append(instance[0])
        self.instances.append(instance[1])
        self.features.append(instance[2])
        self.classes.append(instance[3])

    def collect(self):
        """Concatenate accumulated pieces into (input, output) tensors.

        Raises:
            ValueError: if ``fixed_shape`` is True and the batch is not full.
        """
        if len(self.x) != self.batch_size and self.fixed_shape:
            # BUG FIX: original code raised a plain string, which itself raises
            # "TypeError: exceptions must derive from BaseException" in Py3.
            raise ValueError(f'Batch formation incomplete!\n{len(self.x)}!={self.batch_size}')
        self.input = (tf.concat(self.x, axis=0),
                      tf.cast(tf.transpose(tf.concat(self.classes, axis=0)), dtype=tf.int32),
                      tf.cast(tf.transpose(tf.concat(self.features, axis=0)), dtype=tf.int32),
                      tf.cast(tf.transpose(tf.concat(self.instances, axis=0)), dtype=tf.int32),
                      )
        # First batch_size entries are positives (ones), second half negatives.
        self.output = {'similaritytarget': tf.concat([tf.ones(self.batch_size), tf.zeros(self.batch_size)], axis=0)}
def pool(n, ntotal, shuffle):
    """Return the candidate indices ``0..ntotal-1`` with ``n`` excluded.

    Passing an out-of-range ``n`` (e.g. -1) excludes nothing. When
    ``shuffle`` is true the result is randomly permuted in place.
    """
    candidates = [idx for idx in range(ntotal) if idx != n]
    if shuffle:
        random.shuffle(candidates)
    return candidates
class Sampling(object):
    """Draws random (target, source) dataset pairs to build training batches."""

    def __init__(self, dataset):
        # Provider exposing `orig_data[split]` collections and an
        # `instances(target, source, split=..., sourcesplit=...)` factory.
        self.dataset = dataset
        # Running log of every (target, source) index pair sampled so far.
        self.distribution = pd.DataFrame(data=None, columns=['targetdataset', 'sourcedataset'])
        # Target indices drawn by the most recent call to sample().
        self.targetdataset = None

    def sample(self, batch, split, sourcesplit):
        """Fill ``batch`` with ``batch.batch_size`` sampled instances and return it."""
        n_source = len(self.dataset.orig_data[sourcesplit])
        n_target = len(self.dataset.orig_data[split])
        targets = np.random.choice(n_target, batch.batch_size)
        # Start from an empty batch.
        batch.clear()
        # Pick one source ("negative") dataset per target; when target and
        # source come from the same split, the target excludes itself from
        # the candidate pool (pool(-1, ...) excludes nothing).
        picks = []
        for tgt in targets:
            excluded = tgt if split == sourcesplit else -1
            candidates = pool(excluded, n_source, shuffle=True)
            picks.append(np.random.choice(candidates))
        sources = np.asarray(picks).reshape(-1,)
        # Materialize each (target, source) pair as a batch instance.
        for tgt, src in zip(targets, sources):
            batch.append(self.dataset.instances(tgt, src, split=split, sourcesplit=sourcesplit))
        # Log the pairs drawn this round into the cumulative distribution.
        pairs = np.concatenate([targets.reshape(-1, 1), sources[:, None]], axis=1)
        self.distribution = pd.concat(
            [self.distribution,
             pd.DataFrame(pairs, columns=['targetdataset', 'sourcedataset'])],
            axis=0, ignore_index=True)
        self.targetdataset = targets
        return batch
class TestSampling(object):
    """Samples evaluation batches against one fixed target dataset."""

    def __init__(self, dataset):
        # Provider exposing `orig_data[split]` collections and an
        # `instances(...)` factory.
        self.dataset = dataset
        # Running log of every (target, source) index pair sampled so far.
        self.distribution = pd.DataFrame(data=None, columns=['targetdataset', 'sourcedataset'])

    def sample(self, batch, split, sourcesplit, targetdataset):
        """Pair ``targetdataset`` with ``batch.batch_size`` distinct sources."""
        n_source = len(self.dataset.orig_data[sourcesplit])
        # Start from an empty batch.
        batch.clear()
        # The target excludes itself from the candidate pool only when target
        # and source come from the same split (pool(-1, ...) excludes nothing).
        if split == sourcesplit:
            candidates = pool(targetdataset, n_source, shuffle=True)
        else:
            candidates = pool(-1, n_source, shuffle=True)
        # Distinct sources; assumes batch.batch_size <= len(candidates) —
        # np.random.choice raises otherwise.
        sources = np.random.choice(candidates, batch.batch_size, replace=False)
        # Build one instance per (fixed target, sampled source) pair.
        for src in sources:
            batch.append(self.dataset.instances(targetdataset, src, split=split, sourcesplit=sourcesplit))
        # Log the pairs drawn this round into the cumulative distribution.
        repeated_target = np.asarray(batch.batch_size * [targetdataset])[:, None]
        pairs = np.concatenate([repeated_target, sources[:, None]], axis=1)
        self.distribution = pd.concat(
            [self.distribution,
             pd.DataFrame(pairs, columns=['targetdataset', 'sourcedataset'])],
            axis=0, ignore_index=True)
        return batch

    def sample_from_one_dataset(self, batch):
        """Fill ``batch`` by repeatedly querying the dataset's no-argument sampler."""
        # Start from an empty batch.
        batch.clear()
        for _ in range(batch.batch_size):
            batch.append(self.dataset.instances())
        return batch