-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsampling.py
More file actions
executable file
·135 lines (109 loc) · 4.97 KB
/
sampling.py
File metadata and controls
executable file
·135 lines (109 loc) · 4.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 25 16:21:42 2020
@author: hsjomaa
"""
import tensorflow as tf
import pandas as pd
import random
import numpy as np
# Seed every RNG used in this module (numpy, stdlib random, tensorflow) at
# import time so batch/dataset sampling below is reproducible across runs.
np.random.seed(318)
random.seed(3718)
tf.random.set_seed(0)
class Batch(object):
    """Accumulates sampled instance tuples and assembles them into model input.

    Each appended instance is a 4-tuple
    ``(x, instances, features, classes)`` whose elements are concatenated
    along axis 0 by :meth:`collect` to form the network input.
    """
    def __init__(self, batch_size, fixed_shape=True):
        # Number of instances a complete batch must hold.
        self.batch_size = batch_size
        # When True, collect() refuses to build a partially filled batch.
        self.fixed_shape = fixed_shape
        self.clear()

    def clear(self):
        """Reset all accumulators to an empty state."""
        # flattened triplets
        self.x = []
        # number of instances per item in triplets
        self.instances = []
        # number of features per item in triplets
        self.features = []
        # number of classes per item in triplets
        self.classes = []
        # model input
        self.input = None
        # model target (set by collect); initialized here so the attribute
        # always exists even before the first collect() call
        self.output = None

    def append(self, instance):
        """Add one instance tuple; a full batch is discarded and restarted first."""
        if len(self.x) == self.batch_size:
            # Batch already full: start a fresh one rather than overflowing.
            self.clear()
        self.x.append(instance[0])
        self.instances.append(instance[1])
        self.features.append(instance[2])
        self.classes.append(instance[3])

    def collect(self):
        """Concatenate accumulated pieces into (input, output) tensors.

        Raises:
            ValueError: if ``fixed_shape`` is True and the batch is not full.
        """
        if len(self.x) != self.batch_size and self.fixed_shape:
            # BUG FIX: original code raised a plain string, which itself raises
            # "TypeError: exceptions must derive from BaseException" in Py3.
            raise ValueError(f'Batch formation incomplete!\n{len(self.x)}!={self.batch_size}')
        self.input = (tf.concat(self.x, axis=0),
                      tf.cast(tf.transpose(tf.concat(self.classes, axis=0)), dtype=tf.int32),
                      tf.cast(tf.transpose(tf.concat(self.features, axis=0)), dtype=tf.int32),
                      tf.cast(tf.transpose(tf.concat(self.instances, axis=0)), dtype=tf.int32),
                      )
        # First batch_size entries are positives (ones), second half negatives.
        self.output = {'similaritytarget': tf.concat([tf.ones(self.batch_size), tf.zeros(self.batch_size)], axis=0)}
def pool(n, ntotal, shuffle):
    """Return the candidate indices ``0..ntotal-1`` with ``n`` excluded.

    Passing an out-of-range ``n`` (e.g. -1) excludes nothing. When
    ``shuffle`` is true the result is randomly permuted in place.
    """
    candidates = [idx for idx in range(ntotal) if idx != n]
    if shuffle:
        random.shuffle(candidates)
    return candidates
class Sampling(object):
    """Draws random (target, source) dataset pairs to build training batches."""

    def __init__(self, dataset):
        # Provider exposing `orig_data[split]` collections and an
        # `instances(target, source, split=..., sourcesplit=...)` factory.
        self.dataset = dataset
        # Running log of every (target, source) index pair sampled so far.
        self.distribution = pd.DataFrame(data=None, columns=['targetdataset', 'sourcedataset'])
        # Target indices drawn by the most recent call to sample().
        self.targetdataset = None

    def sample(self, batch, split, sourcesplit):
        """Fill ``batch`` with ``batch.batch_size`` sampled instances and return it."""
        n_source = len(self.dataset.orig_data[sourcesplit])
        n_target = len(self.dataset.orig_data[split])
        targets = np.random.choice(n_target, batch.batch_size)
        # Start from an empty batch.
        batch.clear()
        # Pick one source ("negative") dataset per target; when target and
        # source come from the same split, the target excludes itself from
        # the candidate pool (pool(-1, ...) excludes nothing).
        picks = []
        for tgt in targets:
            excluded = tgt if split == sourcesplit else -1
            candidates = pool(excluded, n_source, shuffle=True)
            picks.append(np.random.choice(candidates))
        sources = np.asarray(picks).reshape(-1,)
        # Materialize each (target, source) pair as a batch instance.
        for tgt, src in zip(targets, sources):
            batch.append(self.dataset.instances(tgt, src, split=split, sourcesplit=sourcesplit))
        # Log the pairs drawn this round into the cumulative distribution.
        pairs = np.concatenate([targets.reshape(-1, 1), sources[:, None]], axis=1)
        self.distribution = pd.concat(
            [self.distribution,
             pd.DataFrame(pairs, columns=['targetdataset', 'sourcedataset'])],
            axis=0, ignore_index=True)
        self.targetdataset = targets
        return batch
class TestSampling(object):
    """Samples evaluation batches against one fixed target dataset."""

    def __init__(self, dataset):
        # Provider exposing `orig_data[split]` collections and an
        # `instances(...)` factory.
        self.dataset = dataset
        # Running log of every (target, source) index pair sampled so far.
        self.distribution = pd.DataFrame(data=None, columns=['targetdataset', 'sourcedataset'])

    def sample(self, batch, split, sourcesplit, targetdataset):
        """Pair ``targetdataset`` with ``batch.batch_size`` distinct sources."""
        n_source = len(self.dataset.orig_data[sourcesplit])
        # Start from an empty batch.
        batch.clear()
        # The target excludes itself from the candidate pool only when target
        # and source come from the same split (pool(-1, ...) excludes nothing).
        if split == sourcesplit:
            candidates = pool(targetdataset, n_source, shuffle=True)
        else:
            candidates = pool(-1, n_source, shuffle=True)
        # Distinct sources; assumes batch.batch_size <= len(candidates) —
        # np.random.choice raises otherwise.
        sources = np.random.choice(candidates, batch.batch_size, replace=False)
        # Build one instance per (fixed target, sampled source) pair.
        for src in sources:
            batch.append(self.dataset.instances(targetdataset, src, split=split, sourcesplit=sourcesplit))
        # Log the pairs drawn this round into the cumulative distribution.
        repeated_target = np.asarray(batch.batch_size * [targetdataset])[:, None]
        pairs = np.concatenate([repeated_target, sources[:, None]], axis=1)
        self.distribution = pd.concat(
            [self.distribution,
             pd.DataFrame(pairs, columns=['targetdataset', 'sourcedataset'])],
            axis=0, ignore_index=True)
        return batch

    def sample_from_one_dataset(self, batch):
        """Fill ``batch`` by repeatedly querying the dataset's no-argument sampler."""
        # Start from an empty batch.
        batch.clear()
        for _ in range(batch.batch_size):
            batch.append(self.dataset.instances())
        return batch