
Commit 4d67a5e

Merge pull request #26 from keroro824/tharun
fixed bugs in sampled softmax code
2 parents 04cba83 + 5a14128 commit 4d67a5e

3 files changed: +53 −17 lines

python_examples/config.py

Lines changed: 9 additions & 8 deletions
@@ -3,21 +3,22 @@
 class config:
     data_path_train = '../dataset/Amazon/amazon_train.txt'
     data_path_test = '../dataset/Amazon/amazon_test.txt'
-    data_path = '../dataset/Amazon/amazon_train.txt'
-    GPUs = '' # empty string uses only CPU
-    num_threads = 96 # Only used when GPUs is empty string
+    GPUs = '0' # empty string uses only CPU
+    num_threads = 44 # Only used when GPUs is empty string
     lr = 0.0001
     ###
     feature_dim = 135909
     n_classes = 670091
     n_train = 490449
     n_test = 153025
-    n_epochs = 20
+    n_epochs = 2
     batch_size = 128
     hidden_dim = 128
     ###
-    log_file = 'log'
-
+    log_file = 'log_amz_ss'
     ### for sampled softmax
-    n_samples = 670091//10
-    max_label = 100
+    n_samples = n_classes//10
+    ### choose the max_labels per training sample.
+    ### If the number of true labels is < max_label,
+    ### we will pad the rest of them with a dummy class (see data_generator_ss in util.py)
+    max_label = 1
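
For reference, the padding rule described in the new config comments can be illustrated with a small standalone sketch (made-up label lists; the dummy class id is n_classes = 670091 from this config, and the real logic lives in data_generator_ss in util.py below):

    # Minimal sketch of the max_label padding/truncation rule (illustration only, not part of the commit).
    import numpy as np

    n_classes = 670091   # the dummy/padding class id is n_classes itself
    max_label = 3        # hypothetical value for illustration; the commit sets max_label = 1

    def pad_or_sample(true_labels):
        if max_label >= len(true_labels):
            # pad with the dummy class until the row has exactly max_label entries
            return true_labels + [n_classes] * (max_label - len(true_labels))
        # otherwise keep a random subset of max_label true labels
        return list(np.random.choice(true_labels, max_label, replace=False))

    print(pad_or_sample([12, 7]))        # -> [12, 7, 670091]
    print(pad_or_sample([5, 9, 2, 44]))  # -> e.g. [9, 44, 2] (random subset of size 3)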

python_examples/example_sampled_softmax.py

Lines changed: 13 additions & 8 deletions
@@ -8,7 +8,7 @@
 from config import config
 from itertools import islice
 #from scipy.sparse import csr_matrix
-from util import data_generator, data_generator_tst
+from util import data_generator_ss, data_generator_tst

 ## Training Params
 def main():
@@ -36,23 +36,28 @@ def main():
     x_idxs = tf.placeholder(tf.int64, shape=[None,2])
     x_vals = tf.placeholder(tf.float32, shape=[None])
     x = tf.SparseTensor(x_idxs, x_vals, [batch_size,feature_dim])
-    y = tf.placeholder(tf.float32, shape=[None,n_classes])
+    y = tf.placeholder(tf.float32, shape=[None,max_label])
     #
     W1 = tf.Variable(tf.truncated_normal([feature_dim,hidden_dim], stddev=2.0/math.sqrt(feature_dim+hidden_dim)))
     b1 = tf.Variable(tf.truncated_normal([hidden_dim], stddev=2.0/math.sqrt(feature_dim+hidden_dim)))
     layer_1 = tf.nn.relu(tf.sparse_tensor_dense_matmul(x,W1)+b1)
     #
-    W2 = tf.Variable(tf.truncated_normal([hidden_dim,n_classes], stddev=2.0/math.sqrt(hidden_dim+n_classes)))
-    b2 = tf.Variable(tf.truncated_normal([n_classes], stddev=2.0/math.sqrt(n_classes+hidden_dim)))
-    logits = tf.matmul(layer_1,W2)+b2
+    if max_label>1: # an extra node for padding a dummy class
+        W2 = tf.Variable(tf.truncated_normal([hidden_dim,n_classes+1], stddev=2.0/math.sqrt(hidden_dim+n_classes)))
+        b2 = tf.Variable(tf.truncated_normal([n_classes+1], stddev=2.0/math.sqrt(n_classes+hidden_dim)))
+        logits = tf.matmul(layer_1,W2[:,:-1])+b2[:-1]
+    else:
+        W2 = tf.Variable(tf.truncated_normal([hidden_dim,n_classes], stddev=2.0/math.sqrt(hidden_dim+n_classes)))
+        b2 = tf.Variable(tf.truncated_normal([n_classes], stddev=2.0/math.sqrt(n_classes+hidden_dim)))
+        logits = tf.matmul(layer_1,W2)+b2
     #
     k=1
     if k==1:
         top_idxs = tf.argmax(logits, axis=1)
     else:
         top_idxs = tf.nn.top_k(logits, k=k, sorted=False)[1]
     #
-    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(tf.transpose(W2),b2,tf.reshape(y,[-1,max_label]),layer_1,n_samples,n_classes,remove_accidental_hits=False, num_true=max_label,partition_strategy='div'))
+    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(tf.transpose(W2), b2, y, layer_1, n_samples, n_classes, remove_accidental_hits=False, num_true=max_label, partition_strategy='div'))
     #
     train_step = tf.train.AdamOptimizer(lr).minimize(loss)
     #
@@ -65,7 +70,7 @@ def main():
     sess = tf.Session(config=Config)
     sess.run(tf.global_variables_initializer())
     #
-    training_data_generator = data_generator(train_files, batch_size, n_classes)
+    training_data_generator = data_generator_ss(train_files, batch_size, n_classes, max_label)
     steps_per_epoch = n_train//batch_size
     n_steps = n_epochs*steps_per_epoch
     n_check = 500
@@ -94,7 +99,7 @@ def main():
         sess.run(train_step, feed_dict={x_idxs:idxs_batch, x_vals:vals_batch, y:labels_batch})
         if i%steps_per_epoch==steps_per_epoch-1:
             total_time+=time.time()-begin_time
-            print('Finished ',i,' steps. Time elapsed for last 100 batches = ',time.time()-begin_time)
+            print('Finished ',i,' steps. Time elapsed for last', i%n_check, 'batches = ',time.time()-begin_time)
             n_steps_val = n_test//batch_size
             test_data_generator = data_generator_tst(test_files, batch_size)
             num_batches = 0
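
The shape contract behind these changes is the one tf.nn.sampled_softmax_loss imposes: weights must be [num_classes, dim] and biases [num_classes], which is why W2 (built as [hidden_dim, n_classes], or [hidden_dim, n_classes+1] with the dummy padding class) is passed transposed, and labels must be [batch_size, num_true] class ids rather than one-hot rows, which is why the y placeholder shrinks from [None, n_classes] to [None, max_label]. A minimal toy-dimension sketch of that call pattern (TF 1.x API, as used in this example; dimensions are made up):

    # Toy-dimension sketch of the sampled-softmax call pattern (not part of the commit).
    import math
    import tensorflow as tf

    hidden_dim, n_classes, n_samples, max_label = 8, 100, 10, 1

    h = tf.placeholder(tf.float32, shape=[None, hidden_dim])   # hidden-layer activations
    y = tf.placeholder(tf.int64, shape=[None, max_label])      # true class ids, not one-hot
    W2 = tf.Variable(tf.truncated_normal([hidden_dim, n_classes],
                                         stddev=2.0/math.sqrt(hidden_dim+n_classes)))
    b2 = tf.Variable(tf.zeros([n_classes]))

    # weights argument must be [n_classes, hidden_dim], hence the transpose
    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
        tf.transpose(W2), b2, y, h,
        num_sampled=n_samples, num_classes=n_classes,
        num_true=max_label, remove_accidental_hits=False,
        partition_strategy='div'))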

python_examples/util.py

Lines changed: 31 additions & 1 deletion
@@ -1,6 +1,6 @@
 from itertools import islice
 import numpy as np
-
+from config import config

 def data_generator(files, batch_size, n_classes):
     while 1:
@@ -34,6 +34,36 @@ def data_generator(files, batch_size, n_classes):
                     lines = []
                     yield (idxs, vals, y_batch)

+def data_generator_ss(files, batch_size, n_classes, max_label):
+    while 1:
+        lines = []
+        for file in files:
+            with open(file,'r',encoding='utf-8') as f:
+                header = f.readline() # ignore the header
+                while True:
+                    temp = len(lines)
+                    lines += list(islice(f,batch_size-temp))
+                    if len(lines)!=batch_size:
+                        break
+                    idxs = []
+                    vals = []
+                    ##
+                    y_batch = [None for i in range(len(lines))]
+                    count = 0
+                    for line in lines:
+                        itms = line.strip().split(' ')
+                        ##
+                        y_batch[count] = [int(itm) for itm in itms[0].split(',')]
+                        if max_label>=len(y_batch[count]): #
+                            y_batch[count] += [n_classes for i in range(max_label-len(y_batch[count]))]
+                        else:
+                            y_batch[count] = np.random.choice(y_batch[count], max_label, replace=False)
+                        ##
+                        idxs += [(count,int(itm.split(':')[0])) for itm in itms[1:]]
+                        vals += [float(itm.split(':')[1]) for itm in itms[1:]]
+                        count += 1
+                    lines = []
+                    yield (idxs, vals, y_batch)

 def data_generator_tst(files, batch_size):
     while 1:
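
As the parsing in data_generator_ss shows, each data line carries the comma-separated true labels first, followed by sparse idx:val feature pairs. For illustration, a hypothetical two-line batch would be unpacked roughly as follows (made-up feature indices and values; the header line of the real files is skipped by the generator):

    # Hypothetical batch of two input lines in the Amazon-670K text format:
    #
    #   12,7 3:0.5 17:1.0
    #   5 3:0.25 8:2.0
    #
    # With batch_size = 2, max_label = 1 and n_classes = 670091, data_generator_ss would yield:
    idxs = [(0, 3), (0, 17), (1, 3), (1, 8)]  # (row in batch, feature index) pairs for the SparseTensor
    vals = [0.5, 1.0, 0.25, 2.0]              # the matching feature values
    y_batch = [[12], [5]]                     # row 0 keeps one of its labels (12 or 7) at random;
                                              # a row with fewer than max_label labels is padded with 670091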
