
Commit 4f608c5

transform function fix
1 parent c4c1df3 commit 4f608c5

File tree

1 file changed (+0, −159 lines)

utils/tranform_functions.py

Lines changed: 0 additions & 159 deletions
@@ -340,93 +340,6 @@ def generate_ngram_sequences(data, seq_len_right, seq_len_left):
         i += 1
     return sequence_dict
 
-def validate_sequences(sequence_dict, seq_len_right, seq_len_left):
-    micro_sequences = []
-    macro_sequences = {}
-
-    for key in sequence_dict.keys():
-        score = sequence_dict[key]
-
-        if score < 1 and len(key.split()) <= seq_len_right:
-            micro_sequences.append(key)
-        else:
-            macro_sequences[key] = score
-
-    non_frag_sequences = []
-    macro_sequences_copy = macro_sequences.copy()
-
-    for sent in tqdm(micro_sequences, total=len(micro_sequences)):
-        for key in macro_sequences.keys():
-            if sent in key:
-                non_frag_sequences.append(key)
-                del macro_sequences_copy[key]
-
-        macro_sequences = macro_sequences_copy.copy()
-
-    for sent in non_frag_sequences:
-        macro_sequences[sent] = 0
-
-    for sent in micro_sequences:
-        macro_sequences[sent] = 0
-
-    return macro_sequences
-
-def create_fragment_detection_tsv(dataDir, readFile, wrtDir, transParamDict, isTrainFile=False):
-
-    """
-    This function transforms data for the fragment detection task (detecting whether a sentence is an incomplete fragment or not).
-    It takes data in single sentence classification format and creates fragment samples from the sentences.
-    In the transformed file, labels 1 and 0 represent fragment and non-fragment sentences respectively.
-    The following transformed file is written at wrtDir
-
-    - Fragment transformed tsv file containing fragment/non-fragment sentences and labels
-
-    To use this transform function, set ``transform_func`` : **create_fragment_detection_tsv** in the transform file.
-    Args:
-        dataDir (:obj:`str`) : Path to the directory where the raw data files to be read are present.
-        readFile (:obj:`str`) : This is the file which is currently being read and transformed by the function.
-        wrtDir (:obj:`str`) : Path to the directory where the transformed tsv files are saved.
-        transParamDict (:obj:`dict`, defaults to :obj:`None`): Dictionary requiring the following parameters as key-value
-
-            - ``data_frac`` (defaults to 0.2) : Fraction of data to consider for making fragments.
-            - ``seq_len_right`` (defaults to 3) : Right window length for making n-grams.
-            - ``seq_len_left`` (defaults to 2) : Left window length for making n-grams.
-            - ``sep`` (defaults to "\t") : Column separator for the input file.
-            - ``query_col`` (defaults to 2) : Column number containing sentences. Counting starts from 0.
-
-    """
-
-    transParamDict.setdefault("data_frac", 0.2)
-    transParamDict.setdefault("seq_len_right", 3)
-    transParamDict.setdefault("seq_len_left", 2)
-    transParamDict.setdefault("sep", "\t")
-    transParamDict.setdefault("query_col", 2)
-
-    allDataDf = pd.read_csv(os.path.join(dataDir, readFile), sep=transParamDict["sep"], header=None)
-    sampledDataDf = allDataDf.sample(frac=float(transParamDict['data_frac']), random_state=42)
-
-    # the query_col column (default 2) is considered to hold queries; 0th is uid, 1st is label
-    # making n-grams with left and right windows
-    seqDict = generate_ngram_sequences(data=list(sampledDataDf.iloc[:, int(transParamDict["query_col"])]),
-                                       seq_len_right=transParamDict['seq_len_right'],
-                                       seq_len_left=transParamDict['seq_len_left'])
-
-    fragDict = validate_sequences(seqDict, seq_len_right=transParamDict['seq_len_right'],
-                                  seq_len_left=transParamDict['seq_len_left'])
-
-    finalDf = pd.DataFrame({'uid': [i for i in range(len(fragDict) + len(allDataDf))],
-                            'label': [1]*len(fragDict) + [0]*len(allDataDf),
-                            'query': list(fragDict.keys()) + list(allDataDf.iloc[:, int(transParamDict["query_col"])])})
-
-    print('number of fragment samples : ', len(fragDict))
-    print('number of non-fragment samples : ', len(allDataDf))
-    # saving
-    print('writing fragment file for {} at {}'.format(readFile, wrtDir))
-
-    finalDf.to_csv(os.path.join(wrtDir, 'fragment_{}.tsv'.format(readFile.split('.')[0])), sep='\t',
-                   index=False, header=False)
-
 def msmarco_query_type_to_tsv(dataDir, readFile, wrtDir, transParamDict, isTrainFile=False):
 
     """
@@ -573,79 +486,7 @@ def qqp_query_similarity_to_tsv(dataDir, readFile, wrtDir, transParamDict, isTra
                   index=False, header=False)
     print('Test file saved at: {}'.format(os.path.join(wrtDir, 'qqp_query_similarity_test.tsv')))
 
-def msmarco_answerability_detection_to_tsv(dataDir, readFile, wrtDir, transParamDict, isTrainFile=False):
-    """
-    This function transforms the MSMARCO triples data available at `triples <https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.small.tar.gz>`_
-
-    The data contains triplets where the first entry is the query, the second is a context passage from which the query can be
-    answered (positive passage), while the third entry is a context passage from which the query cannot be answered (negative passage).
-    Data is transformed into sentence pair classification format, with the query-positive context pair labeled as 1 (answerable)
-    and the query-negative context pair labeled as 0 (non-answerable).
-
-    The following transformed files are written at wrtDir
-
-    - Sentence pair transformed downsampled file.
-    - Sentence pair transformed train tsv file for the answerability task
-    - Sentence pair transformed dev tsv file for the answerability task
-    - Sentence pair transformed test tsv file for the answerability task
-
-    To use this transform function, set ``transform_func`` : **msmarco_answerability_detection_to_tsv** in the transform file.
 
-    Args:
-        dataDir (:obj:`str`) : Path to the directory where the raw data files to be read are present.
-        readFile (:obj:`str`) : This is the file which is currently being read and transformed by the function.
-        wrtDir (:obj:`str`) : Path to the directory where the transformed tsv files are saved.
-        transParamDict (:obj:`dict`, defaults to :obj:`None`): Dictionary of function-specific parameters. Optional here, as the only key has a default.
-
-            - ``data_frac`` (defaults to 0.01) : Fraction of data to keep while downsampling, as the original data size is very large.
-    """
-    transParamDict.setdefault("data_frac", 0.01)
-    sampleEvery = int(1/float(transParamDict["data_frac"]))
-    startId = 0
-    print('Making data from file {} ....'.format(readFile))
-    rf = open(os.path.join(dataDir, readFile))
-    sf = open(os.path.join(wrtDir, 'msmarco_triples_sampled.tsv'), 'w')
-
-    # reading the big file line by line
-    for i, row in enumerate(rf):
-        # progress logging
-        if i % 100000 == 0:
-            print("Processing {} rows...".format(i))
-
-        # sampling: keep every sampleEvery-th triple
-        if i % sampleEvery == 0:
-            rowData = row.split('\t')
-            posRowData = str(startId) + '\t' + str(1) + '\t' + rowData[0] + '\t' + rowData[1]
-            negRowData = str(startId+1) + '\t' + str(0) + '\t' + rowData[0] + '\t' + rowData[2].rstrip('\n')
-
-            # an important point here is to strip the trailing '\n' after the negative
-            # passage, otherwise it will corrupt the dataframe
-
-            # writing the positive and negative samples into the new sampled data file
-            sf.write(posRowData + '\n')
-            sf.write(negRowData + '\n')
-
-            # increasing the id count
-            startId += 2
-
-    # close the files so the sampled tsv is flushed to disk before re-reading
-    rf.close()
-    sf.close()
-
-    print('Total number of rows in original data: ', i)
-    print('Number of answerable samples in downsampled data: ', int(startId / 2))
-    print('Number of non-answerable samples in downsampled data: ', int(startId / 2))
-    print('Downsampled msmarco triples tsv saved at: {}'.format(os.path.join(wrtDir, 'msmarco_triples_sampled.tsv')))
-
-    # making the train, dev, test split
-    sampledDf = pd.read_csv(os.path.join(wrtDir, 'msmarco_triples_sampled.tsv'), sep='\t', header=None)
-    trainDf, testDf = train_test_split(sampledDf, shuffle=True, random_state=SEED,
-                                       test_size=0.02)
-    trainDf.to_csv(os.path.join(wrtDir, 'msmarco_answerability_train.tsv'), sep='\t', index=False, header=False)
-    print('Train file written at: ', os.path.join(wrtDir, 'msmarco_answerability_train.tsv'))
-
-    devDf, testDf = train_test_split(testDf, shuffle=True, random_state=SEED,
-                                     test_size=0.5)
-    devDf.to_csv(os.path.join(wrtDir, 'msmarco_answerability_dev.tsv'), sep='\t', index=False, header=False)
-    print('Dev file written at: ', os.path.join(wrtDir, 'msmarco_answerability_dev.tsv'))
-
-    # write the test split from testDf (the original wrote devDf here, which
-    # would have duplicated the dev set as the test set)
-    testDf.to_csv(os.path.join(wrtDir, 'msmarco_answerability_test.tsv'), sep='\t', index=False, header=False)
-    print('Test file written at: ', os.path.join(wrtDir, 'msmarco_answerability_test.tsv'))
 
 def query_correctness_to_tsv(dataDir, readFile, wrtDir, transParamDict, isTrainFile=False):

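The downsampling arithmetic in the removed function is worth spelling out: with the default `data_frac` of 0.01, every 100th triple is kept, and each kept triple emits two rows (one answerable, one non-answerable). A quick sketch of that arithmetic; the input size below is hypothetical, not the actual size of the MSMARCO triples file.

# Downsampling arithmetic from the removed msmarco_answerability_detection_to_tsv.
# The triple count here is a made-up example; the real triples.train.small
# file is far larger.

data_frac = 0.01
sample_every = int(1 / data_frac)        # keep one triple in every 100

total_triples = 1_000_000                # hypothetical input size
kept = sum(1 for i in range(total_triples) if i % sample_every == 0)

print(kept)        # 10000 triples kept
print(kept * 2)    # 20000 rows written: one positive + one negative per triple

The two `train_test_split` calls then carve the sampled rows into roughly 98% train, 1% dev, and 1% test (`test_size=0.02`, followed by `test_size=0.5` on the held-out 2%).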
0 commit comments