33import os
44import re
55import json
6+ import random
67import pandas as pd
78from tqdm import tqdm
9+ from collections import defaultdict
810from statistics import median
911from sklearn .model_selection import train_test_split
1012SEED = 42
@@ -440,12 +442,12 @@ def msmarco_query_type_to_tsv(dataDir, readFile, wrtDir, transParamDict, isTrain
440442 #saving
441443 print ('number of samples in final data : ' , len (dfKeep ))
442444 print ('writing for file {} at {}' .format (readFile , wrtDir ))
443- dfKeep .to_csv (os .path .join (wrtDir , 'querytype_{}.tsv' .format (readFile .split ( '.' )[ 0 ] )), sep = '\t ' ,
445+ dfKeep .to_csv (os .path .join (wrtDir , 'querytype_{}.tsv' .format (readFile .lower (). replace ( '.json' , '' ) )), sep = '\t ' ,
444446 index = False , header = False )
445447 if isTrainFile :
446448 allClasses = dfKeep ['query_type' ].unique ()
447449 labelMap = {lab : i for i , lab in enumerate (allClasses )}
448- labelMapPath = os .path .join (wrtDir , 'querytype_{}_label_map.joblib' .format (readFile .split ( '.' )[ 0 ] ))
450+ labelMapPath = os .path .join (wrtDir , 'querytype_{}_label_map.joblib' .format (readFile .lower (). replace ( '.json' , '' ) ))
449451 joblib .dump (labelMap , labelMapPath )
450452 print ('Created label map file at' , labelMapPath )
451453
@@ -668,4 +670,86 @@ def msmarco_answerability_detection_to_tsv(dataDir, readFile, wrtDir, transParam
668670 print ('Dev file written at: ' , os .path .join (wrtDir , 'msmarco_answerability_dev.tsv' ))
669671
670672 devDf .to_csv (os .path .join (wrtDir , 'msmarco_answerability_test.tsv' ), sep = '\t ' , index = False , header = False )
671- print ('Test file written at: ' , os .path .join (wrtDir , 'msmarco_answerability_test.tsv' ))
673+ print ('Test file written at: ' , os .path .join (wrtDir , 'msmarco_answerability_test.tsv' ))
674+
def _write_uid_label_text_tsv(filePath, samples, labels):
    """Write one ``uid<TAB>label<TAB>sample`` row per sample; uid is the running row index."""
    with open(filePath, 'w', encoding='utf-8') as outF:
        outF.writelines("{}\t{}\t{}\n".format(uid, lab, samp)
                        for uid, (samp, lab) in enumerate(zip(samples, labels)))


def clinc_out_of_scope_to_tsv(dataDir, readFile, wrtDir, transParamDict, isTrainFile=False):

    """
    Builds binary in-scope (label 1) / out-of-scope (label 0) train, dev and test tsv files
    from a single json file holding the keys ``train``, ``oos_train``, ``val``, ``oos_val``,
    ``test`` and ``oos_test``. Every written row has the form ``uid \\t label \\t sentence``.

    For using this transform function, set ``transform_func`` : **clinc_out_of_scope_to_tsv** in transform file.

    Args:
        dataDir (:obj:`str`) : Path to the directory where the raw data files to be read are present.
        readFile (:obj:`str`) : This is the file which is currently being read and transformed by the function.
        wrtDir (:obj:`str`) : Path to the directory where to save the transformed tsv files.
        transParamDict (:obj:`dict`, defaults to :obj:`None`): Dictionary requiring the following parameters as key-value

            - ``samples_per_intent_train`` (defaults to 7) : Number of in-scope samples per intent to consider, as this data has imbalance for inscope and outscope

        isTrainFile (:obj:`bool`, defaults to :obj:`False`) : Not used by this function; accepted so the signature
            matches the other transform functions.

    """
    # A missing parameter dictionary means "use all defaults" instead of crashing on None.
    if transParamDict is None:
        transParamDict = {}
    transParamDict.setdefault("samples_per_intent_train", 7)
    samplesPerIntent = int(transParamDict["samples_per_intent_train"])

    print("Making data from file {} ...".format(readFile))
    with open(os.path.join(dataDir, readFile), encoding='utf-8') as rawF:
        raw = json.load(rawF)

    print('Num of train samples in-scope: ', len(raw['train']))
    # group the in-scope training sentences by their intent
    inScopeTrain = defaultdict(list)
    for sentence, intent in raw['train']:
        inScopeTrain[intent].append(sentence)

    #sampling
    inscopeSampledTrain = []
    random.seed(SEED)  # fixed seed keeps the sampled subset reproducible across runs
    for sentences in inScopeTrain.values():
        # random.sample raises ValueError when asked for more items than available,
        # so an intent with too few sentences contributes all of them instead.
        inscopeSampledTrain += random.sample(sentences, min(samplesPerIntent, len(sentences)))

    print('Num of sampled train samples in-scope: ', len(inscopeSampledTrain))
    #out of scope train
    outscopeTrain = [sample[0] for sample in raw['oos_train']]
    print('Num of train out-scope samples: ', len(outscopeTrain))

    #train data
    allTrain = inscopeSampledTrain + outscopeTrain
    allTrainLabels = [1] * len(inscopeSampledTrain) + [0] * len(outscopeTrain)

    #writing train data file
    trainPath = os.path.join(wrtDir, 'clinc_outofscope_train.tsv')
    _write_uid_label_text_tsv(trainPath, allTrain, allTrainLabels)
    print('Train file written at: ', trainPath)

    #making dev set
    inscopeDev = [sample[0] for sample in raw['val']]
    outscopeDev = [sample[0] for sample in raw['oos_val']]
    print('Num of val out-scope samples: ', len(outscopeDev))
    print('Num of val in-scope samples: ', len(inscopeDev))

    # NOTE(review): the dev file holds only the out-of-scope samples; the in-scope
    # validation sentences are counted above but not written. This looks deliberate
    # in the original code - confirm before adding them.
    allDev = outscopeDev
    allDevLabels = [0] * len(outscopeDev)

    #writing dev data file
    devPath = os.path.join(wrtDir, 'clinc_outofscope_dev.tsv')
    _write_uid_label_text_tsv(devPath, allDev, allDevLabels)
    print('Dev file written at: ', devPath)

    #making test set
    inscopeTest = [sample[0] for sample in raw['test']]
    outscopeTest = [sample[0] for sample in raw['oos_test']]
    print('Num of test out-scope samples: ', len(outscopeTest))
    print('Num of test in-scope samples: ', len(inscopeTest))

    allTest = inscopeTest + outscopeTest
    allTestLabels = [1] * len(inscopeTest) + [0] * len(outscopeTest)

    #writing test data file
    testPath = os.path.join(wrtDir, 'clinc_outofscope_test.tsv')
    _write_uid_label_text_tsv(testPath, allTest, allTestLabels)
    print('Test file written at: ', testPath)
0 commit comments