| 
 | 1 | +import argparse  | 
 | 2 | +import os  | 
 | 3 | +import nltk  | 
 | 4 | + | 
 | 5 | +__author__ = "Gwena Cunha"  | 
 | 6 | + | 
 | 7 | +"""  | 
 | 8 | +    DialSum paper uses a modification of the AMI Meeting Corpus:  | 
 | 9 | +        - train/test/valid split is available in different directories  | 
 | 10 | +        - Original file format: an `in` file with inputs and a `sum` file with summaries, one entry per line  | 
 | 11 | +        - Needed for comparison  | 
 | 12 | +        - Paper: https://www.csie.ntu.edu.tw/~yvchen/doc/SLT18_DialSum.pdf  | 
 | 13 | +        - Code: https://github.com/MiuLab/DialSum  | 
 | 14 | +          | 
 | 15 | +    Objective of this code:  | 
 | 16 | +        - Convert AMI_DialSum data to CNN-DailyMail News Dataset story format  | 
 | 17 | +        - Resulting files: *.story  | 
 | 18 | +      | 
 | 19 | +    Code based on my previous code: https://github.com/gcunhase/AMICorpusXML  | 
 | 20 | +"""  | 
 | 21 | + | 
 | 22 | + | 
 | 23 | +def ensure_dir(directory):  | 
 | 24 | +    if not os.path.exists(directory):  | 
 | 25 | +        os.makedirs(directory)  | 
 | 26 | + | 
 | 27 | + | 
 | 28 | +def transform_to_story(args, sent_detector):  | 
 | 29 | +    """ Transform AMI Corpus into CNN-DailyMail News Dataset story format  | 
 | 30 | +    """  | 
 | 31 | +    print("Make .story files")  | 
 | 32 | +    data_dir = args.ami_dialsum_data_dir  | 
 | 33 | +    results_dir = args.results_story_dir  | 
 | 34 | + | 
 | 35 | +    for data_type in ['train', 'test', 'valid']:  | 
 | 36 | +        # Open in (meeting transcript) and sum (summary) files  | 
 | 37 | +        in_lines = open(data_dir + data_type + '/' + args.input_filename).readlines()  | 
 | 38 | +        sum_lines = open(data_dir + data_type + '/' + args.summary_filename).readlines()  | 
 | 39 | + | 
 | 40 | +        # Ensure results directory with train/test/valid directories exists  | 
 | 41 | +        results_dir_data_type = results_dir + data_type  | 
 | 42 | +        ensure_dir(results_dir_data_type)  | 
 | 43 | + | 
 | 44 | +        for i, (in_line, sum_line) in enumerate(zip(in_lines, sum_lines)):  | 
 | 45 | +            print(i)  | 
 | 46 | +            story_filename = '/in_{}.story'.format(i)  | 
 | 47 | +            story_file = open(results_dir_data_type + story_filename, 'w')  | 
 | 48 | + | 
 | 49 | +            # Write transcript  | 
 | 50 | +            story_file.write('{}\n'.format(in_line))  | 
 | 51 | + | 
 | 52 | +            # Separate summary into sentences  | 
 | 53 | +            sentences = sent_detector.tokenize(sum_line.strip())  | 
 | 54 | +            for sent in sentences:  | 
 | 55 | +                story_file.write('\n\n@highlight\n\n{}'.format(sent))  | 
 | 56 | +            story_file.close()  | 
 | 57 | + | 
 | 58 | + | 
 | 59 | +if __name__ == '__main__':  | 
 | 60 | +    parser = argparse.ArgumentParser(description='Converts file to .story')  | 
 | 61 | +    parser.add_argument('--ami_dialsum_data_dir', type=str, default='data/ami_dialsum_corpus/',  | 
 | 62 | +                        help='AMI DialSum Corpus directory')  | 
 | 63 | +    parser.add_argument('--input_filename', type=str, default='in',  | 
 | 64 | +                        help='AMI DialSum Corpus input filename')  | 
 | 65 | +    parser.add_argument('--summary_filename', type=str, default='sum',  | 
 | 66 | +                        help='AMI DialSum Corpus summary (target) filename')  | 
 | 67 | +    parser.add_argument('--results_story_dir', type=str,  | 
 | 68 | +                        default='data/ami_dialsum_corpus_stories/',  | 
 | 69 | +                        help='AMI Corpus .story files')  | 
 | 70 | +    args = parser.parse_args()  | 
 | 71 | + | 
 | 72 | +    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')  | 
 | 73 | +    transform_to_story(args, sent_detector)  | 
0 commit comments