-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcreatedataframeslc.py
More file actions
99 lines (81 loc) · 4 KB
/
createdataframeslc.py
File metadata and controls
99 lines (81 loc) · 4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
This file creates the dataframe for the SLC task.
"""
import os
import argparse
import pandas as pd
from createlabelsdataframeslc import CreateLabelsDataframeSLC
class CreateDataframeSLC:
    '''
    Task SLC
    The format of a tab-separated line of the gold label and the submission files for task SLC is:
    article_id sentence_id label
    where article_id and sentence_id are the identifiers of the article and the sentence
    (the first sentence has id 1) and label={propaganda/non-propaganda}.
    Gold and submission files must have the same number of rows as the number of sentences,
    i.e. of lines, in the article. In order to help participants preparing a submission, we provide
    template prediction files, which have the same format of the gold files where label
    is replaced with ?. Sentences are split using dots.
    '''
    @staticmethod
    def load_sentences_with_labels(path: str, path_to_labels: str, savepath: str):
        '''
        Build the sentence-level dataframe for task SLC and pickle it.

        Walks every article file under ``path``, pairs each non-empty line
        (sentence) with its article id and 1-based line number, then left-merges
        the result onto the labels dataframe produced by
        ``CreateLabelsDataframeSLC.load_labels``.

        :param path: directory containing the article text files
        :param path_to_labels: path passed through to the labels loader
        :param savepath: destination for the pickled dataframe
        :return: Pandas dataframe with columns
                 ['article_id', 'line', 'is_propaganda', 'sentence']
        '''
        rows = []
        print("Creating the labels dataframe..")
        # Helper builds a dataframe with article_id / line / is_propaganda columns.
        labels_df = CreateLabelsDataframeSLC.load_labels(path_to_labels)
        print("Looping through the articles")
        for dirpath, _subdirs, files in os.walk(path):  # Go through the directory with the files
            for filename in files:
                filepath = os.path.join(dirpath, filename)
                # Strip the 'article' prefix and the 4-char extension (e.g. '.txt')
                # to recover the bare article id — assumes names like 'articleNNNN.txt'.
                article_id = filename[7:-4]
                line_no = 1  # Sentences are numbered from 1 within each article
                with open(filepath, 'r', encoding='utf-8') as single_file:
                    # Iterate the file directly instead of readlines() to avoid
                    # materializing the whole article in memory.
                    for raw_line in single_file:
                        tokens = raw_line.strip().split('\t')
                        rows.append([tokens, article_id, line_no])
                        line_no += 1
        # Drop blank lines (they split to ['']) so they don't become sentences.
        str_list = [row for row in rows if row[0] != ['']]
        sentences = pd.DataFrame(str_list, columns=['sentence', 'article_id', 'line'])
        # Vectorized column selection replaces the former row-by-row iterrows()
        # copy loop — same columns, same order, fresh index.
        dataframe = labels_df[['article_id', 'line', 'is_propaganda']].copy().reset_index(drop=True)
        dataframe['line'] = pd.to_numeric(dataframe['line'])  # Line to numeric for the merge below
        # Left merge keeps every labelled (article_id, line) pair, attaching its sentence.
        final_df = dataframe.merge(sentences, on=['line', 'article_id'], how='left')
        # Create a pickle of the dataframe
        print('Saving dataframe..')
        final_df.to_pickle(savepath)
        print("Completed")
        return final_df
if __name__ == '__main__':
    # Command-line entry point: parse the three required paths and build the dataframe.
    CLI = argparse.ArgumentParser()
    for arg_name, arg_help in (
            ("path", "Path to get the train dataset"),
            ("path_to_labels", "Path to get the labels dataset"),
            ("savepath", "Path to save the pickle of the output dataframe"),
    ):
        CLI.add_argument(arg_name, help=arg_help)
    PARSED = CLI.parse_args()
    RESULT = CreateDataframeSLC.load_sentences_with_labels(
        PARSED.path, PARSED.path_to_labels, PARSED.savepath)