-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcreatedataframeslc.py
More file actions
99 lines (81 loc) · 4 KB
/
createdataframeslc.py
File metadata and controls
99 lines (81 loc) · 4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
This file creates the dataframe for the SLC task.
"""
import os
import argparse
import pandas as pd
from createlabelsdataframeslc import CreateLabelsDataframeSLC
class CreateDataframeSLC:
    '''
    Task SLC
    The format of a tab-separated line of the gold label and the submission files for task SLC is:
    article_id sentence_id label
    where article_id and sentence_id are the identifiers of the article and the sentence
    (the first sentence has id 1) and label={propaganda/non-propaganda}.
    Gold and submission files must have the same number of rows as the number of sentences,
    i.e. of lines, in the article. In order to help participants preparing a submission, we provide
    template prediction files, which have the same format of the gold files where label
    is replaced with ?. Sentences are split using dots.
    '''
    @staticmethod
    def load_sentences_with_labels(path: str, path_to_labels: str, savepath: str):
        '''
        Build the sentence-level dataframe for task SLC and pickle it.

        Walks every article file under ``path``, pairs each non-empty line
        (sentence) with its article id and 1-based line number, then left-merges
        the result onto the labels dataframe produced by
        ``CreateLabelsDataframeSLC.load_labels``.

        :param path: directory containing the article text files
        :param path_to_labels: path passed through to the labels loader
        :param savepath: destination for the pickled dataframe
        :return: Pandas dataframe with columns
                 ['article_id', 'line', 'is_propaganda', 'sentence']
        '''
        rows = []
        print("Creating the labels dataframe..")
        # Helper builds a dataframe with article_id / line / is_propaganda columns.
        labels_df = CreateLabelsDataframeSLC.load_labels(path_to_labels)
        print("Looping through the articles")
        for dirpath, _subdirs, files in os.walk(path):  # Go through the directory with the files
            for filename in files:
                filepath = os.path.join(dirpath, filename)
                # Strip the 'article' prefix and the 4-char extension (e.g. '.txt')
                # to recover the bare article id — assumes names like 'articleNNNN.txt'.
                article_id = filename[7:-4]
                line_no = 1  # Sentences are numbered from 1 within each article
                with open(filepath, 'r', encoding='utf-8') as single_file:
                    # Iterate the file directly instead of readlines() to avoid
                    # materializing the whole article in memory.
                    for raw_line in single_file:
                        tokens = raw_line.strip().split('\t')
                        rows.append([tokens, article_id, line_no])
                        line_no += 1
        # Drop blank lines (they split to ['']) so they don't become sentences.
        str_list = [row for row in rows if row[0] != ['']]
        sentences = pd.DataFrame(str_list, columns=['sentence', 'article_id', 'line'])
        # Vectorized column selection replaces the former row-by-row iterrows()
        # copy loop — same columns, same order, fresh index.
        dataframe = labels_df[['article_id', 'line', 'is_propaganda']].copy().reset_index(drop=True)
        dataframe['line'] = pd.to_numeric(dataframe['line'])  # Line to numeric for the merge below
        # Left merge keeps every labelled (article_id, line) pair, attaching its sentence.
        final_df = dataframe.merge(sentences, on=['line', 'article_id'], how='left')
        # Create a pickle of the dataframe
        print('Saving dataframe..')
        final_df.to_pickle(savepath)
        print("Completed")
        return final_df
if __name__ == '__main__':
    # Command-line entry point: parse the three required paths and build the dataframe.
    CLI = argparse.ArgumentParser()
    for arg_name, arg_help in (
            ("path", "Path to get the train dataset"),
            ("path_to_labels", "Path to get the labels dataset"),
            ("savepath", "Path to save the pickle of the output dataframe"),
    ):
        CLI.add_argument(arg_name, help=arg_help)
    PARSED = CLI.parse_args()
    RESULT = CreateDataframeSLC.load_sentences_with_labels(
        PARSED.path, PARSED.path_to_labels, PARSED.savepath)