-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
127 lines (83 loc) · 3.57 KB
/
preprocessing.py
File metadata and controls
127 lines (83 loc) · 3.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import constants
from logger import save_dataframe, read_dataframe
import numpy as np
import pandas as pd
import torch
class Vocabulary:
    """Bidirectional word <-> index mapping with per-word occurrence counts.

    Indices 0, 1 and 2 are reserved for the special tokens "<sos>", "<eos>"
    and "<unk>". They live only in ``index2word``; ``word2index`` and
    ``word2count`` hold the collected words, whose indices start at 3.
    ``len(vocabulary)`` counts the special tokens plus the collected words.
    """

    # TODO: Take these values from constants
    _SPECIAL_TOKENS = {0: "<sos>", 1: "<eos>", 2: "<unk>"}

    def __init__(self):
        self.word2count = {}
        self.word2index = {}
        # Copy so that instances never share (or mutate) the class-level dict.
        self.index2word = dict(self._SPECIAL_TOKENS)
        self.__n_words = len(self.index2word)

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return self.__n_words

    def collectWordsFrom(self, sentences):
        """Register every word of every sentence.

        Args:
            sentences: iterable of sentences, each an iterable of words.
        """
        for sentence in sentences:
            for word in sentence:
                self.__addWord(word)

    def save(self, fname):
        """Write the collected words, their indices and counts via save_dataframe.

        Special tokens are not written; only the words in ``word2index`` are.
        """
        save_dataframe(self.__toDataframe(), fname)

    def load(self, fname):
        """Replace this vocabulary's contents with those read via read_dataframe.

        The file is expected to provide 'word', 'index' and 'count' columns,
        i.e. the layout produced by ``save``.
        """
        self.__loadFromDataframe(read_dataframe(fname))

    def __toDataframe(self):
        """Build the ('word', 'index', 'count') table that ``save`` writes."""
        words = list(self.word2index)
        return pd.DataFrame(
            {
                'word': words,
                'index': [self.word2index[word] for word in words],
                'count': [self.word2count[word] for word in words],
            },
            columns=['word', 'index', 'count'],
        )

    def __loadFromDataframe(self, df):
        """Reset all mappings from a ('word', 'index', 'count') table."""
        self.word2count = dict(zip(df['word'], df['count']))
        self.word2index = dict(zip(df['word'], df['index']))
        # Rebuild from the special tokens only. Merging with the previous
        # index2word would keep stale words from before the load, leaving
        # index2word out of sync with the freshly loaded word2index.
        # Special tokens still take precedence over loaded rows on a clash.
        self.index2word = {
            **dict(zip(df['index'], df['word'])),
            **self._SPECIAL_TOKENS,
        }
        self.__n_words = len(self.index2word)

    def __addWord(self, word):
        """Assign the next free index to a new word, or bump its count."""
        if word not in self.word2index:
            self.word2index[word] = self.__n_words
            self.word2count[word] = 1
            self.index2word[self.__n_words] = word
            self.__n_words += 1
        else:
            self.word2count[word] += 1
class NumberEncoder:
    """Encodes words as numbers and decodes numbers as words
    using a given vocabulary"""

    def __init__(self, vocabulary):
        # The vocabulary must expose `word2index` and `index2word` mappings.
        self.vocabulary = vocabulary

    def encodeSentence(self, sentence):
        """Return the indices of the sentence's words followed by EOS_TOKEN.

        Words missing from the vocabulary are encoded as constants.UNK_TOKEN.
        """
        numbers = [self.__encodeWord(word) for word in sentence]
        numbers.append(constants.EOS_TOKEN)
        return numbers

    def decodeSentence(self, numbers):
        """Return the words for a sequence of indices, without the final EOS.

        The last element is dropped only when it actually is
        constants.EOS_TOKEN, so a sequence that does not end with EOS keeps
        its last word instead of silently losing it.

        Raises:
            KeyError: if an index is not present in the vocabulary.
        """
        # len() rather than truthiness so array-like inputs work as well.
        if len(numbers) > 0 and numbers[-1] == constants.EOS_TOKEN:
            numbers = numbers[:-1]
        return [self.__decodeWord(number) for number in numbers]

    def __encodeWord(self, word):
        """Return the index of `word`, or UNK_TOKEN when it is unknown."""
        index = self.vocabulary.word2index.get(word)
        return constants.UNK_TOKEN if index is None else index

    def __decodeWord(self, number):
        """Return the word stored at index `number`."""
        return self.vocabulary.index2word[number]
class TensorEncoder:
    """Converts DataFrame columns of word sequences to index tensors and back.

    One NumberEncoder is built for the input vocabulary and one for the
    output vocabulary. Both `encode` and `decode` overwrite the given
    columns of the DataFrame they receive and return that same DataFrame.
    """

    def __init__(self, input_vocab, output_vocab):
        self.input_num_encoder = NumberEncoder(input_vocab)
        self.output_num_encoder = NumberEncoder(output_vocab)

    def encode(self, df, input_column, output_columns):
        """Replace word sequences with column tensors of word indices.

        Args:
            df: DataFrame whose cells in the named columns are sentences.
            input_column: name of the column encoded with the input vocabulary.
            output_columns: names of the columns encoded with the output
                vocabulary.

        Returns:
            The same DataFrame, modified in place.
        """
        output_columns = list(output_columns)
        df[input_column] = df[input_column].apply(self.input_num_encoder.encodeSentence)
        for col in output_columns:
            df[col] = df[col].apply(self.output_num_encoder.encodeSentence)
        for col in [input_column, *output_columns]:
            df[col] = df[col].apply(self.__list_to_tensor)
        return df

    def decode(self, df, input_column, output_columns):
        """Replace index sequences (tensors or lists) with word sequences.

        A column is converted from tensors to lists first when its first
        cell is a tensor; columns already holding lists are decoded as is.
        An empty DataFrame is returned without error.

        Returns:
            The same DataFrame, modified in place.
        """
        output_columns = list(output_columns)
        for col in [input_column, *output_columns]:
            column = df[col]
            # Guard the empty case: `.iloc[0]` raises IndexError on it.
            if len(column) > 0 and isinstance(column.iloc[0], torch.Tensor):
                df[col] = column.apply(self.__tensor_to_list)
        df[input_column] = df[input_column].apply(self.input_num_encoder.decodeSentence)
        for col in output_columns:
            df[col] = df[col].apply(self.output_num_encoder.decodeSentence)
        return df

    def __list_to_tensor(self, sentence):
        """Return `sentence` as a long tensor of shape (len, 1) on constants.DEVICE."""
        return torch.tensor(sentence, dtype=torch.long, device=constants.DEVICE).view(-1, 1)

    def __tensor_to_list(self, tensor):
        """Flatten `tensor` and return its values as a plain Python list."""
        return tensor.view(-1).tolist()