# seq2vec 0.4.0
Turn a sequence of words into a fixed-length representation vector.

This version refactors all of the seq2vec structures and uses the customized layers from yklz.

## Install
```
python setup.py install
```

Simple hash:
``` python
from seq2vec import Seq2VecHash

transformer = Seq2VecHash(vector_length=100)
seqs = [
    ['我', '有', '一個', '蘋果'],
    ['我', '有', 'pineapple'],
]
result = transformer.transform(seqs)
'''
array([[ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        ...
'''
```
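
If you want to check which hash buckets a sequence landed in, something like the following should work (an illustrative sketch; it assumes `result` is the NumPy array returned by `transform`):

``` python
import numpy as np

# indices of the non-zero entries for the first sequence
print(np.nonzero(result[0]))
```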

Sequence-to-sequence auto-encoder:

* LSTM to LSTM auto-encoder with word embedding (RNN to RNN architecture)

``` python
from seq2vec.word2vec import GensimWord2vec
from seq2vec import Seq2VecR2RWord

# load a Gensim word2vec model from word2vec_model_path
word2vec = GensimWord2vec(word2vec_model_path)

transformer = Seq2VecR2RWord(
    word2vec_model=word2vec,
    max_length=20,
    latent_size=300,
    encoding_size=300,
    learning_rate=0.05
)

train_seq = [
    ['我', '有', '一個', '蘋果'],
    ['我', '有', '筆'],
    ['一個', '鳳梨'],
]
test_seq = [
    ['我', '愛', '吃', '鳳梨'],
]
transformer.fit(train_seq)
result = transformer.transform(test_seq)
```
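
`transform` returns one encoded vector per input sequence. As a quick sanity check (illustrative; the exact output width is determined by the model, here presumably `encoding_size`):

``` python
# one row per test sequence
print(result.shape)
```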

* CNN to LSTM auto-encoder with word embedding (CNN to RNN architecture)

``` python
from seq2vec.word2vec import GensimWord2vec
from seq2vec import Seq2VecC2RWord

# load a Gensim word2vec model from word2vec_model_path
word2vec = GensimWord2vec(word2vec_model_path)

transformer = Seq2VecC2RWord(
    word2vec_model=word2vec,
    max_length=20,
    latent_size=300,
    conv_size=5,
    channel_size=10,
    learning_rate=0.05,
)

train_seq = [
    ['我', '有', '一個', '蘋果'],
    ['我', '有', '筆'],
    ['一個', '鳳梨'],
]
test_seq = [
    ['我', '愛', '吃', '鳳梨'],
]
transformer.fit(train_seq)
result = transformer.transform(test_seq)
```

* CNN to LSTM auto-encoder with char embedding (CNN to RNN architecture)

``` python
from seq2vec.word2vec import GensimWord2vec
from seq2vec import Seq2VecC2RChar

# load a Gensim word2vec model from word2vec_model_path
word2vec = GensimWord2vec(word2vec_model_path)

transformer = Seq2VecC2RChar(
    word2vec_model=word2vec,
    max_index=1000,
    max_length=20,
    embedding_size=200,
    latent_size=200,
    learning_rate=0.05,
    channel_size=10,
    conv_size=5
)

train_seq = [
    ['我', '有', '一個', '蘋果'],
    ['我', '有', '筆'],
    ['一個', '鳳梨'],
]
test_seq = [
    ['我', '愛', '吃', '鳳梨'],
]
transformer.fit(train_seq)
result = transformer.transform(test_seq)
```

* LSTM to LSTM auto-encoder with hash word embedding (RNN to RNN architecture)

``` python
from seq2vec import Seq2VecR2RHash

transformer = Seq2VecR2RHash(
    max_index=1000,
    max_length=10,
    latent_size=20,
    embedding_size=200,
    encoding_size=300,
    learning_rate=0.05
)

train_seq = [
    ['我', '有', '一個', '蘋果'],
    ['我', '有', '筆'],
    ['一個', '鳳梨'],
]
test_seq = [
    ['我', '愛', '吃', '鳳梨'],
]
transformer.fit(train_seq)
result = transformer.transform(test_seq)
```
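
Note that this variant takes no pretrained word2vec model: judging from the constructor, words are hashed into at most `max_index` embedding indices, so a tokenized training corpus is all it needs.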

### Training with a generator on a file

The example below uses the LSTM to LSTM auto-encoder with word embedding.

Use this training method when memory is a limiting factor for you.

The corpus file should be a tokenized text file with one sequence per line and
tokens separated by whitespace.

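For illustration, a few lines of such a corpus file could look like this (reusing tokens from the examples above):

```
我 有 一個 蘋果
我 有 筆
一個 鳳梨
```
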
``` python
from seq2vec.word2vec import GensimWord2vec

from seq2vec.model import Seq2VecR2RWord
from seq2vec.transformer import WordEmbeddingTransformer
from seq2vec.util import DataGenterator

word2vec = GensimWord2vec(word2vec_model_path)
max_length = 20

transformer = Seq2VecR2RWord(
    word2vec_model=word2vec,
    max_length=max_length,
    latent_size=200,
    encoding_size=300,
    learning_rate=0.05
)

train_data = DataGenterator(
    corpus_for_training_path,
    transformer.input_transformer,
    transformer.output_transformer,
    batch_size=128
)
test_data = DataGenterator(
    corpus_for_validation_path,
    transformer.input_transformer,
    transformer.output_transformer,
    batch_size=128
)

transformer.fit_generator(
    train_data,
    test_data,
    epochs=10,
    batch_number=1250  # the number of batches per epoch
)

transformer.save_model(model_path)  # save your model

# You can reload your model and keep training it.
transformer.load_model(model_path)
transformer.fit_generator(
    train_data,
    test_data,
    epochs=10,
    batch_number=1250  # the number of batches per epoch
)
```
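
After training (or retraining) with the generator, the transformer is used the same way as in the in-memory examples above:

``` python
result = transformer.transform([['我', '愛', '吃', '鳳梨']])
```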

### Customize your seq2vec model with our auto-encoder framework

You can easily build a customized seq2vec model with our framework.

``` python
import keras
from seq2vec.model import TrainableSeq2VecBase


class YourSeq2Vec(TrainableSeq2VecBase):

    def __init__(
        self,
        max_length,
        latent_size,
        learning_rate
    ):
        # Initialize your settings and set input_transformer and
        # output_transformer. The input and output transformers turn raw
        # sequences into the Keras layer input format.
        # See seq2vec.transformer for more detail.
        self.input_transformer = YourInputTransformer()
        self.output_transformer = YourOutputTransformer()

        # Register your customized layers here.
        self.custom_objects = {}
        self.custom_objects[customized_class_name] = customized_class

        super(YourSeq2Vec, self).__init__(
            max_length,
            latent_size,
            learning_rate
        )

    def create_model(self):
        # Create and compile your model in this function.
        # Return both the full model and the encoder; the encoder is the
        # part that encodes input sequences into vectors.
        model.compile(loss)
        return model, encoder

    def load_model(self, file_path):
        # Load your seq2vec model here and set its attribute values.
        self.model = self.load_customed_model(file_path)
```
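
Once defined, such a model is used like the built-in ones. A minimal sketch (`YourSeq2Vec` and the argument values here are placeholders):

``` python
transformer = YourSeq2Vec(
    max_length=20,
    latent_size=100,
    learning_rate=0.05
)
transformer.fit(train_seq)
result = transformer.transform(test_seq)
```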

## Lint
```
pylint --rcfile=./yoctol-pylintrc/.pylintrc seq2vec
```

## Test
```
python -m unittest
```