
Commit 85b57d1

Merge pull request #22 from Yoctol/seq2vec-0.4.0
Seq2vec 0.4.0
2 parents 89aebb4 + a3500d0 commit 85b57d1


52 files changed: +1867 −1784 lines

.gitignore

Lines changed: 8 additions & 0 deletions
@@ -87,3 +87,11 @@ ENV/
 
 # Rope project settings
 .ropeproject
+
+# model files
+*.h5
+*.model
+*_best
+
+#dict
+*.dict

README.md

Lines changed: 219 additions & 25 deletions
@@ -1,6 +1,7 @@
-# seq2vec
+# seq2vec 0.4.0
 Turn a sequence of words into a fixed-length representation vector
 
+This version refactors all the seq2vec structures to use custom layers from yklz.
 
 ## Install
 ```
@@ -17,9 +18,9 @@ python setup.py install
 
 Simple hash:
 ```python
-from seq2vec import HashSeq2Vec
+from seq2vec import Seq2VecHash
 
-transformer = HashSeq2Vec(vector_length=100)
+transformer = Seq2VecHash(vector_length=100)
 seqs = [
     ['', '', '一個', '蘋果'],
     ['', '', 'pineapple'],
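For intuition, the hash transformer needs no training: each token is hashed into one of `vector_length` buckets, so a sequence becomes a fixed-length indicator vector. Here is a minimal sketch of that idea (the hashlib-based bucket function is our own illustration, not necessarily the library's implementation):

```python
import hashlib

import numpy as np

def hash_to_vector(seq, vector_length=100):
    # Hash each token into a bucket of a fixed-length vector.
    vec = np.zeros(vector_length)
    for token in seq:
        bucket = int(hashlib.md5(token.encode('utf-8')).hexdigest(), 16)
        vec[bucket % vector_length] = 1.0
    return vec

hash_to_vector(['一個', '蘋果'])  # fixed-length vector with (up to) two buckets set
```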
@@ -46,36 +47,229 @@ array([[ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
 '''
 ```
 
-TFIDF:
-```python
-
-```
-
 Sequence-to-sequence auto-encoder:
-```python
 
-```
+* LSTM to LSTM auto-encoder with word embedding (RNN to RNN architecture)
 
-Seq2Seq
-```python
-from seq2vec import Seq2SeqAutoEncoderUseWordHash
+```python
+from seq2vec.word2vec import GensimWord2vec
+from seq2vec import Seq2VecR2RWord
+
+# load Gensim word2vec from word2vec_model_path
+word2vec = GensimWord2vec(word2vec_model_path)
+
+transformer = Seq2VecR2RWord(
+    word2vec_model=word2vec,
+    max_length=20,
+    latent_size=300,
+    encoding_size=300,
+    learning_rate=0.05
+)
+
+train_seq = [
+    ['', '', '一個', '蘋果'],
+    ['', '', ''],
+    ['一個', '鳳梨'],
+]
+test_seq = [
+    ['', '', '', '鳳梨'],
+]
+transformer.fit(train_seq)
+result = transformer.transform(test_seq)
+```
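The encodings returned by `transform` can be compared directly. Assuming `transform` returns a 2-D numpy array with one row per input sequence (as the hash example's output suggests), a cosine-similarity check is a quick sanity test; this hypothetical snippet continues from the block above:

```python
import numpy as np

def cosine_similarity(a, b):
    # Cosine similarity between two encoding vectors.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Compare the test-sequence encoding against each training encoding.
train_vectors = transformer.transform(train_seq)
for i, vec in enumerate(train_vectors):
    print(i, cosine_similarity(result[0], vec))
```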
+
+* CNN to LSTM auto-encoder with word embedding (CNN to RNN architecture)
 
-transformer = Seq2SeqAutoEncoderUseWordHash(
-    max_index=1000,
-    max_length=10,
-    latent_size=20,
-)
+```python
+from seq2vec.word2vec import GensimWord2vec
+from seq2vec import Seq2VecC2RWord
+
+# load Gensim word2vec from word2vec_model_path
+word2vec = GensimWord2vec(word2vec_model_path)
+
+transformer = Seq2VecC2RWord(
+    word2vec_model=word2vec,
+    max_length=20,
+    latent_size=300,
+    conv_size=5,
+    channel_size=10,
+    learning_rate=0.05,
+)
+
+train_seq = [
+    ['', '', '一個', '蘋果'],
+    ['', '', ''],
+    ['一個', '鳳梨'],
+]
+test_seq = [
+    ['', '', '', '鳳梨'],
+]
+transformer.fit(train_seq)
+result = transformer.transform(test_seq)
+```
 
-train_seq = [
+* CNN to LSTM auto-encoder with char embedding (CNN to RNN architecture)
+
+```python
+from seq2vec.word2vec import GensimWord2vec
+from seq2vec import Seq2VecC2RChar
+
+# load Gensim word2vec from word2vec_model_path
+word2vec = GensimWord2vec(word2vec_model_path)
+
+transformer = Seq2VecC2RChar(
+    word2vec_model=word2vec,
+    max_index=1000,
+    max_length=20,
+    embedding_size=200,
+    latent_size=200,
+    learning_rate=0.05,
+    channel_size=10,
+    conv_size=5
+)
+
+train_seq = [
     ['', '', '一個', '蘋果'],
     ['', '', ''],
     ['一個', '鳳梨'],
-]
-test_seq = [
+]
+test_seq = [
     ['', '', '', '鳳梨'],
-]
-transformer.fit(train_seq)
-result = transformer.transform(test_seq)
+]
+transformer.fit(train_seq)
+result = transformer.transform(test_seq)
+```
+
+* LSTM to LSTM auto-encoder with hash word embedding (RNN to RNN architecture)
+
+```python
+from seq2vec import Seq2VecR2RHash
+
+transformer = Seq2VecR2RHash(
+    max_index=1000,
+    max_length=10,
+    latent_size=20,
+    embedding_size=200,
+    encoding_size=300,
+    learning_rate=0.05
+)
+
+train_seq = [
+    ['', '', '一個', '蘋果'],
+    ['', '', ''],
+    ['一個', '鳳梨'],
+]
+test_seq = [
+    ['', '', '', '鳳梨'],
+]
+transformer.fit(train_seq)
+result = transformer.transform(test_seq)
+```
+
+### Training with generator on file
+
+We provide an example with the LSTM to LSTM auto-encoder (word embedding).
+
+Use the following training method when memory is too limited to hold the whole corpus.
+
+The corpus should be a tokenized txt file with one whitespace-separated sequence
+per line.
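For example, a corpus file of three sequences might look like this (hypothetical content reusing tokens from the examples above; any whitespace-tokenized lines work):

```
一個 蘋果
一個 鳳梨 pineapple
蘋果
```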
+
+```python
+from seq2vec.word2vec import GensimWord2vec
+
+from seq2vec.model import Seq2VecR2RWord
+from seq2vec.transformer import WordEmbeddingTransformer
+from seq2vec.util import DataGenterator
+
+word2vec = GensimWord2vec(word2vec_model_path)
+max_length = 20
+
+transformer = Seq2VecR2RWord(
+    word2vec_model=word2vec,
+    max_length=max_length,
+    latent_size=200,
+    encoding_size=300,
+    learning_rate=0.05
+)
+
+train_data = DataGenterator(
+    corpus_for_training_path,
+    transformer.input_transformer,
+    transformer.output_transformer,
+    batch_size=128
+)
+test_data = DataGenterator(
+    corpus_for_validation_path,
+    transformer.input_transformer,
+    transformer.output_transformer,
+    batch_size=128
+)
+
+transformer.fit_generator(
+    train_data,
+    test_data,
+    epochs=10,
+    batch_number=1250  # number of batches per epoch
+)
+
+transformer.save_model(model_path)  # save your model
+
+# You can reload your model and retrain it.
+transformer.load_model(model_path)
+transformer.fit_generator(
+    train_data,
+    test_data,
+    epochs=10,
+    batch_number=1250  # number of batches per epoch
+)
+```
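After training (or reloading), the fitted transformer encodes new tokenized sequences just as in the in-memory examples. Assuming `transform` returns one row per sequence, the result here would have shape `(2, 300)` given `encoding_size=300` (hypothetical sequences; tokens should be covered by the word2vec vocabulary):

```python
new_seqs = [
    ['一個', '蘋果'],
    ['一個', '鳳梨'],
]
vectors = transformer.transform(new_seqs)
print(vectors.shape)  # expected: (2, 300), one encoding per sequence
```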
+
+### Customize your seq2vec model with our auto-encoder framework
+
+You can customize your seq2vec model easily with our framework.
+
+```python
+import keras
+from seq2vec.model import TrainableSeq2VecBase
+
+class YourSeq2Vec(TrainableSeq2VecBase):
+
+    def __init__(
+        self,
+        max_length,
+        latent_size,
+        learning_rate
+    ):
+        # Initialize your settings and set input_transformer and
+        # output_transformer. The transformers turn raw sequences
+        # into the Keras layer input/output format; see
+        # seq2vec.transformer for more detail.
+        self.input_transformer = YourInputTransformer()
+        self.output_transformer = YourOutputTransformer()
+
+        # Register your customized layers so they can be reloaded.
+        self.custom_objects = {}
+        self.custom_objects[customized_class_name] = customized_class
+
+        super(YourSeq2Vec, self).__init__(
+            max_length,
+            latent_size,
+            learning_rate
+        )
+
+    def create_model(self):
+        # Create and compile your model in this function.
+        # Return both the full model and the encoder; the encoder
+        # is the part that encodes input sequences.
+        model.compile(loss)
+        return model, encoder
+
+    def load_model(self, file_path):
+        # Load your seq2vec model here and set its attribute values.
+        self.model = self.load_customed_model(file_path)
 ```
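To make the skeleton concrete, here is a hedged sketch of a `create_model` for a word-embedding LSTM autoencoder, built with the plain Keras functional API. The 300-dimensional input, the RepeatVector decoding scheme, and the optimizer/loss are our illustrative choices, not the library's implementation:

```python
from keras.models import Model
from keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense

def create_model(self):
    # Encoder: (max_length, 300) word-vector sequences -> latent vector.
    inputs = Input(shape=(self.max_length, 300))
    encoded = LSTM(self.latent_size)(inputs)

    # Decoder: reconstruct the input sequence from the latent vector.
    repeated = RepeatVector(self.max_length)(encoded)
    decoded = LSTM(self.latent_size, return_sequences=True)(repeated)
    outputs = TimeDistributed(Dense(300))(decoded)

    model = Model(inputs, outputs)
    model.compile(optimizer='rmsprop', loss='mean_squared_error')

    # The encoder alone maps input sequences to their encodings.
    encoder = Model(inputs, encoded)
    return model, encoder
```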
 
 ## Lint
@@ -86,6 +280,6 @@ pylint --rcfile=./yoctol-pylintrc/.pylintrc seq2vec
 
 ## Test
 ```
-python setup.py test
+python -m unittest
 ```

circle.yml

Lines changed: 1 addition & 1 deletion
@@ -17,4 +17,4 @@ dependencies:
 
 test:
   override:
-    - python setup.py test
+    - python -m unittest

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -5,3 +5,4 @@ keras
 yoctol_utils
 gensim
 h5py
+yoctol_keras_layer_zoo

seq2vec/__init__.py

Lines changed: 7 additions & 5 deletions
@@ -1,5 +1,7 @@
-from .hash_text import HashSeq2Vec
-from .seq2seq_auto_encoder import Seq2SeqAutoEncoderUseWordHash
-from .seq2seq_word2vec import Seq2SeqWord2Vec
-from .seq2seq_cnn3D import Seq2SeqCNN
-from .seq2seq_char2vec import Seq2SeqChar2vec
+from .model import Seq2VecC2RChar
+from .model import Seq2VecR2RHash
+from .model import Seq2VecC2RWord
+from .model import Seq2VecR2RWord
+from .model import Seq2VecBase
+from .model import Seq2VecHash
+from .model import TrainableSeq2VecBase
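For downstream code, the rename shown in the README diff above means 0.3.x imports need updating; for example:

```python
# seq2vec 0.3.x
from seq2vec import HashSeq2Vec
transformer = HashSeq2Vec(vector_length=100)

# seq2vec 0.4.0
from seq2vec import Seq2VecHash
transformer = Seq2VecHash(vector_length=100)
```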
