1
# Read text captions
def readTextFile(path, encoding="utf-8"):
    """Return the entire contents of a text file as one string.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file contents as a single ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    with open(path, encoding=encoding) as f:
        return f.read()
6
+
7
# Location of captions
captions = readTextFile('files/captions.txt')
# The first line is skipped (presumably a column header - confirm against the
# data file). splitlines() handles any newline style, and dropping blank lines
# means the result is the same whether or not the file ends with a newline.
captions = [line for line in captions.splitlines()[1:] if line.strip()]
print(len(captions))  # Total captions
11
+
12
# Creating dictionary - {"image name": ["caption1","caption2"...]}
description = {}
for x in captions:
    # Split on the FIRST comma only: the caption text itself may contain
    # commas, and everything after the first one belongs to the caption.
    img_file, sep, coment = x.partition(',')
    if not sep:
        # A line without a separator carries no caption; skip it.
        continue
    img_name = img_file[:-4]  # drop the 4-character file extension
    description.setdefault(img_name, []).append(coment)
21
+
22
+ # All libraries
23
+ import numpy as np
24
+ import matplotlib .pyplot as plt
25
+ import keras
26
+ import re
27
+ import tensorflow as tf
28
+ from tf .keras .applications .resnet50 import ResNet50 ,preprocess_input
29
+ from keras .preprocessing import image
30
+ from keras .utils import load_img ,img_to_array
31
+ from keras .models import Model
32
+ from tf .keras .preprocessing .sequence import pad_sequences
33
+ from keras .utils import to_categorical
34
+ from keras .layers import Dense ,Dropout ,Embedding ,LSTM
35
+ from keras .layers .merging import add
36
+
37
# Data cleaning
# Stopwords are kept because the captions must stay meaningful sentences, and
# stemming is not applied because the generated text needs real vocabulary.
# Cleaning = lowercase, then drop digits, punctuation and one-letter tokens.
def clean_text(sentence):
    """Normalise one caption.

    The text is lowercased, every run of characters other than ``a``-``z`` is
    replaced by a single space, and tokens of length one are discarded.

    Args:
        sentence: Raw caption string.

    Returns:
        The cleaned caption, tokens joined by single spaces.
    """
    letters_only = re.sub("[^a-z]+", " ", sentence.lower())
    kept_tokens = [token for token in letters_only.split() if len(token) > 1]
    return " ".join(kept_tokens)
48
+
49
# Clean all captions, updating each caption list in place so that
# `description` keeps pointing at the same list objects
for caption_list in description.values():
    caption_list[:] = [clean_text(caption) for caption in caption_list]

# Every word occurrence across all cleaned captions (duplicates are kept, so
# the printed length is the total word count, not the vocabulary size)
total_words = []
for caption_list in description.values():
    for caption in caption_list:
        total_words.extend(caption.split())
print(len(total_words))
59
+
60
# Keep only the words that occur often enough in the captions
import collections

counter = collections.Counter(total_words)
freq_cnt = dict(counter)

# (word, count) pairs, most frequent first; ties keep first-seen order
sorted_freq_cnt = sorted(freq_cnt.items(), key=lambda pair: pair[1], reverse=True)

# A word must occur strictly more than `threshold` times to stay in the vocab
threshold = 5
sorted_freq_cnt = [(word, count) for word, count in sorted_freq_cnt if count > threshold]
# From here on `total_words` is the filtered vocabulary, ordered by frequency
total_words = [word for word, _ in sorted_freq_cnt]
72
+
73
# Prepare train/test data
train_filedata = readTextFile("files/Flickr_8k.trainImages.txt")
test_filedata = readTextFile("files/Flickr_8k.testImages.txt")

# One image file name per line; keep the part before the first "." as the id.
# splitlines() copes with any newline style, and blank lines are ignored so a
# missing trailing newline cannot drop the last image.
train = [row.split(".")[0] for row in train_filedata.splitlines() if row.strip()]
test = [row.split(".")[0] for row in test_filedata.splitlines() if row.strip()]
79
+
80
# Captions for the training images only.
# Each caption is wrapped in the markers "startseq" ... "endseq" so the
# sequence has an explicit beginning and end.
train_description = {
    img_id: ["startseq " + cap + " endseq" for cap in description[img_id]]
    for img_id in train
}
88
+
89
+
90
# Transfer learning
# Step 1. Image feature extraction with a pretrained ResNet50
model = ResNet50(weights='imagenet', input_shape=(224, 224, 3))
model.summary()

# Feature extractor: same input as ResNet50, but its output is taken from the
# second-to-last layer, i.e. only the final layer is left out.
penultimate_layer = model.layers[-2]
new_model = Model(inputs=model.input, outputs=penultimate_layer.output)
new_model.summary()
97
+
98
def preprocess_img(img):
    """Load an image file and turn it into a single-image model input batch.

    Args:
        img: Path of the image file, handed to ``load_img``.

    Returns:
        The image resized to 224x224, converted to an array, given a leading
        batch axis, and normalised with ``preprocess_input``.
    """
    pixels = img_to_array(load_img(img, target_size=(224, 224)))
    batch = np.expand_dims(pixels, axis=0)  # add the batch axis
    return preprocess_input(batch)
105
+
106
def encode_image(img):
    """Return the feature vector of one image as a flat 1-D array.

    Args:
        img: Path of the image file.

    Returns:
        The output of ``new_model`` for the preprocessed image, flattened to
        one dimension.
    """
    batch = preprocess_img(img)
    features = new_model.predict(batch, verbose=0)
    return features.reshape(-1)
112
+
113
# Encode every train and test image: image_id --> feature vector
encoding_train = {}
encoding_test = {}
for image_ids, encodings in ((train, encoding_train), (test, encoding_test)):
    for img_id in image_ids:
        encodings[img_id] = encode_image("files/Images/" + img_id + ".jpg")
130
+
131
+
132
# Vocabulary lookup tables. Index 0 is left unused because the caption
# sequences are padded with 0.
word_to_idx = {}
idx_to_word = {}
for i, word in enumerate(total_words, start=1):
    word_to_idx[word] = i
    idx_to_word[i] = word

# The sequence markers take the next two free indices after the vocabulary.
# Deriving them from the vocabulary size (instead of hard-coding 2573/2574)
# keeps them collision-free whatever the frequency threshold produced; with
# 2572 vocabulary words the values are exactly 2573 and 2574.
for token in ('startseq', 'endseq'):
    token_idx = len(idx_to_word) + 1
    word_to_idx[token] = token_idx
    idx_to_word[token_idx] = token
141
+
142
# Model training
# The longest training caption (counted in whitespace-separated tokens,
# markers included) fixes the sequence length fed to the RNN.
max_len = max(
    (len(cap.split()) for caps in train_description.values() for cap in caps),
    default=0,
)
149
+
150
# Data Loader(generator)
def data_generator(train_description, encoding_train, word_to_idx, max_len, batch_size, vocab_size=2574):
    """Endlessly yield training batches for the captioning model.

    Each caption is expanded into one sample per next-word prediction: the
    input is the image feature vector plus the caption prefix (padded with 0
    to ``max_len``), the target is the one-hot encoded next word.

    Args:
        train_description: Mapping image id -> list of caption strings.
        encoding_train: Mapping image id -> image feature vector.
        word_to_idx: Mapping word -> integer index; unknown words are skipped.
        max_len: Length every caption prefix is padded to.
        batch_size: Number of IMAGES (not samples) per yielded batch.
        vocab_size: Number of classes of the one-hot target.

    Yields:
        ``((image_features, caption_prefixes), targets)`` as NumPy arrays.
        A tuple is yielded because Keras expects generator batches to be
        tuples; positional indexing works as before.

    Raises:
        ValueError: If ``train_description`` is empty (the loop below would
            otherwise spin forever without yielding).
        KeyError: If an image id has no entry in ``encoding_train``.
    """
    if not train_description:
        raise ValueError("train_description is empty; nothing to generate")

    x1, x2, y = [], [], []
    n = 0  # images accumulated in the current batch

    while True:
        for key, desc_list in train_description.items():
            n += 1
            photo = encoding_train[key]
            for desc in desc_list:
                seq = [word_to_idx[word] for word in desc.split() if word in word_to_idx]
                for i in range(1, len(seq)):
                    # Prefix seq[:i] predicts word seq[i]
                    xi = pad_sequences([seq[:i]], maxlen=max_len, value=0, padding='post')[0]
                    # Targets are shifted by one: class k stands for word index k + 1
                    yi = to_categorical([seq[i] - 1], num_classes=vocab_size)[0]
                    x1.append(photo)
                    x2.append(xi)
                    y.append(yi)

            if n == batch_size:
                yield (np.array(x1), np.array(x2)), np.array(y)
                x1, x2, y = [], [], []
                n = 0
175
+
176
# WORD EMBEDDINGS
# The text data should be embedded before passing to RNN/LSTM layer
# Each line of the embeddings file is: a word followed by its vector values.
embedding_index = {}

# `with` guarantees the file is closed even if a line fails to parse
with open("files/glove.6B.50d.txt", encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # skip blank lines
        word = values[0]
        word_embedding = np.array(values[1:], dtype='float')
        embedding_index[word] = word_embedding
186
+
187
def get_embedding_matrix(vocab_size=2574, emb_dim=50):
    """Build the embedding matrix for the vocabulary in ``word_to_idx``.

    Row ``idx`` holds the pretrained vector of the word whose index is
    ``idx``. Words without an entry in ``embedding_index`` keep an all-zero
    row.

    Args:
        vocab_size: Number of rows of the matrix.
        emb_dim: Length of each embedding vector; must match the vectors
            stored in ``embedding_index``.

    Returns:
        A ``(vocab_size, emb_dim)`` NumPy array.
    """
    matrix = np.zeros((vocab_size, emb_dim))
    for word, idx in word_to_idx.items():
        embedding_vector = embedding_index.get(word)
        # An index at or beyond vocab_size has no row in the matrix; skip it
        # instead of failing with an IndexError.
        if embedding_vector is not None and idx < vocab_size:
            matrix[idx] = embedding_vector
    return matrix
195
+
196
+
197
embedding_matrix = get_embedding_matrix()

vocab_size = 2574
from keras.layers import Input

# Image branch: feature vector -> dropout -> 256-unit dense layer
input_img_features = Input(shape=(2048,))
input_img1 = Dropout(0.3)(input_img_features)
input_img2 = Dense(256, activation="relu")(input_img1)

# Caption branch: word indices -> 50-d embedding -> dropout -> LSTM(256)
input_captions = Input(shape=(max_len,))
# Keep a handle on the embedding layer so the pretrained weights can be
# loaded into it directly, instead of locating it by position in model.layers.
caption_embedding = Embedding(input_dim=vocab_size, output_dim=50, mask_zero=True)
input_cap1 = caption_embedding(input_captions)
input_cap2 = Dropout(0.3)(input_cap1)
input_cap3 = LSTM(256)(input_cap2)

# Add both branches and decode them into a distribution over the vocabulary
decoder1 = add([input_img2, input_cap3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# COMBINED MODEL
model = Model(inputs=[input_img_features, input_captions], outputs=outputs)

# Load the pretrained vectors into the embedding layer and freeze it so that
# training does not change them.
caption_embedding.set_weights([embedding_matrix])
caption_embedding.trainable = False
model.compile(loss="categorical_crossentropy", optimizer="adam")

# summary() prints by itself; wrapping it in print() only adds a stray "None"
model.summary()
226
+
227
# Training of Model
epochs = 10
batch_size = 3  # number of images per batch
steps = len(train_description) // batch_size


def train():
    """Train the caption model and save it to ``models/9.h5``.

    Runs ``epochs`` passes of ``steps`` batches each, with a fresh data
    generator per pass, and saves the model after every pass (each save
    overwrites the previous one).
    """
    import os

    # Saving would fail if the target directory were missing, losing the
    # epoch that has just been trained, so create it up front.
    os.makedirs("models", exist_ok=True)
    checkpoint_path = "models/" + "9" + '.h5'

    for _ in range(epochs):
        generator = data_generator(train_description, encoding_train, word_to_idx, max_len, batch_size)
        model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
        model.save(checkpoint_path)


train()
238
+
239
# Prediction Function
def predict_caption(photo):
    """Generate a caption for one image by greedy decoding.

    Starting from "startseq", the most probable next word is appended
    repeatedly until "endseq" is predicted or ``max_len`` words have been
    generated.

    Args:
        photo: Image features in the shape the model's image input expects
            (the caller in this file passes a ``(1, 2048)`` array).

    Returns:
        The generated words joined by spaces, without the "startseq" and
        "endseq" markers.
    """
    words = ["startseq"]
    for _ in range(max_len):
        sequence = [word_to_idx[w] for w in words if w in word_to_idx]
        sequence = pad_sequences([sequence], maxlen=max_len, padding='post')
        ypred = model.predict([photo, sequence], verbose=0)
        # Greedy sampling: take the most probable class. Classes are shifted
        # by one relative to word indices, hence the + 1.
        word = idx_to_word[int(ypred.argmax()) + 1]
        if word == 'endseq':
            break
        words.append(word)
    # Only the leading marker is dropped: "endseq" is never appended, so the
    # last real word survives even when decoding stops at max_len.
    return ' '.join(words[1:])
254
+
255
+
256
# Pick some random test images and show each with its predicted caption
all_img_names = list(encoding_test.keys())  # built once, not per iteration
for _ in range(15):
    # Draw from however many test images exist rather than assuming 1000
    no = np.random.randint(0, len(all_img_names))
    img_name = all_img_names[no]
    photo_2048 = encoding_test[img_name].reshape((1, 2048))

    caption = predict_caption(photo_2048)

    img = plt.imread("files/Images/" + img_name + ".jpg")
    print(caption)
    plt.imshow(img)
    plt.axis("off")
    plt.show()
0 commit comments