Skip to content

Commit 2bb1705

Browse files
committed
Image captioning model
1 parent 2717a2c commit 2bb1705

File tree

4 files changed

+291
-39
lines changed

4 files changed

+291
-39
lines changed

Image-Captioning/image-captioning.py

Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
# Read text captions
2+
def readTextFile(path):
    """Return the whole content of the text file at *path* as one string.

    The file is decoded explicitly as UTF-8 so that the result does not
    depend on the platform's default locale encoding.

    Args:
        path: Location of the text file to read.

    Returns:
        The file content as a single ``str`` (newlines included).

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, encoding="utf-8") as f:
        return f.read()
6+
7+
# Location of captions
8+
captions = readTextFile('files/captions.txt')
# Drop the first (header) row and ignore blank lines, so the last caption is
# kept whether or not the file ends with a newline.
captions = [line for line in captions.splitlines()[1:] if line.strip()]
print(len(captions))  # Total captions

# Creating dictionary - {"image name": ["caption1", "caption2", ...]}
description = {}
for x in captions:
    # Split on the FIRST comma only: the caption text may itself contain
    # commas, and a plain split(',') would cut it off at the first one.
    img_file, sep, coment = x.partition(',')
    if not sep:
        # Line without an "<image>,<caption>" separator: nothing to record.
        continue
    img_name = img_file[:-4]  # drop the 4-character extension (e.g. ".jpg")
    description.setdefault(img_name, []).append(coment)
21+
22+
# All libraries
23+
import numpy as np
import matplotlib.pyplot as plt
import keras
import re
import tensorflow as tf
# NOTE: "tf" is only a local alias and cannot be used as a module path in a
# "from ... import" statement; the real package name must be spelled out.
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from keras.preprocessing import image
from keras.utils import load_img, img_to_array
from keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Embedding, LSTM
# Import `add` from the public layers namespace rather than the
# version-specific internal module `keras.layers.merging`.
from keras.layers import add
36+
37+
# Data cleaning
38+
# Don't remove stopwords: the generated captions must read as meaningful sentences. Stemming is not applied either, because the captions need correctly spelled vocabulary words.
39+
# Remove numbers, lower() , punctuations remove
40+
def clean_text(sentence):
    """Normalise one caption.

    The text is lower-cased, every run of characters other than ``a``-``z``
    is replaced by a single space, and tokens of length one are discarded.
    The surviving tokens are returned joined by single spaces.
    """
    lowered = sentence.lower()
    letters_only = re.sub("[^a-z]+", " ", lowered)
    kept = [token for token in letters_only.split() if len(token) > 1]
    return " ".join(kept)
48+
49+
# clean all captions
50+
# Clean every caption in place (each list object stored in `description` is
# updated, not replaced).
for caption_list in description.values():
    caption_list[:] = [clean_text(caption) for caption in caption_list]

# Every word occurrence across all cleaned captions (duplicates included)
total_words = [
    word
    for caption_list in description.values()
    for caption in caption_list
    for word in caption.split()
]
print(len(total_words))
59+
60+
# Filter words from the vocab according to the certain threshold frequency
61+
import collections

# Occurrence count of every word
counter = collections.Counter(total_words)
freq_cnt = dict(counter)

# (word, count) pairs ordered from most to least frequent
sorted_freq_cnt = sorted(freq_cnt.items(), key=lambda pair: pair[1], reverse=True)

# Keep only the words that occur strictly more often than the threshold
threshold = 5
sorted_freq_cnt = [(word, count) for word, count in sorted_freq_cnt if count > threshold]
total_words = [word for word, _ in sorted_freq_cnt]
72+
73+
# Prepare train/test data
74+
train_filedata = readTextFile("files/Flickr_8k.trainImages.txt")
test_filedata = readTextFile("files/Flickr_8k.testImages.txt")

# One image file name per line; the image id is the part before the first ".".
# Blank lines are skipped, so the last entry is kept whether or not the file
# ends with a newline.
train = [row.split(".")[0] for row in train_filedata.splitlines() if row.strip()]
test = [row.split(".")[0] for row in test_filedata.splitlines() if row.strip()]
79+
80+
# Prepare description for the training data
81+
# Tweak - wrap every training caption in the 'startseq' and 'endseq' marker tokens
82+
# image id -> list of its captions, each wrapped as "startseq <caption> endseq"
train_description = {
    img_id: ["startseq " + cap + " endseq" for cap in description[img_id]]
    for img_id in train
}
88+
89+
90+
# Transfer learning
91+
# Step 1. Image feature extraction
92+
# Pretrained ResNet50 (ImageNet weights), used here as an image feature extractor
model = ResNet50(weights='imagenet', input_shape=(224, 224, 3))
model.summary()

# Feature extractor: same input, but the output is taken from the
# second-to-last layer, i.e. only the final layer of ResNet50 is dropped.
new_model = Model(model.input, model.layers[-2].output)
new_model.summary()
97+
98+
def preprocess_img(img):
    """Load the image file at path *img* and prepare it as a ResNet50 input.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis and passed through ResNet50's ``preprocess_input``.
    """
    pixels = img_to_array(load_img(img, target_size=(224, 224)))
    batch = np.expand_dims(pixels, axis=0)  # add the batch axis in front
    # normalisation -> preprocess_input
    return preprocess_input(batch)
105+
106+
def encode_image(img):
    """Return the flattened `new_model` feature vector for the image at path *img*.

    The prediction for the single preprocessed image is reshaped to one
    dimension (presumably 2048 values, matching the input size of the
    captioning model defined later in this file).
    """
    prepared = preprocess_img(img)
    features = new_model.predict(prepared, verbose=0)
    return features.reshape((-1,))
112+
113+
# encode all train images
114+
# image_id --> feature vector extracted from ResNet, for every training image
encoding_train = {
    img_id: encode_image("files/Images/" + img_id + ".jpg") for img_id in train
}

# image_id --> feature vector extracted from ResNet, for every test image
encoding_test = {
    img_id: encode_image("files/Images/" + img_id + ".jpg") for img_id in test
}
130+
131+
132+
# Word <-> integer index lookup tables. Index 0 is left unused so it can
# serve as the padding value; vocabulary words get indices 1..len(total_words).
word_to_idx = {}
idx_to_word = {}
for i, word in enumerate(total_words, start=1):
    word_to_idx[word] = i
    idx_to_word[i] = word

# The two marker tokens take the next two free indices. Deriving them from
# the vocabulary size (instead of hard-coding 2573 / 2574) keeps the indices
# unique and contiguous if the filtered vocabulary changes; for a vocabulary
# of 2572 words this yields exactly 2573 and 2574.
# NOTE(review): other parts of this file default vocab_size to 2574 and so
# still assume a 2572-word vocabulary - confirm if the data set changes.
for offset, token in enumerate(('startseq', 'endseq'), start=1):
    token_idx = len(total_words) + offset
    word_to_idx[token] = token_idx
    idx_to_word[token_idx] = token
141+
142+
# Model training
143+
# RNN model ->
144+
# Find max length of any caption to decide RNN model size
145+
# Largest number of whitespace-separated tokens in any training caption
# (0 when there are no captions at all)
max_len = max(
    (len(cap.split()) for caps in train_description.values() for cap in caps),
    default=0,
)
149+
150+
# Data Loader(generator)
151+
def data_generator(train_description,encoding_train,word_to_idx,max_len,batch_size,vocab_size=2574):
    """Endlessly yield training batches built from image features and captions.

    Each caption is expanded into (partial caption -> next word) samples: for
    every position ``i`` the first ``i`` word indices form the input sequence
    and the word at position ``i`` is the target.

    Args:
        train_description: Mapping image id -> list of caption strings.
        encoding_train: Mapping image id -> image feature vector.
        word_to_idx: Mapping word -> integer index; words missing from it
            are skipped.
        max_len: Length every partial caption is zero-padded to (at the end).
        batch_size: Number of IMAGES (not samples) gathered per batch.
        vocab_size: Number of output classes of the one-hot target.

    Yields:
        ``([image_features, padded_partial_captions], one_hot_next_words)``
        as a tuple, which is the ``(inputs, targets)`` form Keras documents
        for generators passed to ``Model.fit``.
    """
    x1, x2, y = [], [], []
    n = 0

    while True:
        for key, desc_list in train_description.items():
            n += 1
            photo = encoding_train[key]
            for desc in desc_list:
                seq = [word_to_idx[word] for word in desc.split() if word in word_to_idx]
                for i in range(1, len(seq)):
                    # input: first i words (zero-padded), target: word i
                    xi = pad_sequences([seq[:i]], maxlen=max_len, value=0, padding='post')[0]
                    # Word indices start at 1, so shift by one to get a
                    # zero-based class id for the one-hot target.
                    yi = to_categorical([seq[i] - 1], num_classes=vocab_size)[0]
                    x1.append(photo)
                    x2.append(xi)
                    y.append(yi)

            if n == batch_size:
                yield ([np.array(x1), np.array(x2)], np.array(y))
                x1, x2, y = [], [], []
                n = 0
175+
176+
# WORD EMBEDDINGS
177+
# The text data should be embedded before passing to RNN/LSTM layer
178+
# word -> pretrained GloVe vector, read from a text file in which every line
# is "<word> <value> <value> ...".
embedding_index = {}

# The context manager closes the file even if parsing a line fails.
with open("files/glove.6B.50d.txt", encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip empty lines instead of failing on values[0]
            continue
        word = values[0]
        word_embedding = np.array(values[1:], dtype='float')
        embedding_index[word] = word_embedding
186+
187+
def get_embedding_matrix(vocab_size=2574, word_index=None, embeddings=None, emb_dim=50):
    """Build the weight matrix for the embedding layer.

    Row ``idx`` holds the pretrained vector of the word whose index is
    ``idx``; rows of words without a pretrained vector stay all-zero.

    Args:
        vocab_size: Number of rows of the matrix.
        word_index: Mapping word -> row index. Defaults to the module-level
            ``word_to_idx``.
        embeddings: Mapping word -> vector of length ``emb_dim``. Defaults to
            the module-level ``embedding_index``.
        emb_dim: Length of each embedding vector.

    Returns:
        A ``numpy`` array of shape ``(vocab_size, emb_dim)``.
    """
    if word_index is None:
        word_index = word_to_idx
    if embeddings is None:
        embeddings = embedding_index

    matrix = np.zeros((vocab_size, emb_dim))
    for word, idx in word_index.items():
        # An index outside the matrix would raise IndexError (or, if
        # negative, silently wrap around), so such words are ignored.
        if not 0 <= idx < vocab_size:
            continue
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            matrix[idx] = embedding_vector
    return matrix
195+
196+
197+
embedding_matrix = get_embedding_matrix()

vocab_size = 2574
from keras.layers import Input

# Image branch: 2048 image features -> dropout -> 256
input_img_features = Input(shape=(2048,))
input_img1 = Dropout(0.3)(input_img_features)
input_img2 = Dense(256, activation="relu")(input_img1)

# Caption branch: batch_size*max_len -> batch_size*max_len*50 -> 256
input_captions = Input(shape=(max_len,))
# The layer object is kept in a variable so the pretrained weights can be
# loaded into it directly below, without relying on its position in
# `model.layers`. mask_zero=True makes index 0 act as padding.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=50, mask_zero=True)
input_cap1 = embedding_layer(input_captions)
input_cap2 = Dropout(0.3)(input_cap1)
input_cap3 = LSTM(256)(input_cap2)

# Add both branches and decode them into a distribution over the vocabulary
decoder1 = add([input_img2, input_cap3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# COMBINED MODEL
model = Model(inputs=[input_img_features, input_captions], outputs=outputs)

# Load the prepared embedding matrix into the embedding layer and freeze it;
# this is done before compile() so the layer is excluded from training.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
model.compile(loss="categorical_crossentropy", optimizer="adam")

# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()
226+
227+
# Training of Model
228+
# Training of Model
epochs = 10
batch_size = 3  # number of images per batch
steps = len(train_description) // batch_size


def train():
    """Train `model` for `epochs` epochs, saving it after every epoch.

    A fresh data generator is created for each epoch, so every epoch starts
    again from the first training image. The model is written to the same
    file after each epoch, so the file always holds the most recent state.
    """
    import os

    # Saving fails if the target directory is missing, so create it up front.
    os.makedirs("models", exist_ok=True)

    for _ in range(epochs):
        generator = data_generator(train_description, encoding_train, word_to_idx, max_len, batch_size)
        model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
        model.save("models/"+"9"+'.h5')


train()
238+
239+
# Prediction Function
240+
def predict_caption(photo):
    """Generate a caption for one image by greedy decoding.

    Starting from "startseq", the model is asked repeatedly for the most
    probable next word, which is appended to the partial caption. Decoding
    stops when "endseq" is predicted, when the predicted class has no word
    assigned, or after ``max_len`` words.

    Args:
        photo: Image features in the form expected by the model's first
            input (the caller in this file passes an array reshaped to
            ``(1, 2048)``).

    Returns:
        The generated caption without the "startseq"/"endseq" markers.
    """
    in_text = "startseq"
    for _ in range(max_len):
        sequence = [word_to_idx[w] for w in in_text.split() if w in word_to_idx]
        sequence = pad_sequences([sequence], maxlen=max_len, padding='post')
        ypred = model.predict([photo, sequence], verbose=0)
        # Greedy sampling: take the most probable class. Class k stands for
        # word index k + 1 (targets were shifted by one during training).
        word = idx_to_word.get(int(ypred.argmax()) + 1)
        if word is None or word == 'endseq':
            break
        in_text += (' ' + word)
    # Only the leading "startseq" has to be removed: "endseq" is never
    # appended, so a caption that hits max_len keeps its last word.
    return ' '.join(in_text.split()[1:])
254+
255+
256+
# Pick some random images
257+
# Caption 15 randomly chosen test images and display each one
all_img_names = list(encoding_test.keys())  # built once, not per iteration
for _ in range(15):
    # Draw from the actual number of encoded test images rather than a
    # hard-coded 1000, so the index is always valid.
    no = np.random.randint(0, len(all_img_names))
    img_name = all_img_names[no]
    photo_2048 = encoding_test[img_name].reshape((1, 2048))

    caption = predict_caption(photo_2048)

    img = plt.imread("files/Images/" + img_name + ".jpg")
    print(caption)
    plt.imshow(img)
    plt.axis("off")
    plt.show()

Image-Captioning/models/9.h5

18.5 MB
Binary file not shown.

Image-Captioning/readme.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Image Captioning Model
2+
3+
* Uses a pretrained ResNet50 model and GloVe word embeddings to generate captions for images
4+
5+
<pre>
6+
Model ARCHITECTURE
7+
img feature --------> MODEL --> Next word in sequence ----
8+
partial sequence ---> |
9+
| _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
10+
Partial caption ----> RNN
11+
\
12+
\ Feed forward network ----> predicted word,next
13+
/ ending with softmax in the sequence of
14+
/ partial caption
15+
Image vector
16+
</pre>
17+
18+
## <a href="https://towardsdatascience.com/image-captioning-with-keras-teaching-computers-to-describe-pictures-c88a46a311b8">Reference article</a> for this model
19+
20+
## Installation
21+
pip install tensorflow
22+
pip install keras

Keylogger/keylogger.py

Lines changed: 0 additions & 39 deletions
This file was deleted.

0 commit comments

Comments
 (0)