Skip to content

Commit 7e2baa3

Browse files
committed
Initial code for 2nd tutorial "Captcha to text"
1 parent 270e77c commit 7e2baa3

File tree

12 files changed

+872
-19
lines changed

12 files changed

+872
-19
lines changed

CHANGELOG.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
## [0.1.3] - 2022-20-12
1+
## [0.1.4] - 2022-12-21
2+
### Added:
3+
- added mltu.augmentors (RandomBrightness, RandomRotate, RandomErodeDilate) - used for simple image augmentation;
4+
5+
## [0.1.3] - 2022-12-20
26

37
Initial release of mltu (Machine Learning Training Utilities)
48

README.md

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,16 @@
11
# MLTU - Machine Learning Training Utilities (TensorFlow)
22
Machine Learning Training Utilities with TensorFlow 2.* and Python 3
33

4-
## Installation:
5-
Clone the repository and install the requirements:
4+
# Installation:
5+
To use MLTU in your own project, you can install it from PyPI:
66
```bash
7-
git clone https://github.com/pythonlessons/mltu.git
7+
pip install mltu
88
```
9-
cd into the repository
9+
When running a tutorial, install the mltu version that the tutorial was written for, for example:
1010
```bash
11-
cd mltu
12-
```
13-
Install the requirements:
14-
```bash
15-
pip install -r requirements.txt
16-
```
17-
18-
Install the mltu package
19-
```bash
20-
pip install .
11+
pip install mltu==0.1.3
2112
```
13+
Each tutorial has its own requirements.txt file that pins a specific mltu version. As this project progresses, newer versions may introduce breaking changes, so it's recommended to use the same version as in the tutorial.
2214

2315
# Tutorials and Examples:
24-
...
16+
1. [Text Recognition With TensorFlow and CTC network](https://pylessons.com/ctc-text-recognition), code in ```Tutorials\01_image_to_word``` folder;

Tutorials/01_image_to_word/README.md

Lines changed: 505 additions & 1 deletion
Large diffs are not rendered by default.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
mltu==0.1.3
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import stow
2+
from datetime import datetime
3+
4+
from mltu.configs import BaseModelConfigs
5+
6+
class ModelConfigs(BaseModelConfigs):
    """Configuration values for the "captcha to text" tutorial model."""

    def __init__(self):
        super().__init__()

        # Every run writes into its own folder named after the start time
        # (minute resolution).
        run_stamp = datetime.now().strftime("%Y%m%d%H%M")
        self.model_path = stow.join('Models/02_captcha_to_text', run_stamp)

        # Placeholders: the training script overwrites both once it has
        # scanned the dataset.
        self.vocab = ''

        # Image size the training script resizes to and builds the model for.
        self.height = 50
        self.width = 200

        self.max_text_length = 0

        # Training hyper-parameters.
        self.batch_size = 64
        self.learning_rate = 1e-3
        self.train_epochs = 1000
        self.train_workers = 20
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import cv2
2+
import typing
3+
import numpy as np
4+
5+
from mltu.inferenceModel import OnnxInferenceModel
6+
from mltu.utils.text_utils import ctc_decoder, get_cer
7+
8+
class ImageToWordModel(OnnxInferenceModel):
    """ONNX inference model that decodes an image into text.

    Args:
        char_list: Characters handed to ``ctc_decoder`` when decoding predictions.
        *args: Forwarded unchanged to ``OnnxInferenceModel``.
        **kwargs: Forwarded unchanged to ``OnnxInferenceModel``.
    """

    def __init__(self, char_list: typing.Union[str, list], *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.char_list = char_list

    def predict(self, image: np.ndarray):
        """Run the model on a single image and return the decoded text.

        Args:
            image: Image to recognise; it is resized before inference.

        Returns:
            The first entry of the ``ctc_decoder`` output for this image.
        """
        # First two entries of input_shape, reversed, are used as cv2's target size.
        # NOTE(review): presumably input_shape is (height, width, ...) - confirm
        # against OnnxInferenceModel.
        target_size = self.input_shape[:2][::-1]
        resized = cv2.resize(image, target_size)

        # Add a leading batch axis and convert to float32 for the ONNX session.
        batch = np.expand_dims(resized, axis=0).astype(np.float32)

        outputs = self.model.run(None, {self.input_name: batch})

        return ctc_decoder(outputs[0], self.char_list)[0]
23+
24+
25+
if __name__ == "__main__":
    # Evaluate a trained model on its saved validation split and report the
    # character error rate (CER) per image and on average.
    import pandas as pd
    from tqdm import tqdm
    from mltu.configs import BaseModelConfigs

    configs = BaseModelConfigs.load("Models/02_captcha_to_text/202212211205/configs.yaml")

    model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab)

    # Each row of val.csv is unpacked below as (image path, label).
    samples = pd.read_csv("Models/02_captcha_to_text/202212211205/val.csv").values.tolist()

    cer_scores = []
    for image_path, label in tqdm(samples):
        image = cv2.imread(image_path)
        prediction_text = model.predict(image)

        cer = get_cer(prediction_text, label)
        print(f"Image: {image_path}, Label: {label}, Prediction: {prediction_text}, CER: {cer}")

        # resize image by 3 times for visualization
        # image = cv2.resize(image, (image.shape[1] * 3, image.shape[0] * 3))
        # cv2.imshow(prediction_text, image)
        # cv2.waitKey(0)
        # cv2.destroyAllWindows()

        cer_scores.append(cer)

    print(f"Average CER: {np.average(cer_scores)}")
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from keras import layers
2+
from keras.models import Model
3+
4+
from mltu.model_utils import residual_block
5+
6+
def train_model(input_dim, output_dim, activation='leaky_relu', dropout=0.2):
    """Build the captcha recognition network.

    The network is a stack of residual blocks followed by a bidirectional LSTM
    and a per-step softmax.

    Args:
        input_dim: Shape of a single input image, passed to ``layers.Input``.
        output_dim: Number of output classes; the final layer has
            ``output_dim + 1`` units.
        activation: Activation name forwarded to every residual block.
        dropout: Dropout rate used in the residual blocks and after the LSTM.

    Returns:
        The (uncompiled) Keras ``Model``.
    """
    inputs = layers.Input(shape=input_dim, name="input")

    # normalize images here instead in preprocessing step
    normalized = layers.Lambda(lambda x: x / 255)(inputs)

    # (filters, skip_conv, strides) for each residual block, in order.
    block_specs = (
        (16, True, 1),
        (16, True, 2),
        (16, False, 1),
        (32, True, 2),
        (32, False, 1),
        (64, True, 2),
        (32, True, 1),
        (64, True, 2),
        (64, False, 1),
    )

    features = normalized
    for filters, skip_conv, strides in block_specs:
        features = residual_block(
            features,
            filters,
            activation=activation,
            skip_conv=skip_conv,
            strides=strides,
            dropout=dropout,
        )

    # Merge the two axes before the last one into a single sequence axis for the LSTM.
    sequence_shape = (features.shape[-3] * features.shape[-2], features.shape[-1])
    squeezed = layers.Reshape(sequence_shape)(features)

    blstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(squeezed)
    blstm = layers.Dropout(dropout)(blstm)

    output = layers.Dense(output_dim + 1, activation='softmax', name="output")(blstm)

    return Model(inputs=inputs, outputs=output)
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import tensorflow as tf
2+
# Enable memory growth on every visible GPU. This stays best-effort (a failure
# must not stop the script), but the reason is reported instead of being
# silently discarded, and KeyboardInterrupt/SystemExit are no longer swallowed.
try:
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)
except Exception as error:
    print(f"Could not enable GPU memory growth: {error}")
4+
5+
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
6+
7+
from mltu.dataProvider import DataProvider
8+
from mltu.preprocessors import ImageReader
9+
from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding
10+
from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate
11+
from mltu.losses import CTCloss
12+
from mltu.callbacks import Model2onnx, TrainLogger
13+
from mltu.metrics import CWERMetric
14+
15+
from model import train_model
16+
from configs import ModelConfigs
17+
18+
import stow
19+
from urllib.request import urlopen
20+
from io import BytesIO
21+
from zipfile import ZipFile
22+
23+
def download_and_unzip(url, extract_to='Datasets'):
    """Download a zip archive and extract all of its members.

    The archive is read fully into memory before it is extracted.

    Args:
        url: Location of the zip archive, opened with ``urlopen``.
        extract_to: Directory the archive members are extracted into.
    """
    # Context managers make sure the connection and the archive are closed
    # even when reading or extraction fails.
    with urlopen(url) as http_response:
        payload = BytesIO(http_response.read())

    with ZipFile(payload) as archive:
        archive.extractall(path=extract_to)
27+
28+
# Download the captcha dataset only when it is not already on disk
if not stow.exists(stow.join('Datasets', 'captcha_images_v2')):
    download_and_unzip('https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip', extract_to='Datasets')

# Build [image path, label] pairs; the label of every image is taken from file.name.
# Along the way collect the set of characters and the longest label length.
dataset, vocab, max_len = [], set(), 0
for file in stow.ls(stow.join('Datasets', 'captcha_images_v2')):
    dataset.append([stow.relpath(file), file.name])
    vocab.update(file.name)
    max_len = max(max_len, len(file.name))

configs = ModelConfigs()

# Save vocab and maximum text length to configs.
# The characters are sorted because iterating a set of strings can yield a
# different order on every run, which would make the character -> index
# mapping differ between otherwise identical runs.
configs.vocab = "".join(sorted(vocab))
configs.max_text_length = max_len
configs.save()

data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=[ImageReader()],
    transformers=[
        ImageResizer(configs.width, configs.height),
        LabelIndexer(configs.vocab),
        # Labels are padded with len(vocab), one past the last vocab index
        LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab))
    ],
)

train_data_provider, val_data_provider = data_provider.split()

# Augmentation is applied to the training split only
train_data_provider.augmentors = [RandomBrightness(), RandomRotate(), RandomErodeDilate()]

model = train_model(
    input_dim = (configs.height, configs.width, 3),
    output_dim = len(configs.vocab),
)

# Compile the model and print summary
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate),
    loss=CTCloss(),
    metrics=[CWERMetric()],
    run_eagerly=False
)
model.summary(line_length=110)

# Create the directory the model and logs are saved to
stow.mkdir(configs.model_path)

# Define callbacks
earlystopper = EarlyStopping(monitor='val_CER', patience=40, verbose=1)
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor='val_CER', verbose=1, save_best_only=True, mode='min')
trainLogger = TrainLogger(configs.model_path)
tb_callback = TensorBoard(f'{configs.model_path}/logs', update_freq=1)
reduceLROnPlat = ReduceLROnPlateau(monitor='val_CER', factor=0.9, min_delta=1e-10, patience=20, verbose=1, mode='auto')
model2onnx = Model2onnx(f"{configs.model_path}/model.h5")

# Train the model
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=configs.train_epochs,
    callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx],
    workers=configs.train_workers
)

# Save training and validation datasets as csv files
train_data_provider.to_csv(stow.join(configs.model_path, 'train.csv'))
val_data_provider.to_csv(stow.join(configs.model_path, 'val.csv'))

mltu/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.1.3"
1+
__version__ = "0.1.4"

mltu/augmentors.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
import cv2
2+
import typing
3+
import numpy as np
4+
5+
""" Implemented augmentors:
6+
- RandomBrightness
7+
- RandomRotate
8+
- RandomErodeDilate
9+
"""
10+
11+
class Augmentor:
    """ Base class that every augmentor should inherit from

    An augmentor is called with an image and its annotation and returns both.
    This base implementation applies no transformation.

    Args:
        image (np.ndarray): Image to augment
        annotation (np.ndarray): Annotation to augment

    Returns:
        typing.Tuple[np.ndarray, np.ndarray]: The image and annotation that were passed in
    """
    def __init__(self, random_chance: float=0.5) -> None:
        """
        Args:
            random_chance (float, optional): Chance of applying the augmentor. Defaults to 0.5.
        """
        self._random_chance = random_chance

    def __call__(self, image: np.ndarray, annotation: np.ndarray) -> typing.Tuple[np.ndarray, np.ndarray]:
        # The random draw is kept even though the base class has nothing to
        # apply, so the global NumPy random state advances exactly as before.
        should_apply = np.random.random() <= self._random_chance
        if should_apply:
            pass  # no transformation in the base class

        return image, annotation
32+
33+
class RandomBrightness(Augmentor):
    """ Randomly adjust image brightness

    Args:
        image (np.ndarray): Image to be adjusted
        annotation (np.ndarray): Annotation, returned unchanged

    Returns:
        image (np.ndarray): Adjusted image
        annotation (np.ndarray): The annotation that was passed in
    """
    def __init__(self, random_chance:float=0.5, delta:int=100)->None:
        """
        Args:
            random_chance (float): Float between 0.0 and 1.0 setting bounds for random probability
            delta (int): Integer value for brightness adjustment, between 0 and 255

        Raises:
            ValueError: If delta is outside the 0..255 range
        """
        # Explicit check instead of assert: asserts are removed when Python
        # runs with -O, which would let invalid values through silently.
        if not 0 <= delta <= 255:
            raise ValueError(f"delta must be between 0 and 255, got {delta}")

        super().__init__(random_chance)
        self._delta = delta

    def __call__(self, image:np.ndarray, annotation:np.ndarray)->typing.Tuple[np.ndarray, np.ndarray]:
        if np.random.rand() <= self._random_chance:

            image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

            # Scale factor drawn from [1 - delta / 255, 1 + delta / 255]
            value = 1 + np.random.uniform(-self._delta, self._delta) / 255

            # Work in float32 so the multiplication cannot wrap around in uint8
            hsv = np.array(image, dtype = np.float32)

            # Channels 1 and 2 of the HSV image are both scaled; channel 0 is left as is
            hsv[:, :, 1] = hsv[:, :, 1] * value
            hsv[:, :, 2] = hsv[:, :, 2] * value

            hsv = np.uint8(np.clip(hsv, 0, 255))

            image = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

        return image, annotation
73+
74+
class RandomRotate(Augmentor):
    """ Randomly rotate image

    Args:
        image (np.ndarray): Image to be rotated
        annotation (np.ndarray): Annotation to be rotated

    Returns:
        image (np.ndarray): Rotated image
        annotation (np.ndarray): Rotated annotation (returned unchanged when it is a string)
    """
    def __init__(self, random_chance:float=0.5, angle:int=10, borderValue:typing.Tuple[int, int, int]=(255, 255, 255))->None:
        """
        Args:
            random_chance (float): Float between 0.0 and 1.0 setting bounds for random probability
            angle (int): Integer value for rotation angle, in degrees
            borderValue (tuple): Tuple of 3 integers, setting border color for image rotation
        """
        super().__init__(random_chance)
        self._angle = angle
        self._borderValue = borderValue

    def __call__(self, image:np.ndarray, annotation:np.ndarray)->typing.Tuple[np.ndarray, np.ndarray]:
        if np.random.rand() <= self._random_chance:

            # Rotation angle drawn from [-angle, angle]
            angle = np.random.uniform(-self._angle, self._angle)

            # Only the first two axes are read, so 2-D (single channel) images
            # are accepted as well as images with a channel axis.
            h, w = image.shape[:2]

            # Rotate around the image centre without scaling
            m = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1)
            image = cv2.warpAffine(image, m, (w, h), borderValue=self._borderValue)

            # Check if annotation is image mask
            if not isinstance(annotation, str):
                annotation = cv2.warpAffine(annotation, m, (w, h), borderValue=self._borderValue)

        return image, annotation
109+
110+
class RandomErodeDilate(Augmentor):
    """ Randomly erode and dilate image

    Args:
        image (np.ndarray): Image to be eroded and dilated
        annotation: Annotation, returned unchanged

    Returns:
        image (np.ndarray): Eroded or dilated image
        annotation: The annotation that was passed in
    """
    def __init__(self, random_chance:float=0.5, kernel_size:typing.Tuple[int, int]=(1, 1))->None:
        """
        Args:
            random_chance (float): Float between 0.0 and 1.0 setting bounds for random probability
            kernel_size (tuple): Tuple of 2 integers, setting kernel size for erosion and dilation
        """
        # NOTE(review): with the default (1, 1) kernel, erosion and dilation
        # look like a no-op - confirm whether a larger default was intended.
        super().__init__(random_chance)
        self._kernel_size = kernel_size

    def __call__(self, image:np.ndarray, annotation:typing.Any)->typing.Tuple[np.ndarray, typing.Any]:
        if np.random.rand() <= self._random_chance:

            kernel = np.ones(self._kernel_size, np.uint8)

            # A second draw picks erosion or dilation
            if np.random.rand() <= 0.5:
                image = cv2.erode(image, kernel, iterations=1)
            else:
                image = cv2.dilate(image, kernel, iterations=1)

        return image, annotation

0 commit comments

Comments
 (0)