Initial code for 2nd tutorial "Captcha to text"

pythonlessons · pythonlessons · commit 7e2baa38cc9a · 2022-12-21T15:59:29.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,8 @@
-## [0.1.3] - 2022-20-12
+## [0.1.4] - 2022-12-21
+### Added:
+- added mltu.augmentors (RandomBrightness, RandomRotate, RandomErodeDilate) - used for simple image augmentation;
+
+## [0.1.3] - 2022-12-20
 
 Initial release of mltu (Machine Learning Training Utilities)
 
diff --git a/README.md b/README.md
@@ -1,24 +1,16 @@
 # MLTU - Machine Learning Training Utilities (TensorFlow)
 Machine Learning Training Utilities with TensorFlow 2.* and Python 3
 
-## Installation:
-Clone the repository and install the requirements:
+# Installation:
+To use MLTU in your own project, you can install it from PyPI:
 ```bash
-git clone https://github.com/pythonlessons/mltu.git
+pip install mltu
 ```
-cd into the repository
+When running a tutorial, install the mltu version that the tutorial was written for, for example:
 ```bash
-cd mltu
-```
-Install the requirements:
-```bash
-pip install -r requirements.txt
-```
-
-Install the mltu package
-```bash
-pip install .
+pip install mltu==0.1.3
 ```
+Each tutorial has its own requirements.txt file pinned to a specific mltu version. As this project progresses, newer versions may introduce breaking changes, so it's recommended to use the same version as the tutorial.
 
 # Tutorials and Examples:
-...
+1. [Text Recognition With TensorFlow and CTC network](https://pylessons.com/ctc-text-recognition), code in ```Tutorials\01_image_to_word``` folder;
diff --git a/Tutorials/01_image_to_word/README.md b/Tutorials/01_image_to_word/README.md
diff --git a/Tutorials/01_image_to_word/requiremenets.txt b/Tutorials/01_image_to_word/requiremenets.txt
@@ -0,0 +1 @@
+mltu==0.1.3
diff --git a/Tutorials/02_captcha_to_text/configs.py b/Tutorials/02_captcha_to_text/configs.py
@@ -0,0 +1,17 @@
+import stow
+from datetime import datetime
+
+from mltu.configs import BaseModelConfigs
+
+class ModelConfigs(BaseModelConfigs):
+    """Settings for the captcha-to-text tutorial; model_path is a timestamped path under Models/02_captcha_to_text."""
+    def __init__(self):
+        super().__init__()
+        run_stamp = datetime.now().strftime("%Y%m%d%H%M")  # minute resolution
+        self.model_path = stow.join('Models/02_captcha_to_text', run_stamp)
+        self.vocab = ''
+        self.height, self.width = 50, 200
+        self.max_text_length = 0
+        self.batch_size = 64
+        self.learning_rate = 1e-3
+        self.train_epochs, self.train_workers = 1000, 20
diff --git a/Tutorials/02_captcha_to_text/inferenceModel.py b/Tutorials/02_captcha_to_text/inferenceModel.py
@@ -0,0 +1,53 @@
+import cv2
+import typing
+import numpy as np
+
+from mltu.inferenceModel import OnnxInferenceModel
+from mltu.utils.text_utils import ctc_decoder, get_cer
+
+class ImageToWordModel(OnnxInferenceModel):
+    """ONNX inference wrapper that turns a captcha image into its decoded text."""
+    def __init__(self, char_list: typing.Union[str, list], *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.char_list = char_list
+
+    def predict(self, image: np.ndarray):
+        """Resize ``image`` to the model input size, run the model and return the CTC-decoded text."""
+        # cv2.resize takes dsize as (width, height); assumes input_shape starts with (height, width) - TODO confirm
+        target_size = self.input_shape[:2][::-1]
+        batch = np.expand_dims(cv2.resize(image, target_size), axis=0).astype(np.float32)
+        # Only the first model output is handed to the decoder
+        outputs = self.model.run(None, {self.input_name: batch})
+        # The batch holds a single image, so the first decoded entry is the answer
+        return ctc_decoder(outputs[0], self.char_list)[0]
+
+
+if __name__ == "__main__":
+    import pandas as pd
+    from tqdm import tqdm
+    from mltu.configs import BaseModelConfigs
+    # Load the configuration file of one specific training run (the path is hard-coded to that run)
+    configs = BaseModelConfigs.load("Models/02_captcha_to_text/202212211205/configs.yaml")
+    # Build the inference model from the stored model path and vocabulary
+    model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab)
+    # Each row of val.csv is unpacked below as (image_path, label)
+    df = pd.read_csv("Models/02_captcha_to_text/202212211205/val.csv").values.tolist()
+    # Character error rate of every evaluated sample, averaged at the end
+    accum_cer = []
+    for image_path, label in tqdm(df):
+        image = cv2.imread(image_path)
+
+        prediction_text = model.predict(image)
+
+        cer = get_cer(prediction_text, label)
+        print(f"Image: {image_path}, Label: {label}, Prediction: {prediction_text}, CER: {cer}")
+
+        # Optional visualization: uncomment to show each image enlarged 3 times, titled with its prediction
+        # image = cv2.resize(image, (image.shape[1] * 3, image.shape[0] * 3))
+        # cv2.imshow(prediction_text, image)
+        # cv2.waitKey(0)
+        # cv2.destroyAllWindows()
+
+        accum_cer.append(cer)
+
+    print(f"Average CER: {np.average(accum_cer)}")
diff --git a/Tutorials/02_captcha_to_text/model.py b/Tutorials/02_captcha_to_text/model.py
@@ -0,0 +1,35 @@
+from keras import layers
+from keras.models import Model
+
+from mltu.model_utils import residual_block
+
+def train_model(input_dim, output_dim, activation='leaky_relu', dropout=0.2):
+    """Build the residual CNN + bidirectional LSTM network with a per-step softmax output.
+
+    Args:
+        input_dim: Shape of one input image, passed to ``layers.Input``.
+        output_dim: Number of characters; the output layer has ``output_dim + 1`` units.
+        activation: Activation name forwarded to every ``residual_block``.
+        dropout: Dropout rate used in the residual blocks and after the BiLSTM.
+
+    Returns:
+        A ``keras.models.Model`` mapping the image input to the softmax output.
+    """
+    inputs = layers.Input(shape=input_dim, name="input")
+    # Scale pixel values inside the graph instead of in a preprocessing step
+    features = layers.Lambda(lambda x: x / 255)(inputs)
+
+    # (filters, skip_conv, strides) of each residual block, in the order it is applied
+    block_specs = [
+        (16, True, 1), (16, True, 2), (16, False, 1), (32, True, 2), (32, False, 1),
+        (64, True, 2), (32, True, 1), (64, True, 2), (64, False, 1),
+    ]
+    for filters, skip_conv, strides in block_specs:
+        features = residual_block(features, filters, activation=activation, skip_conv=skip_conv, strides=strides, dropout=dropout)
+
+    # Merge the two axes before the last one into a single sequence axis for the recurrent layer
+    sequence = layers.Reshape((features.shape[-3] * features.shape[-2], features.shape[-1]))(features)
+    sequence = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(sequence)
+    sequence = layers.Dropout(dropout)(sequence)
+
+    return Model(inputs=inputs, outputs=layers.Dense(output_dim + 1, activation='softmax', name="output")(sequence))
diff --git a/Tutorials/02_captcha_to_text/train.py b/Tutorials/02_captcha_to_text/train.py
@@ -0,0 +1,95 @@
+import tensorflow as tf
+try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices('GPU')]
+except Exception: pass  # best effort: training continues without memory growth; unlike a bare `except:`, Ctrl+C and SystemExit still propagate
+
+from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
+
+from mltu.dataProvider import DataProvider
+from mltu.preprocessors import ImageReader
+from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding
+from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate
+from mltu.losses import CTCloss
+from mltu.callbacks import Model2onnx, TrainLogger
+from mltu.metrics import CWERMetric
+
+from model import train_model
+from configs import ModelConfigs
+
+import stow
+from urllib.request import urlopen
+from io import BytesIO
+from zipfile import ZipFile
+
+def download_and_unzip(url, extract_to='Datasets'):
+    """Download the zip archive at ``url`` into memory and extract all of its members into ``extract_to``."""
+    with urlopen(url) as http_response, ZipFile(BytesIO(http_response.read())) as zipfile:
+        zipfile.extractall(path=extract_to)  # the response and the archive are both closed on exit, even on error
+
+if not stow.exists(stow.join('Datasets', 'captcha_images_v2')):
+    download_and_unzip('https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip', extract_to='Datasets')
+# Build [image path, label] pairs; the label is each file's name, which also feeds the vocabulary
+dataset, vocab, max_len = [], set(), 0
+for file in stow.ls(stow.join('Datasets', 'captcha_images_v2')):
+    dataset.append([stow.relpath(file), file.name])
+    vocab.update(list(file.name))
+    max_len = max(max_len, len(file.name))
+
+configs = ModelConfigs()
+# Save vocab and maximum text length to configs. The characters are sorted because the iteration
+# order of a set of strings changes between interpreter runs, which would reorder the vocabulary each run
+configs.vocab = "".join(sorted(vocab))
+configs.max_text_length = max_len
+configs.save()
+
+data_provider = DataProvider(
+    dataset=dataset,
+    skip_validation=True,
+    batch_size=configs.batch_size,
+    data_preprocessors=[ImageReader()],
+    transformers=[
+        ImageResizer(configs.width, configs.height),
+        LabelIndexer(configs.vocab),
+        LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab))  # pad value is one past the last vocab index
+        ],
+)
+# Split the provider into a training part and a validation part
+train_data_provider, val_data_provider = data_provider.split()
+# Augmentation is attached to the training provider only
+train_data_provider.augmentors = [RandomBrightness(), RandomRotate(), RandomErodeDilate()]
+# Build the network for 3-channel images of the configured size; output_dim is the vocabulary size
+model = train_model(
+    input_dim = (configs.height, configs.width, 3),
+    output_dim = len(configs.vocab),
+)
+
+# Compile the model with Adam, the CTC loss and the CWER metric, then print its summary
+model.compile(
+    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 
+    loss=CTCloss(), 
+    metrics=[CWERMetric()],
+    run_eagerly=False
+)
+model.summary(line_length=110)
+# Create the directory that the checkpoint, logs and CSV files below are written into
+stow.mkdir(configs.model_path)
+
+# Define callbacks; early stopping, checkpointing and LR reduction all monitor 'val_CER'
+earlystopper = EarlyStopping(monitor='val_CER', patience=40, verbose=1)
+checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor='val_CER', verbose=1, save_best_only=True, mode='min')
+trainLogger = TrainLogger(configs.model_path)
+tb_callback = TensorBoard(f'{configs.model_path}/logs', update_freq=1)
+reduceLROnPlat = ReduceLROnPlateau(monitor='val_CER', factor=0.9, min_delta=1e-10, patience=20, verbose=1, mode='auto')
+model2onnx = Model2onnx(f"{configs.model_path}/model.h5")
+
+# Train the model, validating on the held-out provider
+model.fit(
+    train_data_provider,
+    validation_data=val_data_provider,
+    epochs=configs.train_epochs,
+    callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx],
+    workers=configs.train_workers
+)
+
+# Save training and validation datasets as csv files (this runs only after fit() has returned)
+train_data_provider.to_csv(stow.join(configs.model_path, 'train.csv'))
+val_data_provider.to_csv(stow.join(configs.model_path, 'val.csv'))
diff --git a/mltu/__init__.py b/mltu/__init__.py
@@ -1 +1 @@
-__version__ = "0.1.3"
+__version__ = "0.1.4"
diff --git a/mltu/augmentors.py b/mltu/augmentors.py
@@ -0,0 +1,138 @@
+import cv2
+import typing
+import numpy as np
+
+""" Implemented augmentors:
+- RandomBrightness
+- RandomRotate
+- RandomErodeDilate
+"""
+
+class Augmentor:
+    """ Object that should be inherited by all augmentors
+    Args (taken when the instance is called):
+        image (np.ndarray): Image to augment
+        annotation (np.ndarray): Annotation to augment
+
+    Returns:
+        typing.Tuple[np.ndarray, np.ndarray]: Image and annotation; this base class returns both unchanged
+    """
+    def __init__(self, random_chance: float=0.5) -> None:
+        """
+        Args:
+            random_chance (float, optional): Chance of applying the augmentor. Defaults to 0.5.
+        """
+        self._random_chance = random_chance
+
+    def __call__(self, image: np.ndarray, annotation: np.ndarray) -> typing.Tuple[np.ndarray, np.ndarray]:
+        if np.random.random() <= self._random_chance:
+            pass  # the base class applies no transformation; subclasses override __call__
+
+        return image, annotation
+
+class RandomBrightness(Augmentor):
+    """ Randomly scale the saturation and value channels of an image in HSV space
+
+    Calling the instance takes:
+        image (np.ndarray): Image, converted with COLOR_BGR2HSV and back with COLOR_HSV2BGR
+        annotation (np.ndarray): Annotation, returned untouched
+
+    and returns:
+        image (np.ndarray): Adjusted image, or the input image when the augmentation is skipped
+        annotation (np.ndarray): The annotation that was passed in
+    """
+    def __init__(self, random_chance:float=0.5, delta:int=100)->None:
+        """
+        Args:
+            random_chance (float): Probability, between 0.0 and 1.0, of applying the augmentation
+            delta (int): Maximum adjustment, 0..255; the scale factor is drawn from 1 +/- delta / 255
+        """
+        assert delta >= 0.0
+        assert delta <= 255.0
+
+        super().__init__(random_chance)
+        self._delta = delta
+
+    def __call__(self, image:np.ndarray, annotation:np.ndarray)->typing.Tuple[np.ndarray, np.ndarray]:
+        # One random draw decides whether this sample is augmented at all
+        if np.random.rand() > self._random_chance:
+            return image, annotation
+
+        hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
+        # The same factor is applied to both scaled channels
+        factor = 1 + np.random.uniform(-self._delta, self._delta) / 255
+
+        # Work in float32 so scaled values can pass 255 before they are clipped
+        hsv = np.array(hsv_image, dtype = np.float32)
+        hsv[:, :, 1:3] *= factor
+
+        # Clip back into the 0..255 range and convert to uint8 before returning to BGR
+        clipped = np.uint8(np.clip(hsv, 0, 255))
+
+        return cv2.cvtColor(clipped, cv2.COLOR_HSV2BGR), annotation
+
+class RandomRotate(Augmentor):
+    """ Randomly rotate an image around its center
+
+    Calling the instance takes:
+        image (np.ndarray): 3-dimensional image array, unpacked as (height, width, channels)
+        annotation (np.ndarray or str): Warped with the same matrix unless it is a string
+
+    and returns:
+        image (np.ndarray): Rotated image, warped onto a canvas of the input width and height
+        annotation (np.ndarray or str): Rotated annotation, or the unchanged string
+    """
+    def __init__(self, random_chance:float=0.5, angle:int=10, borderValue:typing.Tuple[int, int, int]=(255, 255, 255))->None:
+        """
+        Args:
+            random_chance (float): Probability, between 0.0 and 1.0, of applying the rotation
+            angle (int): Largest rotation in degrees; the applied angle is uniform in [-angle, angle]
+            borderValue (tuple): Tuple of 3 integers passed to cv2.warpAffine as the border color
+        """
+        super().__init__(random_chance)
+        self._angle, self._borderValue = angle, borderValue
+
+    def __call__(self, image:np.ndarray, annotation:np.ndarray)->typing.Tuple[np.ndarray, np.ndarray]:
+        if np.random.rand() > self._random_chance:
+            return image, annotation
+
+        degrees = np.random.uniform(-self._angle, self._angle)
+        # Rotate about the image center with scale 1, keeping the canvas size
+        height, width, _channels = image.shape
+        matrix = cv2.getRotationMatrix2D((width / 2, height / 2), degrees, 1)
+        rotated = cv2.warpAffine(image, matrix, (width, height), borderValue=self._borderValue)
+
+        # A string annotation is returned as is; anything else is warped like an image mask
+        if isinstance(annotation, str):
+            return rotated, annotation
+        return rotated, cv2.warpAffine(annotation, matrix, (width, height), borderValue=self._borderValue)
+
+class RandomErodeDilate(Augmentor):
+    """ Randomly erode or dilate image
+
+    Args:
+        image (np.ndarray): Image to be eroded or dilated
+        annotation: Annotation, returned unchanged
+    Returns:
+        image (np.ndarray): Eroded or dilated image, followed by the annotation that was passed in
+    """
+    def __init__(self, random_chance:float=0.5, kernel_size:typing.Tuple[int, int]=(1, 1))->None:
+        """
+        Args:
+            random_chance (float): Float between 0.0 and 1.0 setting bounds for random probability
+            kernel_size (tuple): Tuple of 2 integers, kernel size for both operations; the default (1, 1) leaves the image unchanged
+        """
+        super().__init__(random_chance)
+        self._kernel_size = kernel_size
+
+    def __call__(self, image:np.ndarray, annotation)->typing.Tuple[np.ndarray, np.ndarray]:
+        if np.random.rand() <= self._random_chance:
+            # All-ones structuring element of the configured size
+            kernel = np.ones(self._kernel_size, np.uint8)
+            # A second draw picks erosion or dilation with equal probability
+            if np.random.rand() <= 0.5:
+                image = cv2.erode(image, kernel, iterations=1)
+            else:
+                image = cv2.dilate(image, kernel, iterations=1)
+
+        return image, annotation
diff --git a/mltu/dataProvider.py b/mltu/dataProvider.py
diff --git a/setup.py b/setup.py

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.1.3"`
	`1`	`+__version__ = "0.1.4"`