Skip to content

Commit 978bf19

Browse files
committed
Merge branch 'feature/sound_to_text'
2 parents cd89cbb + d13fbfd commit 978bf19

File tree

23 files changed

+811
-35
lines changed

23 files changed

+811
-35
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,7 @@ Each tutorial has its own requirements.txt file for a specific mltu version. As
1414

1515
# Tutorials and Examples:
1616
1. [Text Recognition With TensorFlow and CTC network](https://pylessons.com/ctc-text-recognition), code in ```Tutorials\01_image_to_word``` folder;
17-
2. [TensorFlow OCR model for reading Captchas](https://pylessons.com/tensorflow-ocr-captcha), code in ```Tutorials\02_captcha_to_text``` folder;
17+
2. [TensorFlow OCR model for reading Captchas](https://pylessons.com/tensorflow-ocr-captcha), code in ```Tutorials\02_captcha_to_text``` folder;
18+
3. [Handwriting words recognition with TensorFlow](https://pylessons.com/handwriting-recognition), code in ```Tutorials\03_handwriting_recognition``` folder;
19+
4. [Handwritten sentence recognition with TensorFlow](https://pylessons.com/handwritten-sentence-recognition), code in ```Tutorials\04_sentence_recognition``` folder;
20+
5. [Introduction to speech recognition with TensorFlow](https://pylessons.com/speech-recognition), code in ```Tutorials\05_speech_recognition``` folder;

Tutorials/03_handwriting_recognition/README.md

Lines changed: 325 additions & 0 deletions
Large diffs are not rendered by default.

Tutorials/03_handwriting_recognition/configs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def __init__(self):
1111
self.height = 32
1212
self.width = 128
1313
self.max_text_length = 0
14-
self.batch_size = 64
15-
self.learning_rate = 0.001
14+
self.batch_size = 16
15+
self.learning_rate = 0.0005
1616
self.train_epochs = 1000
1717
self.train_workers = 20

Tutorials/03_handwriting_recognition/inferenceModel.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ def predict(self, image: np.ndarray):
2626
from tqdm import tqdm
2727
from mltu.configs import BaseModelConfigs
2828

29-
configs = BaseModelConfigs.load("Models/03_handwriting_recognition/202212290905/configs.yaml")
29+
configs = BaseModelConfigs.load("Models/03_handwriting_recognition/202301111911/configs.yaml")
3030

3131
model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab)
3232

33-
df = pd.read_csv("Models/03_handwriting_recognition/202212290905/val.csv").values.tolist()
33+
df = pd.read_csv("Models/03_handwriting_recognition/202301111911/val.csv").values.tolist()
3434

3535
accum_cer = []
3636
for image_path, label in tqdm(df):
@@ -43,4 +43,10 @@ def predict(self, image: np.ndarray):
4343

4444
accum_cer.append(cer)
4545

46+
# resize by 4x
47+
image = cv2.resize(image, (image.shape[1] * 4, image.shape[0] * 4))
48+
cv2.imshow("Image", image)
49+
cv2.waitKey(0)
50+
cv2.destroyAllWindows()
51+
4652
print(f"Average CER: {np.average(accum_cer)}")

Tutorials/03_handwriting_recognition/train.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ def download_and_unzip(url, extract_to='Datasets', chunk_size=1024*1024):
108108
optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate),
109109
loss=CTCloss(),
110110
metrics=[CWERMetric(padding_token=len(configs.vocab))],
111-
run_eagerly=False
112111
)
113112
model.summary(line_length=110)
114113

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Handwritten sentence recognition with TensorFlow
2+
## Unlock the power of handwritten sentence recognition with TensorFlow and CTC loss: from digitizing notes to transcribing historical documents and automating exam grading.
3+
4+
5+
## **Detailed tutorial**:
6+
## [Handwritten sentence recognition with TensorFlow](https://pylessons.com/handwritten-sentence-recognition)
7+
8+
<p align="center">
9+
<img src="https://pylessons.com/media/Tutorials/TensorFlow-CAPTCHA-solver/handwritten-sentence-recognition/handwritten-sentence-recognition_pbLia4E.png">
10+
</p>

Tutorials/04_sentence_recognition/configs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ def __init__(self):
1212
self.width = 1408
1313
self.max_text_length = 0
1414
self.batch_size = 32
15-
self.learning_rate = 0.001
15+
self.learning_rate = 0.0005
1616
self.train_epochs = 1000
1717
self.train_workers = 20

Tutorials/04_sentence_recognition/inferenceModel.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@
44

55
from mltu.inferenceModel import OnnxInferenceModel
66
from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer
7+
from mltu.transformers import ImageResizer
78

89
class ImageToWordModel(OnnxInferenceModel):
910
def __init__(self, char_list: typing.Union[str, list], *args, **kwargs):
1011
super().__init__(*args, **kwargs)
1112
self.char_list = char_list
1213

1314
def predict(self, image: np.ndarray):
14-
image = cv2.resize(image, self.input_shape[:2][::-1])
15+
image = ImageResizer.resize_maintaining_aspect_ratio(image, *self.input_shape[:2][::-1])
1516

1617
image_pred = np.expand_dims(image, axis=0).astype(np.float32)
1718

@@ -26,11 +27,11 @@ def predict(self, image: np.ndarray):
2627
from tqdm import tqdm
2728
from mltu.configs import BaseModelConfigs
2829

29-
configs = BaseModelConfigs.load("Models/04_sentence_recognition/202301060816/configs.yaml")
30+
configs = BaseModelConfigs.load("Models/04_sentence_recognition/202301131202/configs.yaml")
3031

3132
model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab)
3233

33-
df = pd.read_csv("Models/04_sentence_recognition/202301060816/val.csv").values.tolist()
34+
df = pd.read_csv("Models/04_sentence_recognition/202301131202/val.csv").values.tolist()
3435

3536
accum_cer, accum_wer = [], []
3637
for image_path, label in tqdm(df):
@@ -40,9 +41,16 @@ def predict(self, image: np.ndarray):
4041

4142
cer = get_cer(prediction_text, label)
4243
wer = get_wer(prediction_text, label)
43-
print(f"Image: {image_path}; Label: ({label}); Prediction: ({prediction_text}); CER: {cer}; WER: {wer}")
44+
print("Image: ", image_path)
45+
print("Label:", label)
46+
print("Prediction: ", prediction_text)
47+
print(f"CER: {cer}; WER: {wer}")
4448

4549
accum_cer.append(cer)
4650
accum_wer.append(wer)
4751

52+
cv2.imshow(prediction_text, image)
53+
cv2.waitKey(0)
54+
cv2.destroyAllWindows()
55+
4856
print(f"Average CER: {np.average(accum_cer)}, Average WER: {np.average(accum_wer)}")

Tutorials/04_sentence_recognition/train.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import stow
1919
from tqdm import tqdm
2020

21-
# Must download and extract datasets manually from https://fki.tic.heia-fr.ch/databases/download-the-iam-handwriting-database
21+
# The dataset must be downloaded manually from https://fki.tic.heia-fr.ch/databases/download-the-iam-handwriting-database and extracted to Datasets\IAM_Sentences
2222
sentences_txt_path = stow.join('Datasets', 'IAM_Sentences', 'ascii', 'sentences.txt')
2323
sentences_folder_path = stow.join('Datasets', 'IAM_Sentences', 'sentences')
2424

@@ -63,7 +63,7 @@
6363
batch_size=configs.batch_size,
6464
data_preprocessors=[ImageReader()],
6565
transformers=[
66-
ImageResizer(configs.width, configs.height, keep_aspect_ratio=False),
66+
ImageResizer(configs.width, configs.height, keep_aspect_ratio=True),
6767
LabelIndexer(configs.vocab),
6868
LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)),
6969
],
@@ -102,7 +102,7 @@
102102
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor='val_CER', verbose=1, save_best_only=True, mode='min')
103103
trainLogger = TrainLogger(configs.model_path)
104104
tb_callback = TensorBoard(f'{configs.model_path}/logs', update_freq=1)
105-
reduceLROnPlat = ReduceLROnPlateau(monitor='val_CER', factor=0.9, min_delta=1e-10, patience=10, verbose=1, mode='auto')
105+
reduceLROnPlat = ReduceLROnPlateau(monitor='val_CER', factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode='auto')
106106
model2onnx = Model2onnx(f"{configs.model_path}/model.h5")
107107

108108
# Train the model

Tutorials/05_sound_to_text/README.md

Whitespace-only changes.

0 commit comments

Comments
 (0)