|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import json |
| 4 | +import glob |
| 5 | +import os |
| 6 | +from pathlib import Path |
| 7 | +import numpy as np |
| 8 | +import cv2 |
| 9 | +from skimage.filters.rank import entropy |
| 10 | +from skimage.morphology import disk |
| 11 | + |
| 12 | +TEST_FOLDER="test/" |
| 13 | +TEST_TXT=TEST_FOLDER+"test_img_id_gt.txt" |
| 14 | + |
def binarize(img):
    """Binarize a grayscale document image via local-entropy background removal.

    Pipeline: local entropy -> Otsu mask of low-entropy (background) pixels
    -> dilation to estimate the page background -> background subtraction
    -> negate / boost contrast -> adaptive threshold.

    Args:
        img: single-channel uint8 image (e.g. cv2.imread with IMREAD_GRAYSCALE).

    Returns:
        uint8 binary image of the same shape (dark text on white background).
    """
    MAX_ENTROPY = 8.0   # upper bound used to normalize entropy into [0, 1]
    MAX_PIX_VAL = 255

    # Local entropy is high around text strokes and low on flat background.
    entr = entropy(img, disk(5))
    # Normalize and negate so background -> bright, text -> dark.
    negative = 1 - (entr / MAX_ENTROPY)
    u8img = (negative * MAX_PIX_VAL).astype(np.uint8)

    # Otsu picks a global threshold separating text from background regions.
    _, mask = cv2.threshold(u8img, 0, MAX_PIX_VAL, cv2.THRESH_OTSU)
    # Keep only the background pixels of the original image.
    masked = cv2.bitwise_and(img, img, mask=mask)
    # Dilation fills the text "holes", giving a smooth background estimate.
    kernel = np.ones((35, 35), np.uint8)
    background = cv2.dilate(masked, kernel, iterations=1)

    # Subtracting the background from the original leaves a clean text image.
    text_only = cv2.absdiff(img, background)
    # Negate and increase contrast; the result is float and may exceed 255.
    neg_text_only = (MAX_PIX_VAL - text_only) * 1.15
    # FIX: clamp with np.clip instead of cv2.threshold(..., THRESH_TRUNC).
    # The product above is a float64 array and OpenCV's threshold() support
    # for CV_64F inputs is version-dependent; np.clip is equivalent here
    # (all values are >= 0) and works for any dtype.
    clamped_u8 = np.clip(neg_text_only, 0, MAX_PIX_VAL).astype(np.uint8)

    # Final local (Gaussian-weighted) thresholding produces the binary image.
    return cv2.adaptiveThreshold(
        clamped_u8, MAX_PIX_VAL, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 2,
    )
| 41 | + |
def _corners(obj):
    """Return the four polygon corners of a word annotation as [[x, y], ...]."""
    p = obj["polygon"]
    return [[p["x0"], p["y0"]],
            [p["x1"], p["y1"]],
            [p["x2"], p["y2"]],
            [p["x3"], p["y3"]]]

def _crop_line(img, objects):
    """Crop the text-line region covering *objects* from the binarized page.

    Everything outside the word polygons is forced to white, then the
    bounding rectangle of all polygons is cut out.

    Returns the cropped uint8 image, or None when the bounding rectangle is
    degenerate (zero width/height), which would otherwise crash the caller's
    resize-ratio computation.
    """
    # create mask for the words
    mask = np.zeros(img.shape[0:2], dtype=np.uint8)
    for obj in objects:
        points = np.array([_corners(obj)])
        cv2.drawContours(mask, [points], -1, (255, 255, 255), -1, cv2.LINE_AA)
    masked = cv2.bitwise_and(img, img, mask=mask)
    # white canvas, blacked out inside the mask so overlay = bg + masked
    # yields the word pixels inside the polygons and pure white elsewhere
    bg = np.ones_like(img, np.uint8) * 255
    cv2.bitwise_not(bg, bg, mask=mask)
    overlay = bg + masked

    # crop bounding rectangle of the whole text region
    pts = np.array([pt for obj in objects for pt in _corners(obj)])
    x, y, w, h = cv2.boundingRect(pts)
    if w == 0 or h == 0:
        # degenerate annotation polygon; nothing usable to crop
        return None
    return overlay[y:y + h, x:x + w].copy()

def main():
    """
    Preprocess GNHK dataset into text line images.

    For every ``eng*.jpg`` in the working directory, read the matching
    ``<id>.json`` annotation, binarize the page, crop each text line into
    TEST_FOLDER and append ``<filename>,<label>`` rows to TEST_TXT.
    """
    # Create the output directory directly; the original
    # os.path.dirname(TEST_FOLDER) indirection resolved to the same path.
    os.makedirs(TEST_FOLDER, exist_ok=True)
    images = glob.glob('eng*.jpg')

    # Open the index file once ('w' truncates any previous run's output)
    # instead of clearing it and re-opening in append mode for every line.
    with open(TEST_TXT, 'w', encoding='utf-8') as test_txt:
        for img_idx, image in enumerate(images):
            img_id = Path(image).stem
            print(img_idx, img_id)
            # open corresponding JSON annotation file
            with open(img_id + ".json") as f:
                data = json.load(f)
            line_indices = {obj["line_idx"] for obj in data}
            img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
            img = binarize(img)
            for idx in sorted(line_indices):
                objects = [obj for obj in data if obj["line_idx"] == idx]
                # discard math symbols, scribbles, illegible text, and printed text
                objects = [obj for obj in objects
                           if obj["text"] not in ("%math%", "%SC%", "%NA%")
                           and obj["type"] != "P"]
                if not objects:
                    continue
                # left-to-right reading order within the line
                objects = sorted(objects, key=lambda o: o['polygon']['x0'])
                label = " ".join(obj["text"] for obj in objects)
                print(img_id, idx, label)

                cropped = _crop_line(img, objects)
                if cropped is None:
                    continue

                # discard image if width > 2000 after resizing to height=96
                # while keeping aspect ratio
                height, width = cropped.shape
                new_width = int(width * (96.0 / height))
                if new_width > 2000:
                    continue

                line_name = img_id + '_line' + str(idx) + '.jpg'
                cv2.imwrite(TEST_FOLDER + line_name, cropped)
                test_txt.write(line_name + ',' + label + '\n')
| 107 | + |
# Run the preprocessing only when executed as a script (not on import).
if __name__ == '__main__':
    main()
| 110 | + |
0 commit comments