diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
new file mode 100644
index 00000000..a9f766d5
--- /dev/null
+++ b/.github/workflows/docker.yml
@@ -0,0 +1,51 @@
+name: Docker Image CI
+
+on:
+  workflow_dispatch:
+  push:
+    branches: [ "master", "fix-alpha-shape" ]
+
+env:
+  REPO_NAME: ${{ github.repository }}
+
+jobs:
+
+  build:
+
+    runs-on: ubuntu-latest
+    permissions:
+      packages: write
+      contents: read
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # we need tags for docker version tagging
+          fetch-tags: true
+          fetch-depth: 0
+      - # Activate cache export feature to reduce build time of images
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERIO_USERNAME }}
+          password: ${{ secrets.DOCKERIO_PASSWORD }}
+      - name: Define image name from repo name
+        run: echo "IMAGE_NAME=ghcr.io/${REPO_NAME,,}" >> $GITHUB_ENV
+      - name: Build the Docker image
+        # build both tags at the same time
+        run: make docker-build DOCKER_TAG="docker.io/ocrd/cis -t ${{ env.IMAGE_NAME }}"
+      - name: Test the Docker image
+        run: docker run --rm ${{ env.IMAGE_NAME }} ocrd-cis-ocropy-segment -h
+      - name: Push to Docker Hub
+        run: docker push docker.io/ocrd/cis
+      - name: Push to GitHub Container Registry
+        run: docker push ${{ env.IMAGE_NAME }}
+
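Note on the workflow above: the ${REPO_NAME,,} in the "Define image name" step is bash parameter expansion for lowercasing, needed because container image references must be all-lowercase while ${{ github.repository }} preserves the repository's case. A minimal sketch of that expansion outside the workflow (the repository value is hypothetical):

    REPO_NAME="CISOCRGroup/ocrd_cis"          # stand-in for ${{ github.repository }}
    echo "IMAGE_NAME=ghcr.io/${REPO_NAME,,}"  # prints IMAGE_NAME=ghcr.io/cisocrgroup/ocrd_cis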
diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
new file mode 100644
index 00000000..54c46713
--- /dev/null
+++ b/.github/workflows/pypi.yml
@@ -0,0 +1,32 @@
+# This workflow builds the Python distribution and publishes it to PyPI.
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: PyPI CD
+
+on:
+  release:
+    types: [published]
+  workflow_dispatch:
+
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.8'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install setuptools wheel build twine
+          pip install -r requirements.txt
+      - name: Build and publish
+        env:
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
+        run: |
+          python -m build .
+          ls -l dist
+          twine upload --verbose dist/ocrd*{tar.gz,whl}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 00000000..c50810f0
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,33 @@
+name: Test ocrd_cis installation and run tests
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
+        os: [ "ubuntu-22.04" ]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: actions/setup-java@v4
+        with:
+          distribution: 'zulu'
+          java-version: '11'
+      - name: Install ocrd_cis
+        run: make install
+      - name: Test ocrd_cis
+        run: make test V=
diff --git a/.gitignore b/.gitignore
index fb28879b..aca5a739 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,6 @@ env-dir/*
 /venv*
 /build
 /dist
+TAGS
+*.log
+download/
diff --git a/Dockerfile b/Dockerfile
index f58112b8..0fa98fb4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,24 @@
-FROM ocrd/core:latest AS base
-ENV VERSION="Di 12. Mai 13:26:35 CEST 2020"
+ARG DOCKER_BASE_IMAGE
+FROM $DOCKER_BASE_IMAGE AS base
+ARG VCS_REF
+ARG BUILD_DATE
+LABEL \
+    maintainer="https://github.com/cisocrgroup/ocrd_cis/issues" \
+    org.label-schema.vcs-ref=$VCS_REF \
+    org.label-schema.vcs-url="https://github.com/cisocrgroup/ocrd_cis" \
+    org.label-schema.build-date=$BUILD_DATE \
+    org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
+    org.opencontainers.image.title="ocrd_cis" \
+    org.opencontainers.image.description="Ocropy OCR and CIS post-correction bindings" \
+    org.opencontainers.image.source="https://github.com/cisocrgroup/ocrd_cis" \
+    org.opencontainers.image.documentation="https://github.com/cisocrgroup/ocrd_cis/blob/${VCS_REF}/README.md" \
+    org.opencontainers.image.revision=$VCS_REF \
+    org.opencontainers.image.created=$BUILD_DATE \
+    org.opencontainers.image.base.name=ocrd/core
+
 ENV GITURL="https://github.com/cisocrgroup"
-ENV DOWNLOAD_URL="http://cis.lmu.de/~finkf"
+
+SHELL ["/bin/bash", "-c"]
 
 # deps
 RUN apt-get update \
@@ -16,14 +33,14 @@ RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \
 
 FROM base AS profiler
 RUN apt-get update \
     && apt-get -y install --no-install-recommends cmake g++ libcppunit-dev libxerces-c-dev \
-    && git clone ${GITURL}/Profiler --branch devel --single-branch /build \
-    && cd /build \
+    && git clone ${GITURL}/Profiler --branch devel --single-branch /build/Profiler \
+    && pushd /build/Profiler \
     && cmake -DCMAKE_BUILD_TYPE=release . \
     && make compileFBDic trainFrequencyList runDictSearch profiler \
     && mkdir /apps \
     && cp bin/compileFBDic bin/trainFrequencyList bin/profiler bin/runDictSearch /apps/ \
-    && cd / \
-    && rm -rf /build
+    && popd \
+    && rm -rf /build/Profiler
 
 FROM profiler AS languagemodel
 # install the profiler's language backend
@@ -32,27 +49,36 @@ COPY --from=profiler /apps/trainFrequencyList /apps/
 COPY --from=profiler /apps/runDictSearch /apps/
 RUN apt-get update \
     && apt-get -y install --no-install-recommends icu-devtools \
-    && git clone ${GITURL}/Resources --branch master --single-branch /build \
-    && cd /build/lexica \
+    && git clone ${GITURL}/Resources --branch master --single-branch /build/Resources \
+    && pushd /build/Resources/lexica \
    && PATH=$PATH:/apps make \
     && PATH=$PATH:/apps make test \
     && PATH=$PATH:/apps make install \
-    && cd / \
-    && rm -rf /build
+    && popd \
+    && rm -rf /build/Resources
 
 FROM base AS postcorrection
 # install ocrd_cis (python)
-VOLUME ["/data"]
+WORKDIR /build/ocrd_cis
 COPY --from=languagemodel /etc/profiler/languages /etc/profiler/languages
 COPY --from=profiler /apps/profiler /apps/
 COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicuuc.so /usr/lib//x86_64-linux-gnu/
 COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicudata.so /usr/lib//x86_64-linux-gnu/
 COPY --from=profiler /usr/lib//x86_64-linux-gnu/libxerces-c-3.2.so /usr/lib//x86_64-linux-gnu/
-COPY . /build
+COPY . .
+COPY ocrd-tool.json .
+# prepackage ocrd-tool.json as ocrd-all-tool.json
+RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
+# prepackage ocrd-all-module-dir.json
+RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json
+# install everything and reduce image size
 RUN apt-get update \
     && apt-get -y install --no-install-recommends gcc wget default-jre-headless \
-    && cd /build \
     && make install \
-    && make test \
-    && cd / \
-    && rm -rf /build
+    # tests always fail here: the resources are not available for download; they should be
+    # made available somewhere else, e.g. github.com/OCR-D/assets
+    # && make test \
+    && rm -rf /build/ocrd_cis
+
+WORKDIR /data
+VOLUME /data
diff --git a/Makefile b/Makefile
index 730ba3f4..ac2edacc 100644
--- a/Makefile
+++ b/Makefile
@@ -2,23 +2,71 @@ PY ?= python3
 PIP ?= pip3
 V ?= > /dev/null 2>&1
 PKG = ocrd_cis
+DOCKER_TAG ?= ocrd/cis
+DOCKER_BASE_IMAGE ?= docker.io/ocrd/core:latest
+DOCKER ?= docker
+SHELL = bash
+
+help:
+	@echo ""
+	@echo "  Targets"
+	@echo ""
+	@echo "  install      Install ocrd_cis"
+	@echo "  install-dev  Install in editable mode"
+	@echo "  build        Build source and binary distribution"
+	@echo "  docker       Build Docker image"
+	@echo "  test         Run unit tests"
+	@echo ""
+	@echo "  Variables"
+	@echo ""
+	@echo "  DOCKER_TAG  '$(DOCKER_TAG)'"
+	@echo "  PY          '$(PY)'"
+	@echo "  PIP         '$(PIP)'"
 
 install:
-	${PIP} install --upgrade pip .
-install-devel:
-	${PIP} install --upgrade pip -e .
+	${PIP} install .
+
+install-devel install-dev:
+	${PIP} install -e .
+
+build:
+	${PIP} install build
+	${PY} -m build .
+
 uninstall:
 	${PIP} uninstall ${PKG}
 
-docker-build: Dockerfile
-	docker build -t flobar/ocrd_cis:latest .
+docker-build docker: Dockerfile
+	$(DOCKER) build \
+	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
+	--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
+	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
+	-t $(DOCKER_TAG):latest .
+
 docker-push: docker-build
-	docker push flobar/ocrd_cis:latest
+	$(DOCKER) push $(DOCKER_TAG):latest
+
+TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash)))
+INDENT != MAX=; for NAME in $(TEST_SCRIPTS:tests/%=%); do if test $${\#MAX} -lt $${\#NAME}; then MAX=$${NAME//?/_}; fi; done; echo $$MAX
+indent = `WHAT=$1; WITH=$(INDENT); echo $$WHAT$${WITH:$${\#WHAT}}`
+format_tr = "$(call indent,$1):\t%U\t%S\t%E\t%P\t(%Mk)"
+format_th = "$(call indent)\tuser\tsystem\telapsed\tCPU\tmaxRSS"
 
-TEST_SCRIPTS=$(sort $(wildcard tests/run_*.bash))
 .PHONY: $(TEST_SCRIPTS)
 $(TEST_SCRIPTS):
-	bash $@ $V
+	OCRD_MAX_PARALLEL_PAGES=1 /usr/bin/time -o test_serially.log -a -f $(call format_tr,$(@F)) bash -x $@ $V
+	OCRD_MAX_PARALLEL_PAGES=4 /usr/bin/time -o test_parallel.log -a -f $(call format_tr,$(@F)) bash -x $@ $V
+
+test: export OCRD_OVERRIDE_LOGLEVEL=DEBUG
+test: export OCRD_MISSING_OUTPUT=ABORT
+test: export OCRD_MAX_MISSING_OUTPUTS=-1
 test: $(TEST_SCRIPTS)
-	echo $^
-.PHONY: install test
+	@echo =====single-processing test results=====
+	@echo -e $(call format_th)
+	@cat test_serially.log
+	@echo =====4-page-parallel test results=====
+	@echo -e $(call format_th)
+	@cat test_parallel.log
+	@$(RM) test_serially.log test_parallel.log
+
+.PHONY: install install-dev install-devel build uninstall test docker docker-build docker-push
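A hedged usage sketch for the new Makefile interface above (tag and base image values are illustrative, not prescribed):

    make docker DOCKER_TAG=ghcr.io/cisocrgroup/ocrd_cis DOCKER_BASE_IMAGE=docker.io/ocrd/core:latest
    make test   # runs each tests/run_*.bash twice, with OCRD_MAX_PARALLEL_PAGES=1 and =4, then prints both timing tables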
diff --git a/ocrd_cis/__init__.py b/ocrd_cis/__init__.py
index 6f37f4f7..9d22fe3e 100644
--- a/ocrd_cis/__init__.py
+++ b/ocrd_cis/__init__.py
@@ -1,3 +1,2 @@
 from .javaprocess import JavaAligner
 from .javaprocess import JavaPostCorrector
-from .ocrd_tool import get_ocrd_tool
diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py
index ffe53fd8..395f7b07 100644
--- a/ocrd_cis/align/cli.py
+++ b/ocrd_cis/align/cli.py
@@ -1,150 +1,123 @@
 from __future__ import absolute_import
+from __future__ import annotations
+
 import click
 import json
 import os
-import Levenshtein
-from ocrd import Processor
+
+from typing import Optional, List, Dict, Type
+
+from rapidfuzz.distance import Levenshtein
+
+from ocrd import Processor, OcrdPage, OcrdPageResult
 from ocrd.decorators import ocrd_cli_options
 from ocrd.decorators import ocrd_cli_wrap_processor
-from ocrd_utils import MIMETYPE_PAGE
-from ocrd_utils import getLogger
 from ocrd_utils import getLevelName
-from ocrd_utils import make_file_id
-from ocrd_modelfactory import page_from_file
-from ocrd_models.ocrd_page import to_xml
-from ocrd_models.ocrd_page_generateds import TextEquivType
+from ocrd_models.ocrd_page import TextRegionType, TextEquivType
 
 from ocrd_cis import JavaAligner
-from ocrd_cis import get_ocrd_tool
+
 
 @click.command()
 @ocrd_cli_options
 def ocrd_cis_align(*args, **kwargs):
-    return ocrd_cli_wrap_processor(Aligner, *args, **kwargs)
-
-class Aligner(Processor):
-    def __init__(self, *args, **kwargs):
-        ocrd_tool = get_ocrd_tool()
-        kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-align']
-        kwargs['version'] = ocrd_tool['version']
-        super(Aligner, self).__init__(*args, **kwargs)
+    return ocrd_cli_wrap_processor(CISAligner, *args, **kwargs)
 
-        if hasattr(self, 'workspace'):
-            self.log = getLogger('cis.Processor.Aligner')
+class CISAligner(Processor):
+    @property
+    def executable(self):
+        return 'ocrd-cis-align'
 
-    def process(self):
-        ifgs = self.input_file_grp.split(",")  # input file groups
-        if len(ifgs) < 2:
-            raise Exception("need at least two input file groups to align")
-        ifts = self.zip_input_files(ifgs)  # input file tuples
-        for _id, ift in enumerate(ifts):
-            alignments = json.loads(self.run_java_aligner(ift))
-            pcgts = self.align(alignments, ift)
-            # keep the right part after OCR-D-...-filename
-            # and prepend output_file_grp
-            input_file = ift[0].input_file
-            file_id = make_file_id(input_file, self.output_file_grp)
-            pcgts.set_pcGtsId(file_id)
-            out = self.workspace.add_file(
-                ID=file_id,
-                file_grp=self.output_file_grp,
-                pageId=input_file.pageId,
-                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
-                mimetype=MIMETYPE_PAGE,
-                content=to_xml(pcgts),
-            )
-            self.log.info('created file %s', out)
+    def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
+        assert len(input_pcgts) >= 2
+        alignments = json.loads(self.run_java_aligner(input_pcgts))
+        pcgts = self.align(alignments, input_pcgts)
+        return OcrdPageResult(pcgts)
 
-    def align(self, alignments, ift):
+    def align(self, alignments: List[Dict], pcgts: List[OcrdPage]) -> OcrdPage:
         """align the alignment objects with the according input file tuples"""
-        for t in ift:
-            self.log.debug("tuple %s", os.path.basename(t.input_file.url))
-        pcgtst = self.open_input_file_tuples(ift)
         i = 0
-        for mi, mr in enumerate(pcgtst[0].get_Page().get_TextRegion()):
+        file_groups = self.input_file_grp.split(',')
+        for mi, mr in enumerate(pcgts[0].get_Page().get_AllRegions(classes=['Text'])):
             for mj, _ in enumerate(mr.get_TextLine()):
-                for iiii, u in enumerate(mr.get_TextLine()[mj].get_TextEquiv()):
-                    self.log.debug("[%d] %s", iiii, u.Unicode)
-                for xx in mr.get_TextLine()[mj].get_Word():
-                    for iiii, u in enumerate(xx.get_TextEquiv()):
-                        self.log.debug("[%d] %s", iiii, u.Unicode)
                 lines = []
-                for ii, t in enumerate(ift):
+                for ii, page in enumerate(pcgts):
                     if i >= len(alignments):
                         break
-                    tr = pcgtst[ii].get_Page().get_TextRegion()
+                    tr = page.get_Page().get_AllRegions(classes=['Text'])
                     region = tr[mi].get_TextLine()[mj]
-                    lines.append(Alignment(t, region, alignments[i]))
+                    lines.append(Alignment(file_groups[ii], page, region, alignments[i]))
                 self.align_lines(lines)
                 i += 1
-        return pcgtst[0]
+        return pcgts[0]
 
-    def align_lines(self, lines):
+    def align_lines(self, lines: List[Alignment]) -> None:
         """align the given line alignment with the lines"""
         if not lines:
             return
-        if len(lines[0].region.get_TextEquiv()) > 1:
-            del lines[0].region.get_TextEquiv()[1:]
+        if len(lines[0].region.TextEquiv) > 1:
+            del lines[0].region.TextEquiv[1:]
         for i, line in enumerate(lines):
             if lines[0].region.get_TextEquiv() is None:
                 lines[0].region.TextEquiv = []
-            self.log.debug('line alignment: %s [%s - %s]',
-                           get_textequiv_unicode(line.region),
-                           line.region.get_id(),
-                           line.input_file.input_file_group)
-            ddt = line.input_file.input_file_group + "/" + line.region.get_id()
-            if i != 0:
+            self.logger.debug(
+                'line alignment: %s [%s - %s]',
+                get_textequiv_unicode(line.region),
+                line.region.get_id(),
+                line.file_grp
+            )
+            ddt = line.file_grp + "/" + line.region.get_id()
+            if i > 0:
                 te = TextEquivType(
                     Unicode=get_textequiv_unicode(line.region),
                     conf=get_textequiv_conf(line.region),
                     dataType="other",
-                    dataTypeDetails="ocrd-cis-line-alignment:" + ddt)
+                    dataTypeDetails=f"ocrd-cis-line-alignment:{ddt}")
                 lines[0].region.add_TextEquiv(te)
             else:
-                self.log.debug("len: %i, i: %i", len(lines[0].region.get_TextEquiv()), i)
-                lines[0].region.get_TextEquiv()[i].set_dataType("other")
-                lines[0].region.get_TextEquiv()[i].set_dataTypeDetails(
+                self.logger.debug("len: %i, i: %i", len(lines[0].region.TextEquiv), i)
+                lines[0].region.TextEquiv[i].set_dataType("other")
+                lines[0].region.TextEquiv[i].set_dataTypeDetails(
                     "ocrd-cis-line-alignment-master-ocr:" + ddt)
-            lines[0].region.get_TextEquiv()[i].set_index(i+1)
+            lines[0].region.TextEquiv[i].set_index(i+1)
         self.align_words(lines)
 
-    def align_words(self, lines):
-        # self.log.info(json.dumps(lines[0].alignment))
+    def align_words(self, lines: List[Alignment]) -> None:
+        # self.logger.info(json.dumps(lines[0].alignment))
         mregion = lines[0].region.get_Word()
         oregion = [lines[i].region.get_Word() for i in range(1, len(lines))]
         for word in lines[0].alignment['wordAlignments']:
-            self.log.debug("aligning word %s", word['master'])
+            self.logger.debug("aligning word %s", word['master'])
             master, rest = self.find_word([word['master']], mregion, "master")
             mregion = rest
             if master is None or len(master) != 1:
-                self.log.warn("cannot find {}; giving up".format(word['master']))
+                self.logger.warning("cannot find {}; giving up".format(word['master']))
                 # raise Exception("cannot find {}; giving up".format(word['master']))
                 return
             others = list()
             for i, other in enumerate(word['alignments']):
                 match, rest = self.find_word(other, oregion[i])
                 if match is None:
-                    self.log.warn("cannot find {}; giving up".format(other))
+                    self.logger.warning(f"cannot find {other}; giving up")
                     return
                 others.append(match)
                 oregion[i] = rest
             words = list()
             words.append(
-                Alignment(lines[0].input_file, master, lines[0].alignment))
+                Alignment(lines[0].file_grp, lines[0].pcgts, master, lines[0].alignment))
             for i, other in enumerate(others):
                 words.append(Alignment(
-                    lines[i+1].input_file,
+                    lines[i+1].file_grp,
+                    lines[i+1].pcgts,
                     other,
                     lines[i+1].alignment))
             self.align_word_regions(words)
 
-    def align_word_regions(self, words):
+    def align_word_regions(self, words: List[Alignment]) -> None:
         def te0(x):
-            return x.get_TextEquiv()[0]
+            return x.TextEquiv[0]
         for i, word in enumerate(words):
             if not word.region:
-                ifg = word.input_file.input_file_group
-                self.log.debug("(empty) word alignment: [%s]", ifg)
+                ifg = word.file_grp
+                self.logger.debug("(empty) word alignment: [%s]", ifg)
                 te = TextEquivType(
                     dataType="other",
                     dataTypeDetails="ocrd-cis-empty-word-alignment:" + ifg)
@@ -153,50 +126,42 @@ def te0(x):
                 continue
             _str = " ".join([te0(x).Unicode for x in word.region])
             _id = ",".join([x.get_id() for x in word.region])
-            ifg = word.input_file.input_file_group
-            ddt = word.input_file.input_file_group + "/" + _id
+            ifg = word.file_grp
+            ddt = word.file_grp + "/" + _id
             # if conf is none it is most likely ground truth data
             conf = min([float(te0(x).get_conf() or "1.0") for x in word.region])
-            self.log.debug("word alignment: %s [%s - %s]", _str, _id, ifg)
+            self.logger.debug(f"word alignment: {_str} [{_id} - {ifg}]")
             if i != 0:
                 te = TextEquivType(
-                    Unicode=_str,
-                    conf=conf,
-                    dataType="other",
-                    dataTypeDetails="ocrd-cis-word-alignment:" + ddt)
+                    Unicode=_str, conf=conf, dataType="other", dataTypeDetails=f"ocrd-cis-word-alignment:{ddt}")
                 words[0].region[0].add_TextEquiv(te)
             else:
                 words[0].region[0].get_TextEquiv()[i].set_dataType("other")
-                words[0].region[0].get_TextEquiv()[i].set_dataTypeDetails(
-                    "ocrd-cis-word-alignment-master-ocr:" + ddt)
+                words[0].region[0].get_TextEquiv()[i].set_dataTypeDetails(f"ocrd-cis-word-alignment-master-ocr:{ddt}")
             words[0].region[0].get_TextEquiv()[i].set_index(i+1)
 
     def find_word(self, tokens, regions, t="other"):
-        self.log.debug("tokens = %s [%s]", tokens, t)
+        tokens_str = f"tokens = {tokens} [{t}]"
+        self.logger.debug(tokens_str)
         for i, _ in enumerate(regions):
             n = self.match_tokens(tokens, regions, i)
             if n == 0:
                 continue
             return tuple([regions[i:n], regions[i:]])
         # not found try again with levenshtein
-        self.log.warn(
-            "could not find tokens = %s [%s]; trying again",
-            tokens, t)
+        self.logger.warning(f"could not find {tokens_str}; trying again")
        for i, _ in enumerate(regions):
             n = self.match_tokens_lev(tokens, regions, i)
             if n == 0:
                 continue
             return tuple([regions[i:n], regions[i:]])
         # not found try again to match token within another one
-        self.log.warn(
-            "could not find tokens = %s [%s]; trying again",
-            tokens, t)
+        self.logger.warning(f"could not find {tokens_str}; trying again")
         for i, _ in enumerate(regions):
             n = self.match_tokens_within(tokens, regions, i)
             if n == 0:
                 continue
             return tuple([regions[i:n], regions[i:]])
-        # nothing could be found
         return tuple([None, regions])
 
@@ -212,7 +177,7 @@ def match_tokens_lev(self, tokens, regions, i):
         def f(a, b):
             k = 3  # int(len(a)/3)
             d = Levenshtein.distance(a, b)
-            self.log.debug("lev %s <=> %s: %d (%d)", a, b, d, d)
+            self.logger.debug(f"lev {a} <=> {b}: {d} ({d})")
             return d <= 1 or d <= k
         return self.match_tokens_lambda(tokens, regions, i, f)
 
@@ -227,14 +192,15 @@ def match_tokens_lambda(self, tokens, regions, i, f):
         Returns 0 if nothing could be matched.
         """
         for j, token in enumerate(tokens):
-            if j + i >= len(regions):
+            sum_i_j = j + i
+            if sum_i_j >= len(regions):
                 return 0
-            if not regions[i+j].get_TextEquiv()[0].Unicode:
-                self.log.warn("cannot find %s", token)
+            unicode = regions[sum_i_j].TextEquiv[0].Unicode
+            if not unicode:
+                self.logger.warning(f"cannot find {token}")
                 return 0
-            self.log.debug('checking %s with %s', token,
-                           regions[i+j].get_TextEquiv()[0].Unicode)
-            if f(token, regions[i+j].get_TextEquiv()[0].Unicode):
+            self.logger.debug(f'checking {token} with {unicode}')
+            if f(token, unicode):
                 continue
             if j == 0:
                 return 0
@@ -244,69 +210,29 @@ def match_tokens_lambda(self, tokens, regions, i, f):
             i += 1
         return i + len(tokens)
 
-    def open_input_file_tuples(self, ift):
-        """
-        opens all xml files of the given input file tuple
-        and returns them as tuples
-        """
-        res = list()
-        for ifile in ift:
-            pcgts = ifile.open()
-            res.append(pcgts)
-        return tuple(res)
-
-    def zip_input_files(self, ifgs):
-        """Zip files of the given input file groups"""
-        files = list()
-        for ifg in ifgs:
-            self.log.info("input file group: %s", ifg)
-            ifiles = sorted(
-                self.workspace.mets.find_files(fileGrp=ifg),
-                key=lambda ifile: ifile.url)
-            for i in ifiles:
-                self.log.debug("sorted file: %s %s",
-                               os.path.basename(i.url), i.ID)
-            ifiles = [FileAlignment(self.workspace, x, ifg) for x in ifiles]
-            files.append(ifiles)
-        return zip(*files)
-
-    def read_lines_from_input_file(self, ifile):
-        self.log.info("reading input file: %s", ifile)
+    def run_java_aligner(self, input_pcgts: List[OcrdPage]) -> str:
         lines = list()
-        pcgts = ifile.open()
-        for region in pcgts.get_Page().get_TextRegion():
-            for line in region.get_TextLine():
-                lines.append(get_textequiv_unicode(line))
-        return lines
-
-    def run_java_aligner(self, ifs):
-        lines = list()
-        for ifile in ifs:
-            lines.append(self.read_lines_from_input_file(ifile))
+        for pcgts in input_pcgts:
+            lines.append([get_textequiv_unicode(line)
+                          for line in pcgts.get_Page().get_AllTextLines()])
+        # JavaAligner expects a strange input format
         lines = zip(*lines)
         _input = [x.strip() for t in lines for x in t]
         for i in _input:
-            self.log.debug("input line: %s", i)
-        n = len(ifs)
-        self.log.debug("starting java client")
-        p = JavaAligner(n, getLevelName(self.log.getEffectiveLevel()))
+            self.logger.debug("input line: %s", i)
+        n = len(input_pcgts)
+        self.logger.debug("starting java client")
+        p = JavaAligner(n, getLevelName(self.logger.getEffectiveLevel()))
         return p.run("\n".join(_input))
 
-class FileAlignment:
-    def __init__(self, workspace, ifile, ifg):
-        self.workspace = workspace
-        self.input_file = ifile
-        self.input_file_group = ifg
-        self.log = getLogger('cis.FileAlignment')
-
-    def open(self):
-        self.log.info("opening: %s", os.path.basename(self.input_file.url))
-        return page_from_file(self.workspace.download_file(self.input_file))
-
-
 class Alignment:
-    def __init__(self, ifile, region, alignment):
-        self.input_file = ifile
+    file_grp: str
+    pcgts: OcrdPage
+    region: TextRegionType
+    alignment: dict
+    def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: dict):
+        self.file_grp = file_grp
+        self.pcgts = pcgts
         self.region = region
         self.alignment = alignment
diff --git a/ocrd_cis/data/__main__.py b/ocrd_cis/data/__main__.py
index 3d8ef735..8fdcddd6 100644
--- a/ocrd_cis/data/__main__.py
+++ b/ocrd_cis/data/__main__.py
@@ -1,18 +1,18 @@
-import pkg_resources
 import sys
+from ocrd_utils import resource_filename
 
 def main():
     usage = 'usage: ' + sys.argv[0] + ' -jar|-3gs|-model|-config'
     if '-h' in sys.argv:
         print(usage)
     elif '-jar' in sys.argv:
-        print(pkg_resources.resource_filename('ocrd_cis', 'data/ocrd-cis.jar'))
+        print(resource_filename('ocrd_cis', 'data/ocrd-cis.jar'))
     elif '-3gs' in sys.argv:
-        print(pkg_resources.resource_filename('ocrd_cis', 'data/3gs.csv.gz'))
+        print(resource_filename('ocrd_cis', 'data/3gs.csv.gz'))
     elif '-model' in sys.argv:
-        print(pkg_resources.resource_filename('ocrd_cis', 'data/model.zip'))
+        print(resource_filename('ocrd_cis', 'data/model.zip'))
     elif '-config' in sys.argv:
-        print(pkg_resources.resource_filename('ocrd_cis', 'data/config.json'))
+        print(resource_filename('ocrd_cis', 'data/config.json'))
     else:
         raise ValueError(usage)
diff --git a/ocrd_cis/div/cutter.py b/ocrd_cis/div/cutter.py
index ee187a1b..6dc6a9a9 100644
--- a/ocrd_cis/div/cutter.py
+++ b/ocrd_cis/div/cutter.py
@@ -26,7 +26,7 @@ def bounding_box(coord_points):
 def resize_keep_ratio(image, baseheight=48):
     hpercent = (baseheight / float(image.size[1]))
     wsize = int((float(image.size[0] * float(hpercent))))
-    image = image.resize((wsize, baseheight), Image.ANTIALIAS)
+    image = image.resize((wsize, baseheight), Image.LANCZOS)
     return image
diff --git a/ocrd_cis/div/eval.py b/ocrd_cis/div/eval.py
index 6efe90c6..f47682ff 100644
--- a/ocrd_cis/div/eval.py
+++ b/ocrd_cis/div/eval.py
@@ -1,6 +1,6 @@
 import os
 from PIL import Image
-from Levenshtein import distance
+from rapidfuzz.distance.Levenshtein import distance
 
 path = '/mnt/c/Users/chris/Documents/projects/OCR-D/daten/gt/lines/'
diff --git a/ocrd_cis/div/stats.py b/ocrd_cis/div/stats.py
index ea385d98..6f9c9816 100644
--- a/ocrd_cis/div/stats.py
+++ b/ocrd_cis/div/stats.py
@@ -4,7 +4,7 @@
 from ocrd import Processor
 from ocrd_cis import get_ocrd_tool
 from ocrd_models.ocrd_page_generateds import parse
-from Levenshtein import distance
+from rapidfuzz.distance import Levenshtein
 
 class Stats(Processor):
@@ -81,7 +81,7 @@ def process(self):
                     # print(line.get_TextEquiv()[2].dataType)
                     unicodeline = line.get_TextEquiv()[i].Unicode
-                    d[i] += distance(gtline, unicodeline)
+                    d[i] += Levenshtein.distance(gtline, unicodeline)
 
                     # words = line.get_Word()
                     # for word in words:
diff --git a/ocrd_cis/javaprocess.py b/ocrd_cis/javaprocess.py
index ce2f6bfd..72915d68 100644
--- a/ocrd_cis/javaprocess.py
+++ b/ocrd_cis/javaprocess.py
@@ -1,12 +1,11 @@
 import subprocess
 import json
-import pkg_resources
-from ocrd_utils import getLogger
+from ocrd_utils import getLogger, resource_filename
 from pathlib import Path
 
 MAIN = "de.lmu.cis.ocrd.cli.Main"
 
-JAR = pkg_resources.resource_filename('ocrd_cis', 'data/ocrd-cis.jar')
+JAR = str(resource_filename('ocrd_cis', 'data/ocrd-cis.jar'))
 
 def JavaAligner(n, loglvl):
     """Create a java process that calls -c align -D '{"n":n}'"""
diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json
index 74c0d0c9..472ea5ab 100644
--- a/ocrd_cis/ocrd-tool.json
+++ b/ocrd_cis/ocrd-tool.json
@@ -1,6 +1,7 @@
 {
   "git_url": "https://github.com/cisocrgroup/ocrd_cis",
-  "version": "0.1.5",
+  "version": "0.2.0",
+  "dockerhub": "ocrd/cis",
   "tools": {
     "ocrd-cis-ocropy-binarize": {
       "executable": "ocrd-cis-ocropy-binarize",
@@ -12,17 +13,9 @@
         "preprocessing/optimization/grayscale_normalization",
         "preprocessing/optimization/deskewing"
       ],
-      "input_file_grp": [
-        "OCR-D-IMG",
-        "OCR-D-SEG-BLOCK",
-        "OCR-D-SEG-LINE"
-      ],
-      "output_file_grp": [
-        "OCR-D-IMG-BIN",
-        "OCR-D-SEG-BLOCK",
-        "OCR-D-SEG-LINE"
-      ],
-      "description": "Binarize (and optionally deskew/despeckle) pages / regions / lines with ocropy",
+      "input_file_grp_cardinality": 1,
+      "output_file_grp_cardinality": 1,
+      "description": "Binarize (and optionally deskew/despeckle) pages / regions / lines with Ocropy v1",
       "parameters": {
         "method": {
           "type": "string",
@@ -75,15 +68,9 @@
       "steps": [
         "preprocessing/optimization/deskewing"
       ],
-      "input_file_grp": [
-        "OCR-D-SEG-BLOCK",
-        "OCR-D-SEG-LINE"
-      ],
-      "output_file_grp": [
-        "OCR-D-SEG-BLOCK",
-        "OCR-D-SEG-LINE"
-      ],
-      "description": "Deskew regions with ocropy (by annotating orientation angle and adding AlternativeImage)",
+      "input_file_grp_cardinality": 1,
+      "output_file_grp_cardinality": 1,
+      "description": "Deskew regions with Ocropy v1 (by annotating orientation angle and adding AlternativeImage)",
       "parameters": {
         "maxskew": {
           "type": "number",
@@ -106,17 +93,9 @@
       "steps": [
         "preprocessing/optimization/despeckling"
       ],
-      "input_file_grp": [
-        "OCR-D-IMG",
-        "OCR-D-SEG-BLOCK",
-        "OCR-D-SEG-LINE"
-      ],
-      "output_file_grp": [
-        "OCR-D-IMG-DESPECK",
-        "OCR-D-SEG-BLOCK",
-        "OCR-D-SEG-LINE"
-      ],
-      "description": "Despeckle pages / regions / lines with ocropy",
+      "input_file_grp_cardinality": 1,
+      "output_file_grp_cardinality": 1,
+      "description": "Despeckle pages / regions / lines with Ocropy v1",
       "parameters": {
         "noise_maxsize": {
           "type": "number",
@@ -147,14 +126,8 @@
         "layout/segmentation/region",
         "layout/segmentation/line"
       ],
-      "input_file_grp": [
-        "OCR-D-SEG-BLOCK",
-        "OCR-D-SEG-LINE"
-      ],
-      "output_file_grp": [
-        "OCR-D-SEG-BLOCK",
-        "OCR-D-SEG-LINE"
-      ],
+      "input_file_grp_cardinality": 1,
+      "output_file_grp_cardinality": 1,
       "description": "Clip text regions / lines at intersections with neighbours",
       "parameters": {
         "level-of-operation": {
@@ -185,13 +158,9 @@
       "steps": [
         "layout/segmentation/line"
       ],
-      "input_file_grp": [
-        "OCR-D-SEG-LINE"
-      ],
-      "output_file_grp": [
-        "OCR-D-SEG-LINE"
-      ],
-      "description": "Resegment text lines",
+      "input_file_grp_cardinality": 1,
+      "output_file_grp_cardinality": 1,
+      "description": "Improve coordinates of text lines",
       "parameters": {
         "level-of-operation": {
           "type": "string",
@@ -205,6 +174,11 @@
           "description": "source for new line polygon candidates ('lineest' for line estimation, i.e. how Ocropy would have segmented text lines; 'baseline' tries to re-polygonize from the baseline annotation; 'ccomps' avoids crossing connected components by majority rule)",
           "default": "lineest"
         },
+        "baseline_only": {
+          "type": "boolean",
+          "description": "ignore existing textline coords completely and use baseline as input if possible",
+          "default": false
+        },
         "dpi": {
           "type": "number",
           "format": "float",
@@ -217,10 +191,16 @@
           "description": "share of foreground pixels that must be retained by the output polygons",
           "default": 0.75
         },
+        "spread": {
+          "type": "number",
+          "format": "float",
+          "description": "distance in points (pt) from the foreground to project textline labels into the background for polygonal contours; if zero, project half a scale/capheight",
+          "default": 2.4
+        },
         "extend_margins": {
           "type": "number",
           "format": "integer",
-          "description": "number of pixels to extend the input polygons in all directions",
+          "description": "(ignored)",
           "default": 3
         }
       }
@@ -234,12 +214,8 @@
         "preprocessing/optimization/dewarping"
       ],
       "description": "Dewarp line images with ocropy",
-      "input_file_grp": [
-        "OCR-D-SEG-LINE"
-      ],
-      "output_file_grp": [
-        "OCR-D-SEG-LINE"
-      ],
+      "input_file_grp_cardinality": 1,
+      "output_file_grp_cardinality": 1,
       "parameters": {
         "dpi": {
           "type": "number",
@@ -275,15 +251,9 @@
       "steps": [
         "recognition/text-recognition"
       ],
-      "description": "Recognize text in (binarized+deskewed+dewarped) lines with ocropy",
-      "input_file_grp": [
-        "OCR-D-SEG-LINE",
-        "OCR-D-SEG-WORD",
-        "OCR-D-SEG-GLYPH"
-      ],
-      "output_file_grp": [
-        "OCR-D-OCR-OCRO"
-      ],
+      "description": "Recognize text in (binarized+deskewed+dewarped) lines with Ocropy v1",
+      "input_file_grp_cardinality": 1,
+      "output_file_grp_cardinality": 1,
       "parameters": {
         "textequiv_level": {
           "type": "string",
@@ -293,29 +263,37 @@
         },
         "model": {
           "type": "string",
-          "description": "ocropy model to apply (e.g. fraktur.pyrnn)"
+          "format": "uri",
+          "content-type": "application/gzip",
+          "description": "ocropy model to apply (e.g. fraktur.pyrnn.gz)"
         }
-      }
-    },
-    "ocrd-cis-ocropy-rec": {
-      "executable": "ocrd-cis-ocropy-rec",
-      "categories": [
-        "Text recognition and optimization"
-      ],
-      "steps": [
-        "recognition/text-recognition"
-      ],
-      "input_file_grp": [
-        "OCR-D-GT-SEG-BLOCK",
-        "OCR-D-SEG-BLOCK"
-      ],
-      "description": "Recognize text snippets",
-      "parameters": {
-        "model": {
-          "type": "string",
-          "description": "ocropy model to apply (e.g. fraktur.pyrnn)"
+      },
+      "resources": [
+        {
+          "url": "https://github.com/zuphilip/ocropy-models/raw/master/en-default.pyrnn.gz",
+          "name": "en-default.pyrnn.gz",
+          "description": "Default ocropy model for English",
+          "size": 83826134
+        },
+        {
+          "url": "https://github.com/zuphilip/ocropy-models/raw/master/fraktur.pyrnn.gz",
+          "name": "fraktur.pyrnn.gz",
+          "description": "Default ocropy fraktur model",
+          "size": 43882365
+        },
+        {
+          "url": "https://github.com/jze/ocropus-model_fraktur/raw/master/fraktur.pyrnn.gz",
+          "name": "fraktur-jze.pyrnn.gz",
+          "description": "ocropy fraktur model by github.com/jze",
+          "size": 2961298
+        },
+        {
+          "url": "https://github.com/chreul/OCR_Testdata_EarlyPrintedBooks/raw/master/LatinHist-98000.pyrnn.gz",
+          "name": "LatinHist.pyrnn.gz",
+          "description": "ocropy historical latin model by github.com/chreul",
+          "size": 16989864
         }
-      }
+      ]
     },
     "ocrd-cis-ocropy-segment": {
       "executable": "ocrd-cis-ocropy-segment",
@@ -326,14 +304,9 @@
         "layout/segmentation/region",
         "layout/segmentation/line"
      ],
-      "input_file_grp": [
-        "OCR-D-GT-SEG-BLOCK",
-        "OCR-D-SEG-BLOCK"
-      ],
-      "output_file_grp": [
-        "OCR-D-SEG-LINE"
-      ],
-      "description": "Segment pages into regions and lines, tables into cells and lines, or regions into lines with ocropy",
+      "input_file_grp_cardinality": 1,
+      "output_file_grp_cardinality": 1,
+      "description": "Segment pages into regions and lines, tables into cells and lines, or regions into lines with Ocropy v1",
       "parameters": {
         "dpi": {
           "type": "number",
@@ -425,21 +398,21 @@
       "steps": [
         "recognition/text-recognition"
       ],
-      "input_file_grp": [
-        "OCR-D-GT-SEG-BLOCK",
-        "OCR-D-SEG-BLOCK"
-      ],
-      "description": "train model with ground truth from mets data",
+      "input_file_grp_cardinality": 1,
+      "output_file_grp_cardinality": 1,
+      "description": "train Ocropy v1 text recognition model with PAGE ground truth from the input fileGrp extracted as file pairs into the output fileGrp",
       "parameters": {
         "textequiv_level": {
           "type": "string",
-          "description": "PAGE XML hierarchy level granularity",
+          "description": "hierarchy level to extract GT pairs from",
           "enum": ["line", "word", "glyph"],
           "default": "line"
         },
         "model": {
           "type": "string",
-          "description": "load model or crate new one (e.g. fraktur.pyrnn)"
+          "format": "uri",
+          "content-type": "application/gzip",
+          "description": "load model (e.g. 'fraktur.pyrnn.gz') to init weights, or none to train from scratch"
         },
         "ntrain": {
           "type": "number",
@@ -449,7 +422,8 @@
         },
         "outputpath": {
           "type": "string",
-          "description": "(existing) path for the trained model"
+          "default": "output",
+          "description": "directory path for the trained model"
         }
       }
     },
@@ -461,15 +435,9 @@
       "steps": [
         "recognition/post-correction"
       ],
-      "input_file_grp": [
-        "OCR-D-OCR-1",
-        "OCR-D-OCR-2",
-        "OCR-D-OCR-N"
-      ],
-      "output_file_grp": [
-        "OCR-D-ALIGNED"
-      ],
-      "description": "Align multiple OCRs and/or GTs"
+      "input_file_grp_cardinality": [2, -1],
+      "output_file_grp_cardinality": 1,
+      "description": "Align multiple OCRs and/or GTs textually on line/word level"
     },
     "ocrd-cis-postcorrect": {
       "executable": "ocrd-cis-postcorrect",
@@ -480,12 +448,8 @@
         "recognition/post-correction"
       ],
       "description": "Post correct OCR results",
-      "input_file_grp": [
-        "OCR-D-LINE-ALIGNED"
-      ],
-      "output_file_grp": [
-        "OCR-D-POST-CORRECTED"
-      ],
+      "input_file_grp_cardinality": 1,
+      "output_file_grp_cardinality": 1,
       "parameters": {
         "maxCandidates": {
           "description": "Maximum number of considered correction candidates per suspicious token",
diff --git a/ocrd_cis/ocrd_tool.py b/ocrd_cis/ocrd_tool.py
deleted file mode 100644
index 8cd184fb..00000000
--- a/ocrd_cis/ocrd_tool.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import json
-from pkg_resources import resource_string
-
-
-def get_ocrd_tool():
-    return json.loads(
-        resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
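Note: get_ocrd_tool() is deleted without a drop-in replacement (under OCR-D core v3 the Processor base class resolves its own tool metadata), yet ocrd_cis/div/stats.py above still imports it. A hedged sketch of how such a caller could load the packaged tool description instead (an assumption, not part of the patch):

    import json
    from ocrd_utils import resource_filename

    with open(resource_filename('ocrd_cis', 'ocrd-tool.json'), encoding='utf-8') as f:
        ocrd_tool = json.load(f)
    print(ocrd_tool['version'])  # '0.2.0' after this change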
diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py
index 6092d3d5..9a55301d 100644
--- a/ocrd_cis/ocropy/binarize.py
+++ b/ocrd_cis/ocropy/binarize.py
@@ -1,38 +1,21 @@
 from __future__ import absolute_import
+from logging import Logger
+from typing import Optional
 
-import os.path
 import cv2
 import numpy as np
 from PIL import Image
-#import kraken.binarization
+from ocrd_utils import getLogger
+from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage
+from ocrd import Processor, OcrdPageResult, OcrdPageResultImage
 
-from ocrd_utils import (
-    getLogger,
-    make_file_id,
-    assert_file_grp_cardinality,
-    MIMETYPE_PAGE
-)
-from ocrd_modelfactory import page_from_file
-from ocrd_models.ocrd_page import (
-    to_xml, AlternativeImageType
-)
-from ocrd import Processor
-
-from .. import get_ocrd_tool
 from . import common
-from .common import (
-    pil2array, array2pil,
-    # binarize,
-    remove_noise)
-
-#sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from .common import array2pil, determine_zoom, pil2array, remove_noise
 
-TOOL = 'ocrd-cis-ocropy-binarize'
 
-def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0):
-    LOG = getLogger('processor.OcropyBinarize')
-    LOG.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method)
+def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0):
+    logger.debug(f'Binarizing {pil_image.width}x{pil_image.height} image with method={method}')
     if method == 'none':
         # useful if the images are already binary,
         # but lack image attribute `binarized`
@@ -54,42 +37,33 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo
 
     if method == 'global':
         # global thresholding
-        _, th = cv2.threshold(img,threshold*255,255,cv2.THRESH_BINARY)
+        _, th = cv2.threshold(img, threshold * 255, 255, cv2.THRESH_BINARY)
     elif method == 'otsu':
         # Otsu's thresholding
-        _, th = cv2.threshold(img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
+        _, th = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
     elif method == 'gauss-otsu':
         # Otsu's thresholding after Gaussian filtering
         blur = cv2.GaussianBlur(img, (5, 5), 0)
-        _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)
+        _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
     else:
         raise Exception('unknown binarization method %s' % method)
     return Image.fromarray(th), 0
 
-
 class OcropyBinarize(Processor):
+    @property
+    def executable(self):
+        return 'ocrd-cis-ocropy-binarize'
 
-    def __init__(self, *args, **kwargs):
-        self.ocrd_tool = get_ocrd_tool()
-        kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL]
-        kwargs['version'] = self.ocrd_tool['version']
-        super(OcropyBinarize, self).__init__(*args, **kwargs)
-        if hasattr(self, 'output_file_grp'):
-            # processing context
-            self.setup()
     def setup(self):
-        self.logger = getLogger('processor.OcropyBinarize')
-        if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy':
-            self.logger.critical('requested method %s does not support grayscale normalized output',
-                                 self.parameter['method'])
-            raise Exception('only method=ocropy allows grayscale=true')
+        method = self.parameter['method']
+        if self.parameter['grayscale'] and method != 'ocropy':
+            self.logger.critical(f'Requested method {method} does not support grayscale normalized output')
+            raise ValueError('only method=ocropy allows grayscale=true')
 
-    def process(self):
+    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
         """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace.
 
-        Open and deserialise PAGE input files and their respective images,
-        then iterate over the element hierarchy down to the requested
+        Iterate over the PAGE-XML element hierarchy down to the requested
         ``level-of-operation``.
 
         Next, for each file, crop each segment image according to the layout
@@ -105,80 +79,61 @@ def process(self):
 
         Reference each new image in the AlternativeImage of the element.
 
-        Produce a new output file by serialising the resulting hierarchy.
+        Return a PAGE-XML with new AlternativeImage(s) and the arguments
+        for ``workspace.save_image_file``.
         """
         level = self.parameter['level-of-operation']
-        assert_file_grp_cardinality(self.input_file_grp, 1)
-        assert_file_grp_cardinality(self.output_file_grp, 1)
+        assert self.workspace
+        self.logger.debug(f'Level of operation: "{level}"')
 
-        for (n, input_file) in enumerate(self.input_files):
-            self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
-            file_id = make_file_id(input_file, self.output_file_grp)
+        pcgts = input_pcgts[0]
+        assert pcgts
+        page = pcgts.get_Page()
+        assert page
 
-            pcgts = page_from_file(self.workspace.download_file(input_file))
-            self.add_metadata(pcgts)
-            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
-            page = pcgts.get_Page()
-
-            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
-                page, page_id, feature_filter='binarized')
-            if self.parameter['dpi'] > 0:
-                zoom = 300.0/self.parameter['dpi']
-            elif page_image_info.resolution != 1:
-                dpi = page_image_info.resolution
-                if page_image_info.resolutionUnit == 'cm':
-                    dpi *= 2.54
-                self.logger.info('Page "%s" uses %f DPI', page_id, dpi)
-                zoom = 300.0/dpi
-            else:
-                zoom = 1
-
-            if level == 'page':
-                self.process_page(page, page_image, page_xywh, zoom,
-                                  input_file.pageId, file_id)
-            else:
-                if level == 'table':
-                    regions = page.get_TableRegion()
-                else: # region
-                    regions = page.get_AllRegions(classes=['Text'], order='reading-order')
-                if not regions:
-                    self.logger.warning('Page "%s" contains no text regions', page_id)
-                for region in regions:
-                    region_image, region_xywh = self.workspace.image_from_segment(
-                        region, page_image, page_xywh, feature_filter='binarized')
-                    if level == 'region':
-                        self.process_region(region, region_image, region_xywh, zoom,
-                                            input_file.pageId, file_id + '_' + region.id)
-                        continue
-                    lines = region.get_TextLine()
-                    if not lines:
-                        self.logger.warning('Page "%s" region "%s" contains no text lines',
-                                            page_id, region.id)
-                    for line in lines:
-                        line_image, line_xywh = self.workspace.image_from_segment(
-                            line, region_image, region_xywh, feature_filter='binarized')
-                        self.process_line(line, line_image, line_xywh, zoom,
-                                          input_file.pageId, region.id,
-                                          file_id + '_' + region.id + '_' + line.id)
+        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
+            page, page_id, feature_filter='binarized')
+        zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info)
 
-            # update METS (add the PAGE file):
-            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
-            pcgts.set_pcGtsId(file_id)
-            out = self.workspace.add_file(
-                ID=file_id,
-                file_grp=self.output_file_grp,
-                pageId=input_file.pageId,
-                local_filename=file_path,
-                mimetype=MIMETYPE_PAGE,
-                content=to_xml(pcgts))
-            self.logger.info('created file ID: %s, file_grp: %s, path: %s',
-                             file_id, self.output_file_grp, out.local_filename)
+        result = OcrdPageResult(pcgts)
+        if level == 'page':
+            try:
+                result.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id))
+            except ValueError as e:
+                self.logger.error(e)
+        else:
+            if level == 'table':
+                regions = page.get_TableRegion()
+            else:  # region
+                regions = page.get_AllRegions(classes=['Text'], order='reading-order')
+            if not regions:
+                self.logger.warning(f"Page '{page_id}' contains no regions")
+            for region in regions:
+                region_image, region_xywh = self.workspace.image_from_segment(
+                    region, page_image, page_xywh, feature_filter='binarized')
+                if level == 'region':
+                    try:
+                        result.images.append(self.process_region(region, region_image, region_xywh, zoom, region.id))
+                        continue
+                    except ValueError as e:
+                        self.logger.error(e)
+                lines = region.get_TextLine()
+                if not lines:
+                    self.logger.warning(f"Page '{page_id}' region '{region.id}' contains no text lines")
+                for line in lines:
+                    line_image, line_xywh = self.workspace.image_from_segment(
+                        line, region_image, region_xywh, feature_filter='binarized')
+                    try:
+                        result.images.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id))
+                    except ValueError as e:
+                        self.logger.error(e)
+        return result
 
-    def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id):
+    def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageResultImage:
         if not page_image.width or not page_image.height:
-            self.logger.warning("Skipping page '%s' with zero size", page_id)
-            return
-        self.logger.info("About to binarize page '%s'", page_id)
+            raise ValueError(f"Skipping page '{page_id}' with zero size")
+        self.logger.info(f"About to binarize page '{page_id}'")
+
         features = page_xywh['features']
         if 'angle' in page_xywh and page_xywh['angle']:
             # orientation has already been annotated (by previous deskewing),
@@ -186,67 +141,64 @@
             maxskew = 0
         else:
             maxskew = self.parameter['maxskew']
-        bin_image, angle = binarize(page_image,
-                                    method=self.parameter['method'],
-                                    maxskew=maxskew,
-                                    threshold=self.parameter['threshold'],
-                                    nrm=self.parameter['grayscale'],
-                                    zoom=zoom)
+        bin_image, angle = binarize(
+            self.logger,
+            page_image,
+            method=self.parameter['method'],
+            maxskew=maxskew,
+            threshold=self.parameter['threshold'],
+            nrm=self.parameter['grayscale'],
+            zoom=zoom)
         if angle:
             features += ',deskewed'
         page_xywh['angle'] = angle
         if self.parameter['noise_maxsize']:
-            bin_image = remove_noise(
-                bin_image, maxsize=self.parameter['noise_maxsize'])
+            bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize'])
             features += ',despeckled'
         # annotate angle in PAGE (to allow consumers of the AlternativeImage
         # to do consistent coordinate transforms, and non-consumers
         # to redo the rotation themselves):
         orientation = -page_xywh['angle']
-        orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180]
+        orientation = 180 - (180 - orientation) % 360  # map to [-179.999,180]
         page.set_orientation(orientation)
-        # update METS (add the image file):
         if self.parameter['grayscale']:
-            file_id += '.IMG-NRM'
+            suffix = '.IMG-NRM'
             features += ',grayscale_normalized'
         else:
-            file_id += '.IMG-BIN'
+            suffix = '.IMG-BIN'
             features += ',binarized'
-        file_path = self.workspace.save_image_file(
-            bin_image,
-            file_id,
-            page_id=page_id,
-            file_grp=self.output_file_grp)
         # update PAGE (reference the image file):
-        page.add_AlternativeImage(AlternativeImageType(
-            filename=file_path,
-            comments=features))
+        alt_image = AlternativeImageType(comments=features)
+        page.add_AlternativeImage(alt_image)
+        return OcrdPageResultImage(bin_image, suffix, alt_image)
 
-    def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id):
+    def process_region(self, region, region_image, region_xywh, zoom, page_id) -> OcrdPageResultImage:
         if not region_image.width or not region_image.height:
-            self.logger.warning("Skipping region '%s' with zero size", region.id)
-            return
-        self.logger.info("About to binarize page '%s' region '%s'", page_id, region.id)
+            raise ValueError(f"Skipping region '{region.id}' with zero size")
+        self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'")
        features = region_xywh['features']
         if 'angle' in region_xywh and region_xywh['angle']:
             # orientation has already been annotated (by previous deskewing),
             # so skip deskewing here:
-            bin_image, _ = binarize(region_image,
-                                    method=self.parameter['method'],
-                                    maxskew=0,
-                                    nrm=self.parameter['grayscale'],
-                                    zoom=zoom)
+            bin_image, _ = binarize(
+                self.logger,
+                region_image,
+                method=self.parameter['method'],
+                maxskew=0,
+                nrm=self.parameter['grayscale'],
+                zoom=zoom)
         else:
-            bin_image, angle = binarize(region_image,
-                                        method=self.parameter['method'],
-                                        maxskew=self.parameter['maxskew'],
-                                        nrm=self.parameter['grayscale'],
-                                        zoom=zoom)
+            bin_image, angle = binarize(
+                self.logger,
+                region_image,
+                method=self.parameter['method'],
+                maxskew=self.parameter['maxskew'],
+                nrm=self.parameter['grayscale'],
+                zoom=zoom)
             if angle:
                 features += ',deskewed'
             region_xywh['angle'] = angle
-        bin_image = remove_noise(bin_image,
-                                 maxsize=self.parameter['noise_maxsize'])
+        bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize'])
         if self.parameter['noise_maxsize']:
             features += ',despeckled'
         # annotate angle in PAGE (to allow consumers of the AlternativeImage
@@ -255,35 +207,30 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_
         orientation = -region_xywh['angle']
         orientation = 180 - (180 - orientation) % 360  # map to [-179.999,180]
         region.set_orientation(orientation)
-        # update METS (add the image file):
+        suffix = f'{region.id}'
         if self.parameter['grayscale']:
-            file_id += '.IMG-NRM'
+            suffix += '.IMG-NRM'
             features += ',grayscale_normalized'
         else:
-            file_id += '.IMG-BIN'
+            suffix += '.IMG-BIN'
             features += ',binarized'
-        file_path = self.workspace.save_image_file(
-            bin_image,
-            file_id,
-            page_id=page_id,
-            file_grp=self.output_file_grp)
         # update PAGE (reference the image file):
-        region.add_AlternativeImage(AlternativeImageType(
-            filename=file_path,
-            comments=features))
+        alt_image = AlternativeImageType(comments=features)
+        region.add_AlternativeImage(alt_image)
+        return OcrdPageResultImage(bin_image, suffix, alt_image)
 
-    def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, file_id):
+    def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> OcrdPageResultImage:
         if not line_image.width or not line_image.height:
-            self.logger.warning("Skipping line '%s' with zero size", line.id)
-            return
-        self.logger.info("About to binarize page '%s' region '%s' line '%s'",
-                         page_id, region_id, line.id)
+            raise ValueError(f"Skipping line '{line.id}' with zero size")
+        self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'")
         features = line_xywh['features']
-        bin_image, angle = binarize(line_image,
-                                    method=self.parameter['method'],
-                                    maxskew=self.parameter['maxskew'],
-                                    nrm=self.parameter['grayscale'],
-                                    zoom=zoom)
+        bin_image, angle = binarize(
+            self.logger,
+            line_image,
+            method=self.parameter['method'],
+            maxskew=self.parameter['maxskew'],
+            nrm=self.parameter['grayscale'],
+            zoom=zoom)
         if angle:
             features += ',deskewed'
         # annotate angle in PAGE (to allow consumers of the AlternativeImage
@@ -292,25 +239,19 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, fi
             #orientation = -angle
             #orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180]
             #line.set_orientation(orientation) # does not exist on line level!
-            self.logger.warning("cannot add orientation %.2f to page '%s' region '%s' line '%s'",
-                                -angle, page_id, region_id, line.id)
-        bin_image = remove_noise(bin_image,
-                                 maxsize=self.parameter['noise_maxsize'])
+            self.logger.warning(
+                f"Cannot add orientation {-angle:.2f} to page '{page_id}' region '{region_id}' line '{line.id}'")
+        bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize'])
         if self.parameter['noise_maxsize']:
             features += ',despeckled'
-        # update METS (add the image file):
+        suffix = f'{region_id}_{line.id}'
         if self.parameter['grayscale']:
-            file_id += '.IMG-NRM'
+            suffix += '.IMG-NRM'
             features += ',grayscale_normalized'
         else:
-            file_id += '.IMG-BIN'
+            suffix += '.IMG-BIN'
             features += ',binarized'
-        file_path = self.workspace.save_image_file(
-            bin_image,
-            file_id,
-            page_id=page_id,
-            file_grp=self.output_file_grp)
         # update PAGE (reference the image file):
-        line.add_AlternativeImage(AlternativeImageType(
-            filename=file_path,
-            comments=features))
+        alt_image = AlternativeImageType(comments=features)
+        line.add_AlternativeImage(alt_image)
+        return OcrdPageResultImage(bin_image, suffix, alt_image)
Most notably, it can suppress graphics @@ -74,39 +60,26 @@ def process(self): # connected component analysis after implicit binarization could be # suboptimal, and the explicit binarization after clipping could be, # too. However, region-level clipping _must_ be run before region-level - # deskewing, because that would make segments incomensurable with their + # deskewing, because that would make segments incommensurable with their # neighbours. - LOG = getLogger('processor.OcropyClip') level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 - - # FIXME: what about text regions inside table regions? - regions = list(page.get_TextRegion()) - num_texts = len(regions) - regions += ( + assert self.workspace + self.logger.debug(f'Level of operation: "{level}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + # The zoom is not used anywhere + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + ret = OcrdPageResult(pcgts) + + # FIXME: what about text regions inside table regions? 
+ regions = list(page.get_TextRegion()) + num_texts = len(regions) + regions += ( page.get_AdvertRegion() + page.get_ChartRegion() + page.get_ChemRegion() + @@ -119,149 +92,121 @@ def process(self): page.get_SeparatorRegion() + page.get_TableRegion() + page.get_UnknownRegion()) - if not num_texts: - LOG.warning('Page "%s" contains no text regions', page_id) - background = ImageStat.Stat(page_image) - # workaround for Pillow#4925 - if len(background.bands) > 1: - background = tuple(background.median) - else: - background = background.median[0] + if not num_texts: + self.logger.warning(f'Page "{page_id}" contains no text regions') + background = ImageStat.Stat(page_image) + # workaround for Pillow#4925 + if len(background.bands) > 1: + background = tuple(background.median) + else: + background = background.median[0] + if level == 'region': + background_image = Image.new(page_image.mode, page_image.size, background) + page_array = pil2array(page_image) + page_bin = np.array(page_array <= midrange(page_array), np.uint8) + # in absolute coordinates merely for comparison/intersection + shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] + # in relative coordinates for mask/cropping + polygons = [coordinates_of_segment(region, page_image, page_xywh) for region in regions] + for i, polygon in enumerate(polygons[num_texts:], num_texts): + # for non-text regions, extend mask by 3 pixels in each direction + # to ensure they do not leak components accidentally + # (accounts for bad cropping of such regions in GT): + polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open + polygons[i] = polygon + masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] + for i, region in enumerate(regions): + if i >= num_texts: + break # keep non-text regions unchanged if level == 'region': - background_image = Image.new(page_image.mode, page_image.size, background) - page_array = pil2array(page_image) - page_bin = np.array(page_array <= midrange(page_array), np.uint8) - # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(region.get_Coords().points)) - for region in regions] - # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) - for region in regions] - for i, polygon in enumerate(polygons[num_texts:], num_texts): - # for non-text regions, extend mask by 3 pixels in each direction - # to ensure they do not leak components accidentally - # (accounts for bad cropping of such regions in GT): - polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open - polygons[i] = polygon - masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) - for polygon in polygons] - for i, region in enumerate(regions): - if i >= num_texts: - break # keep non-text regions unchanged - if level == 'region': - if region.get_AlternativeImage(): - # FIXME: This should probably be an exception (bad workflow configuration). 
- LOG.warning('Page "%s" region "%s" already contains image data: skipping', - page_id, region.id) - continue - shape = prep(shapes[i]) - neighbours = [(regionj, maskj) for shapej, regionj, maskj - in zip(shapes[:i] + shapes[i+1:], - regions[:i] + regions[i+1:], - masks[:i] + masks[i+1:]) - if shape.intersects(shapej)] - if neighbours: - self.process_segment(region, masks[i], polygons[i], - neighbours, background_image, - page_image, page_coords, page_bin, - input_file.pageId, file_id + '_' + region.id) + if region.get_AlternativeImage(): + # FIXME: This should probably be an exception (bad workflow configuration). + self.logger.warning(f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue - # level == 'line': - lines = region.get_TextLine() - if not lines: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + shape = prep(shapes[i]) + neighbours = [ + (regionj, maskj) for shapej, regionj, maskj in + zip(shapes[:i] + shapes[i + 1:], regions[:i] + regions[i + 1:], masks[:i] + masks[i + 1:]) + if shape.intersects(shapej)] + if neighbours: + ret.images.append(self.process_segment( + region, masks[i], polygons[i], neighbours, background_image, + page_image, page_xywh, page_bin, page_id)) + continue + # level == 'line': + lines = region.get_TextLine() + if not lines: + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') + continue + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_xywh, feature_selector='binarized') + background_image = Image.new(region_image.mode, region_image.size, background) + region_array = pil2array(region_image) + region_bin = np.array(region_array <= midrange(region_array), np.uint8) + # in absolute coordinates merely for comparison/intersection + shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] + # in relative coordinates for mask/cropping + polygons = [coordinates_of_segment(line, region_image, region_coords) for line in lines] + masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] + for j, line in enumerate(lines): + if line.get_AlternativeImage(): + # FIXME: This should probably be an exception (bad workflow configuration). + self.logger.warning( + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image data: skipping') continue - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - background_image = Image.new(region_image.mode, region_image.size, background) - region_array = pil2array(region_image) - region_bin = np.array(region_array <= midrange(region_array), np.uint8) - # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(line.get_Coords().points)) - for line in lines] - # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(line, region_image, region_coords) - for line in lines] - masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) - for polygon in polygons] - for j, line in enumerate(lines): - if line.get_AlternativeImage(): - # FIXME: This should probably be an exception (bad workflow configuration). 
- LOG.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', - page_id, region.id, line.id) - continue - shape = prep(shapes[j]) - neighbours = [(linej, maskj) for shapej, linej, maskj - in zip(shapes[:j] + shapes[j+1:], - lines[:j] + lines[j+1:], - masks[:j] + masks[j+1:]) - if shape.intersects(shapej)] - if neighbours: - self.process_segment(line, masks[j], polygons[j], - neighbours, background_image, - region_image, region_coords, region_bin, - input_file.pageId, file_id + '_' + region.id + '_' + line.id) - - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def process_segment(self, segment, segment_mask, segment_polygon, neighbours, - background_image, parent_image, parent_coords, parent_bin, - page_id, file_id): - LOG = getLogger('processor.OcropyClip') + shape = prep(shapes[j]) + neighbours = [ + (linej, maskj) for shapej, linej, maskj in + zip(shapes[:j] + shapes[j + 1:], lines[:j] + lines[j + 1:], masks[:j] + masks[j + 1:]) + if shape.intersects(shapej)] + if neighbours: + ret.images.append(self.process_segment( + line, masks[j], polygons[j], neighbours, background_image, + region_image, region_coords, region_bin, page_id)) + return ret + + def process_segment( + self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, + parent_bin, page_id + ) -> OcrdPageResultImage: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( [feature for feature in parent_coords['features'].split(',') - if feature in ['binarized', 'grayscale_normalized', - 'despeckled', 'dewarped']]) + ',clipped' + if feature in ['binarized', 'grayscale_normalized', 'despeckled', 'dewarped']]) + ',clipped' # mask segment within parent image: segment_image = image_from_polygon(parent_image, segment_polygon) segment_bbox = bbox_from_polygon(segment_polygon) for neighbour, neighbour_mask in neighbours: if not np.any(segment_mask > neighbour_mask): - LOG.info('Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"', - neighbour.id, segment.id, page_id) + self.logger.info( + f'Ignoring enclosing neighbour "{neighbour.id}" of segment "{segment.id}" on page "{page_id}"') continue # find connected components that (only) belong to the neighbour: - intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour - intruders = morph.remove_marked(intruders, segment_mask > neighbour_mask) # but exclusively + intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour + intruders = morph.remove_marked(intruders, segment_mask > neighbour_mask) # but exclusively num_intruders = np.count_nonzero(intruders) num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: continue - LOG.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', - segment.id, neighbour.id, num_intruders, num_foreground, page_id) + self.logger.debug( + f'segment "{segment.id}" vs neighbour "{neighbour.id}": suppressing {num_intruders} of ' + f'{num_foreground} pixels on page "{page_id}"') 
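# Aside: morph.keep_marked / morph.remove_marked are ocrolib helpers; the gist of the
# intruder test above can be sketched with plain scipy (a simplified re-implementation
# for illustration, not the library code):
import numpy as np
from scipy.ndimage import label

def keep_marked(binary, marker):
    # keep only those connected components of `binary` that touch the `marker` mask
    labels, _ = label(binary)
    marked = np.unique(labels[marker > 0])
    return binary * np.isin(labels, marked[marked > 0])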
# suppress in segment_mask so these intruders can stay in the neighbours # (are not removed from both sides) segment_mask -= intruders # suppress in derived image result to be annotated clip_mask = array2pil(intruders) - segment_image.paste(background_image, mask=clip_mask) # suppress in raw image + segment_image.paste(background_image, mask=clip_mask) # suppress in raw image if segment_image.mode in ['RGB', 'L', 'RGBA', 'LA']: # for consumers that do not have to rely on our # guessed background color, but can cope with transparency: segment_image.putalpha(ImageOps.invert(clip_mask)) # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): - segment_image = crop_image(segment_image,box=segment_bbox) - # update METS (add the image file): - file_path = self.workspace.save_image_file( - segment_image, - file_id + '.IMG-CLIP', - page_id=page_id, - file_grp=self.output_file_grp) + segment_image = crop_image(segment_image, box=segment_bbox) # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + suffix = f'{segment.id}.IMG-CLIP' + alternative_image = AlternativeImageType(comments=features) + segment.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(segment_image, suffix, alternative_image) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index d84e42b3..bae4dac0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from typing import Optional import warnings import logging @@ -7,8 +8,10 @@ from scipy.ndimage import measurements, filters, interpolation, morphology from scipy import stats, signal #from skimage.morphology import convex_hull_image +from skimage.morphology import medial_axis +import networkx as nx from PIL import Image - +from ocrd_models import OcrdExif from . 
import ocrolib from .ocrolib import morph, psegutils, sl # for decorators (type-checks etc): @@ -317,6 +320,7 @@ def check_line(binary, zoom=1.0): ##if w<1.5*h: return "line too short %s"%(binary.shape,) if w<1.5*h and w<32/zoom: return "image too short for a line image %s"%(binary.shape,) if w>4000/zoom: return "image too long for a line image %s"%(binary.shape,) + return None ratio = w*1.0/h _, ncomps = measurements.label(binary) lo = int(0.5*ratio+0.5) @@ -346,6 +350,7 @@ def check_region(binary, zoom=1.0): if h>5000/zoom: return "image too tall for a region image %s"%(binary.shape,) if w<100/zoom: return "image too narrow for a region image %s"%(binary.shape,) if w>5000/zoom: return "image too wide for a region image %s"%(binary.shape,) + return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) @@ -370,9 +375,10 @@ def check_page(binary, zoom=1.0): if np.mean(binary)<np.median(binary): return "image may be inverted" h,w = binary.shape if h<600/zoom: return "image not tall enough for a page image %s"%(binary.shape,) - if h>10000/zoom: return "image too tall for a page image %s"%(binary.shape,) + if h>20000/zoom: return "image too tall for a page image %s"%(binary.shape,) if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) - if w>10000/zoom: return "image too wide for a page image %s"%(binary.shape,) + if w>20000/zoom: return "image too wide for a page image %s"%(binary.shape,) + return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) @@ -450,18 +456,18 @@ def on_press(event): @checks(ABINARY2,NUMBER) def compute_images(binary, scale, maximages=5): - """Finds (and removes) large connected foreground components. + """Detects large connected foreground components that could be images. Parameters: - ``binary``, a bool or int array of the page image, with 1=black - ``scale``, square root of average bbox area of characters - - ``maximages``, maximum number of large components to keep + - ``maximages``, maximum number of images to find (This could be drop-capitals, line drawings or photos.) - Returns a same-size bool array as a mask image. + Returns a same-size image label array. """ if maximages == 0: - return binary == -1 + return np.zeros_like(binary, int) images = binary # d0 = odd(max(2,scale/5)) # d1 = odd(max(2,scale/8)) @@ -473,7 +479,7 @@ def compute_images(binary, scale, maximages=5): images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) DSAVE('images1_large', images+0.6*binary) if not images.any(): - return images > 0 + return np.zeros_like(binary, int) # 2- open horizontally and vertically to suppress # v/h-lines; these will be detected separately, # and it is dangerous to combine them into one @@ -498,14 +504,237 @@ def compute_images(binary, scale, maximages=5): images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) DSAVE('images5_selected', images+0.6*binary) if not images.any(): - return images > 0 + return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps dilated = morph.r_dilation(images, (odd(scale),odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 + images, _ = morph.label(images) DSAVE('images6_dilated', images+0.6*binary) # we could repeat reconstruct-dilate here... 
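# Aside: the "reconstruct-dilate" idea mentioned above is morphological reconstruction:
# grow seed labels back into the original foreground without leaking past it. A minimal
# sketch with scipy (for illustration only, not the ocrolib implementation):
import numpy as np
from scipy.ndimage import binary_dilation

def reconstruct_by_dilation(seed, mask):
    # iteratively dilate the seed, clipping to the mask, until a fixed point is reached
    current = seed & mask
    while True:
        grown = binary_dilation(current) & mask
        if (grown == current).all():
            return current
        current = grown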
- return images > 0 + return images + +@checks(ABINARY2,NUMBER) +def compute_seplines(binary, scale, maxseps=0): + """Detects thin connected foreground components that could be separators. + + Parameters: + - ``binary``, a bool or int array of the page image, with 1=black + - ``scale``, square root of average bbox area of characters + - ``maxseps``, maximum number of separators to find + (This could be horizontal, vertical or oblique, even slightly warped and discontinuous lines.) + + Returns a same-size separator label array. + """ + # tries to find a compromise for the following issues, + # potentially occurring in combination (or all at once): + # - non-contiguous or broken lines (due to thin ink or low contrast) + # - skewed, curved or warped lines (due to non-planar photography or irregular typography) + # - very close or overlapping text (due to show-through or bad binarization) + # - superimposed fg noise (due to bad binarization) that may connect text and non-text + # - intersecting vertical and horizontal lines, even closed shapes (enclosing text) + # - line-like glyphs (i.e. false positives) + if maxseps == 0: + return np.zeros_like(binary, int) + skel, dist = medial_axis(binary, return_distance=True) + DSAVE("medial-axis", [dist, skel]) + labels, nlabels = morph.label(skel) + slices = [None] + morph.find_objects(labels) + DSAVE("skel-labels", labels) + # determine those components which could be separators + # (filter by compactness, and by mean+variance of distances) + sepmap = np.zeros(nlabels + 1, int) + numsep = 0 + sepsizes = [0] + sepslices = [None] + sepdists = [0] + for label in range(1, nlabels + 1): + labelslice = slices[label] + labelmask = labels == label + labelsize = np.count_nonzero(labelmask) # sum of skel pixels, i.e. "inner length" + labelarea = sl.area(labelslice) + labelaspect = sl.aspect(labelslice) + if labelaspect > 1: + labelaspect = 1 / labelaspect + labellength = np.hypot(*sl.dims(labelslice)) # length of bbox diagonal, i.e. "outer length" + #LOG.debug("skel label %d has inner size %d and outer size %d", label, labelsize, labellength) + if labelsize > 1.5 * labellength and labelaspect >= 0.1 and labelsize < 15 * scale: #and labelsize > 0.1 * labelarea + # not long / straight, but very compact + continue + distances = dist[labelmask] + avg_dist = np.median(distances) #np.mean(distances) + std_dist = np.std(distances) + # todo: empirical analysis of ideal thresholds + if avg_dist > scale / 4 or std_dist/avg_dist > 0.7: + continue + #LOG.debug("skel label %d has dist %.1f±%.2f", label, avg_dist, std_dist) + numsep += 1 + sepmap[label] = numsep + sepsizes.append(labelsize) + sepslices.append(labelslice) + sepdists.append(avg_dist) + if labelsize > 10 * scale and avg_dist > 0 and std_dist / avg_dist > 0.2: + # try to split this large label up along neighbouring spans of similar distances: + # (e.g. vlines that touch letters or images) + # 1. get optimal (by variability) spans as bin intervals, then merge largest spans + disthist, distedges = np.histogram(distances, bins='scott', density=True) # stone + disthist *= np.diff(distedges) # get probability masses + disthistlarge = disthist > 0.1 + if np.count_nonzero(disthistlarge) < 2: + continue # only 1 large bin + disthistlarge[-1] = True # ensure full interval + distedges = distedges[1:][disthistlarge] + disthist = np.cumsum(disthist)[disthistlarge] + disthist = np.diff(disthist, prepend=0) + distbin = np.digitize(distances, distedges, right=True) + # 2. 
now find connected components within bins, but map all tiny components + # to a single label so they can be replaced by their neighbours later-on + sublabels = np.zeros_like(labels) + sublabels[labelmask] = distbin + 1 + DSAVE("sublabels", sublabels) + sublabels2 = np.zeros_like(labels) + sublabel = 1 + sublabelmap = [0, 1] + for bin in range(len(distedges)): + binmask = sublabels == bin + 1 + binlabels, nbinlabels = morph.label(binmask) + _, binlabelcounts = np.unique(binlabels, return_counts=True) + largemask = (binlabelcounts > 2 * scale)[binlabels] + smallmask = (binlabelcounts <= 2 * scale)[binlabels] + sublabels2[binmask & smallmask] = 1 + if not np.any(binmask & largemask): + continue + sublabels2[binmask & largemask] = binlabels[binmask & largemask] + sublabel + sublabel += nbinlabels + sublabelmap.extend(nbinlabels*[bin + 1]) + if sublabel == 1: + continue # only tiny sublabels here + DSAVE("sublabels_connected", sublabels2) + sublabelmap = np.array(sublabelmap) + # 3. finally, replace tiny components by nearest components, + # and recombine survivors to bin labels + smallmask = sublabels2 == 1 + sublabels2[smallmask] = 0 + sublabels2[smallmask] = morph.spread_labels(sublabels2)[smallmask] + sublabels = sublabelmap[sublabels2] + DSAVE("sublabels_final", sublabels) + # now apply as multiple separators + numsep -= 1 + sepmap[label] = 0 + slices[label] = None + sepsizes = sepsizes[:-1] + sepslices = sepslices[:-1] + sepdists = sepdists[:-1] + for sublabel in np.unique(sublabels[labelmask]): + sublabelmask = sublabels == sublabel + sublabelsize = np.count_nonzero(sublabelmask) + sublabelslice = sublabelmask.nonzero() + sublabelslice = sl.box(sublabelslice[0].min(), + sublabelslice[0].max(), + sublabelslice[1].min(), + sublabelslice[1].max()) + subdistances = dist[sublabelmask] + nlabels += 1 + numsep += 1 + sepmap = np.append(sepmap, numsep) + labels[sublabelmask] = nlabels + slices.append(sublabelslice) + sepsizes.append(sublabelsize) + sepslices.append(sublabelslice) + sepdists.append(np.median(subdistances)) + #LOG.debug("adding sublabel %d as sep %d (size %d [%s])", sublabel, numsep, sublabelsize, str(sublabelslice)) + sepsizes = np.array(sepsizes) + sepslices = np.array(sepslices, dtype=object) + LOG.debug("detected %d separator candidates", numsep) + DSAVE("seps-raw", sepmap[labels]) + # now dilate+erode to link neighbouring candidates, + # but allow only such links which + # - stay consistent regarding avg/std width + # - do not enclose large areas in between + # - do not "change direction" (roughly adds up their diagonals) + # then combine mutual neighbourships to largest allowed partitions + d0 = odd(max(1,scale/2)) + d1 = odd(max(1,scale/4)) + closed = morph.rb_closing(sepmap[labels] > 0, (d0,d1)) + DSAVE("seps-closed", [dist, closed]) + labels2, nlabels2 = morph.label(closed) + corrs = morph.correspondences(sepmap[labels], labels2, return_counts=False).T + corrmap = np.arange(numsep + 1) + for sep2 in range(1, nlabels2 + 1): + corrinds = corrs[:, 1] == sep2 + corrinds[corrs[:, 0] == 0] = False # ignore bg + corrinds = corrinds.nonzero()[0] + if len(corrinds) == 1: + continue # nothing to link + nonoverlapping = np.zeros((len(corrinds), len(corrinds)), dtype=bool) + for i, indi in enumerate(corrinds[:-1]): + sepi = corrs[indi, 0] + labeli = np.flatnonzero(sepmap == sepi)[0] + slicei = slices[labeli] + lengthi = np.hypot(*sl.dims(slicei)) + areai = sl.area(slicei) + for j, indj in enumerate(corrinds[i + 1:], i + 1): + sepj = corrs[indj, 0] + labelj = np.flatnonzero(sepmap == 
sepj)[0] + slicej = slices[labelj] + lengthj = np.hypot(*sl.dims(slicej)) + areaj = sl.area(slicej) + union = sl.union(slicei, slicej) + length = np.hypot(*sl.dims(union)) + if length < 0.9 * (lengthi + lengthj): + continue + if sl.area(union) > 1.3 * (areai + areaj): + continue + if not (0.8 < sepdists[sepi] / sepdists[sepj] < 1.2): + continue + inter = sl.intersect(slicei, slicej) + if (sl.empty(inter) or + (sl.area(inter) / areai < 0.2 and + sl.area(inter) / areaj < 0.2)): + nonoverlapping[i, j] = True + nonoverlapping[j, i] = True + # find largest maximal clique (i.e. fully connected subgraphs) + corrinds = corrinds[max(nx.find_cliques(nx.Graph(nonoverlapping)), key=len)] + corrmap[corrs[corrinds, 0]] = corrs[corrinds[0], 0] + _, corrmap = np.unique(corrmap, return_inverse=True) # make contiguous + numsep = corrmap.max() + LOG.debug("linked to %d separator candidates", numsep) + def union(slices): + if len(slices) > 1: + return sl.union(slices[0], union(slices[1:])) + return slices[0] + for sep in range(1, numsep + 1): + sepsizes[sep] = max(sepsizes[corrmap == sep]) # sum + sepslices[sep] = union(sepslices[corrmap == sep]) + sepsizes = sepsizes[:numsep + 1] + sepslices = sepslices[:numsep + 1] + seplengths = np.array([np.hypot(*sl.dims(sepslice)) if sepslice else 0 + for sepslice in sepslices]) + sepmap = corrmap[sepmap] + DSAVE("seps-raw-linked", sepmap[labels]) + # order by size, filter minsize and filter top maxseps + order = np.argsort(sepsizes)[::-1] + # no more than maxseps and no smaller than scale + minsize = np.flatnonzero((sepsizes[order] < scale) | (seplengths[order] < 3 * scale)) + if np.any(minsize): + maxseps = min(maxseps, minsize[0]) + maxseps = min(maxseps, numsep) + ordermap = np.zeros(numsep + 1, int) + ordermap[order[:maxseps]] = np.arange(1, maxseps + 1) + sepmap = ordermap[sepmap] + DSAVE("sep-top", sepmap[labels]) + # spread into fg against other fg + sepseeds = sepmap[labels] + sepseeds = morph.spread_labels(sepseeds, maxdist=max(sepdists)) + sepseeds[~binary] = 0 + #labels = morph.propagate_labels_simple(binary, labels) + #DSAVE("seps-top-spread-fg", sepseeds) + # spread into bg against other fg + sepseeds[binary & (sepseeds == 0)] = maxseps + 1 + seplabels = morph.spread_labels(sepseeds, maxdist=scale / 2) + seplabels[seplabels == maxseps + 1] = 0 + DSAVE("seps-top-spread-bg", seplabels) + return seplabels # from ocropus-gpageseg, but with horizontal opening @deprecated @@ -685,7 +914,7 @@ def compute_colseps_conv(binary, scale=1.0, csminheight=10, maxcolseps=2): grad = filters.gaussian_filter(1.0*binary,(scale,scale*0.5),order=(0,1)) grad = filters.uniform_filter(grad,(10.0*scale,1)) # csminheight DSAVE("colwsseps2_grad-raw",grad) - grad = (grad>0.5*np.amax(grad)) + grad = grad > np.minimum(0.5 * np.amax(grad), np.percentile(grad, 99.5)) DSAVE("colwsseps2_grad",grad) # combine dilated edges and whitespace seps = np.minimum(thresh,filters.maximum_filter(grad,(odd(10*scale),odd(5*scale)))) @@ -938,7 +1167,7 @@ def h_compatible(obj1, obj2, center1, center2): label1_y, label1_x = np.where(seeds == label) label2_y, label2_x = np.where(seed2) shared_y = np.intersect1d(label1_y, label2_y) - gap = np.zeros_like(seed2, np.bool) + gap = np.zeros_like(seed2, bool) for y in shared_y: can_x_min = label2_x[label2_y == y][0] can_x_max = label2_x[label2_y == y][-1] @@ -974,20 +1203,20 @@ def h_compatible(obj1, obj2, center1, center2): relabel[relabel == label2] = new_label # apply re-assignments: seeds = relabel[seeds] - DSAVE("hmerge5_connected", seeds) + # 
DSAVE("hmerge5_connected", seeds) return seeds # from ocropus-gpageseg, but: # - with fullpage switch -# (opt-in for h/v-line and column detection), +# (opt-in for separator line and column detection), # - with external separator mask -# (opt-in for h/v-line pass-through) +# (opt-in for separator line pass-through) # - with zoom parameter # (make fixed dimension params relative to pixel density, # instead of blind 300 DPI assumption) -# - with improved h/v-line and column detection -# - with v-line detection _before_ column detection -# - with h/v-line suppression _after_ large component filtering +# - with improved separator line and column detection +# - with separator detection _before_ column detection +# - with separator suppression _after_ large component filtering # - with more robust line seed estimation, # - with horizontal merge instead of blur, # - with component majority for foreground @@ -996,17 +1225,18 @@ def h_compatible(obj1, obj2, center1, center2): # (which must be split anyway) # - with tighter polygonal spread around foreground # - with spread of line labels against separator labels +# - with baseline extraction # - return bg line and sep labels instead of just fg line labels +# - return baseline coords, too @checks(ABINARY2) def compute_segmentation(binary, zoom=1.0, fullpage=False, seps=None, maxcolseps=2, + csminheight=4, maxseps=0, maximages=0, - csminheight=4, - hlminwidth=10, spread_dist=None, rl=False, bt=False): @@ -1024,13 +1254,10 @@ def compute_segmentation(binary, - for up to ``maxcolseps`` multi-line vertical whitespaces (as column separators, counted piece-wise) of at least ``csminheight`` multiples of ``scale``, - - for up to ``maxseps`` vertical black lines - (as column separators, counted piece-wise) of at least - ``csminheight`` multiples of ``scale``, and - - for any number of horizontal lines of at least - ``hlminwidth`` multiples of ``scale``, + - for up to ``maxseps`` black separator lines (horizontal, vertical + or oblique; counted piece-wise), - for anything in ``seps`` if given, - then suppress these separator components and return them separately. + then suppress these non-text components and return them separately. Labels will be projected ("spread") from the foreground to the surrounding background within ``spread_dist`` distance (or half @@ -1046,8 +1273,8 @@ def compute_segmentation(binary, foreground may remain unlabelled for separators and other non-text like small noise, or large drop-capitals / images), - - Numpy array of horizontal foreground lines mask, - - Numpy array of vertical foreground lines mask, + - list of Numpy arrays of baseline coordinates [y, x points in lr order], + - Numpy array of foreground separator lines mask, - Numpy array of large/non-text foreground component mask, - Numpy array of vertical background separators mask, - the estimated scale (i.e. median sqrt bbox area of glyph components). 
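# Aside: the separator-linking step in compute_seplines above picks, per closed
# component, the largest maximal clique of pairwise-compatible candidates. The same
# pattern in miniature (toy adjacency matrix, values made up):
import numpy as np
import networkx as nx

compatible = np.array([[0, 1, 1, 0],
                       [1, 0, 1, 0],
                       [1, 1, 0, 0],
                       [0, 0, 0, 0]], dtype=bool)  # pairwise compatibility of candidates
best = max(nx.find_cliques(nx.Graph(compatible)), key=len)
# -> [0, 1, 2] (in some order): the largest group of mutually compatible candidates,
# which would be merged into a single separator label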
@@ -1059,18 +1286,17 @@ def compute_segmentation(binary, LOG.debug('height: %d, zoom: %.2f, scale: %d', binary.shape[0], zoom, scale) if fullpage: - LOG.debug('computing images') + LOG.debug('detecting images') images = compute_images(binary, scale, maximages=maximages) - LOG.debug('computing horizontal/vertical line separators') - hlines = compute_hlines(binary, scale, hlminwidth=hlminwidth, images=images) - vlines = compute_separators_morph(binary, scale, csminheight=csminheight, maxseps=maxseps, images=images) - binary = np.minimum(binary,1-hlines) - binary = np.minimum(binary,1-vlines) - binary = np.minimum(binary,1-images) + LOG.debug('detecting separators') + #hlines = compute_hlines(binary, scale, hlminwidth=hlminwidth, images=images) + #vlines = compute_separators_morph(binary, scale, csminheight=csminheight, maxseps=maxseps, images=images) + slines = compute_seplines(binary, scale, maxseps=maxseps) + binary = np.minimum(binary, 1 - (slines > 0)) + binary = np.minimum(binary, 1 - (images > 0)) else: - hlines = np.zeros_like(binary, np.bool) - vlines = np.zeros_like(binary, np.bool) - images = np.zeros_like(binary, np.bool) + slines = np.zeros_like(binary, np.uint8) + images = np.zeros_like(binary, np.uint8) if seps is not None and not seps.all(): # suppress separators/images for line estimation # (unless it encompasses the full image for some reason) @@ -1089,8 +1315,7 @@ def compute_segmentation(binary, # get a larger (closed) mask of all separators # (both bg boundary and fg line seps, detected # and passed in) to separate line/column labels - sepmask = np.maximum(hlines, vlines) - sepmask = np.maximum(sepmask, images) + sepmask = np.maximum(slines > 0, images > 0) sepmask = np.maximum(sepmask, colseps) if seps is not None: sepmask = np.maximum(sepmask, seps) @@ -1141,10 +1366,76 @@ def compute_segmentation(binary, LOG.debug('sorting labels by reading order') llabels = morph.reading_order(llabels,rl,bt)[llabels] DSAVE('llabels_ordered', llabels) - + #segmentation = llabels*binary #return segmentation - return llabels, hlines, vlines, images, colseps, scale + blines = compute_baselines(bottom, top, llabels, scale) + return llabels, blines, slines, images, colseps, scale + +@checks(AFLOAT2,AFLOAT2,SEGMENTATION,NUMBER) +def compute_baselines(bottom, top, linelabels, scale, method='bottom'): + """Get the coordinates of baselines running along each bottom gradient peak.""" + seeds = linelabels > 0 + # smooth bottom+top maps horizontally for centerline estimation + bot = filters.gaussian_filter(bottom, (scale*0.25,scale), mode='constant') + top = filters.gaussian_filter(top, (scale*0.25,scale), mode='constant') + # idea: center is where bottom and top gradient meet in the middle + # (but between top and bottom, not between bottom and top) + # - calculation via numpy == or isclose is too fragile numerically: + #clines = np.isclose(top, bottom, rtol=0.5) & (np.diff(top - bottom, axis=0, append=0) < 0) + # - calculation via zero crossing of top-bottom is more robust, + # but needs post-processing for lines with much larger height than scale + if method == 'center': + blines = (np.diff(np.sign(top - bottom), axis=0, append=0) < 0) & seeds + #DSAVE('centerlines', blines) + # - calculation via peak gradient + elif method == 'bottom': + bot1d = np.diff(bot, axis=0, append=0) + bot1d = np.diff(np.sign(bot1d), axis=0, append=0) < 0 + bot1d &= bot > 0 + DSAVE('bot1d', bot1d) + blines = bot1d + baselabels, nbaselabels = morph.label(blines) + baseslices = [(slice(0,0),slice(0,0))] + 
morph.find_objects(baselabels) + # if multiple labels per seed, ignore the ones above others + # (can happen due to mis-estimation of scale) + corrs = morph.correspondences(linelabels, baselabels).T + labelmap = {} + DSAVE('baselines-raw', baselabels) + for line in np.unique(linelabels): + if not line: continue # ignore bg line + corrinds = corrs[:, 0] == line + corrinds[corrs[:, 1] == 0] = False # ignore bg baseline + if not np.any(corrinds): continue + corrinds = corrinds.nonzero()[0] + if len(corrinds) == 1: + labelmap.setdefault(line, list()).append(corrs[corrinds[0], 1]) + continue + nonoverlapping = ~np.eye(len(corrinds), dtype=bool) + for i, indi in enumerate(corrinds[:-1]): + baselabeli = corrs[indi, 1] + baseslicei = baseslices[baselabeli] + for j, indj in enumerate(corrinds[i + 1:], i + 1): + baselabelj = corrs[indj, 1] + baseslicej = baseslices[baselabelj] + if sl.xoverlaps(baseslicei, baseslicej): + nonoverlapping[i, j] = False + nonoverlapping[j, i] = False + # find all maximal cliques in the graph (i.e. all fully connected subgraphs) + # and then pick the partition with the largest sum of pixels at its nodes + def pathlen(path): + return sum(corrs[corrinds[path], 2]) + corrinds = corrinds[max(nx.find_cliques(nx.Graph(nonoverlapping)), key=pathlen)] + labelmap.setdefault(line, list()).extend(corrs[corrinds, 1]) + basepoints = [] + for line in np.unique(linelabels): + if line not in labelmap: continue + linemask = linelabels == line + points = [] + for label in labelmap[line]: + points.extend(list(zip(*np.where((baselabels == label) & linemask)))) + basepoints.append(points) + return basepoints # from ocropus-gpageseg, but # - on both foreground and background, @@ -1290,7 +1581,7 @@ def lines2regions(binary, llabels, bincounts = np.bincount(lbinary.flatten()) LOG.debug('combining lines to regions') - relabel = np.zeros(np.amax(llabels)+1, np.int) + relabel = np.zeros(np.amax(llabels)+1, int) num_regions = 0 def recursive_x_y_cut(box, mask=None, partition_type=None, debug=False): """Split lbinary at horizontal or vertical gaps recursively. 
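# Aside: the 'bottom' method in compute_baselines above marks baseline rows via sign
# changes of the smoothed gradient. The same np.diff(np.sign(...)) trick reduced to one
# dimension (profile values made up):
import numpy as np

profile = np.array([0.1, 0.4, 0.9, 0.7, 0.2])  # one column of the smoothed bottom map
grad = np.diff(profile, append=0)              # forward difference
peaks = np.diff(np.sign(grad), append=0) < 0   # rising-to-falling transitions
# np.flatnonzero(peaks) -> [1]: the sign flip sits just below the profile peak at index 2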
@@ -1337,7 +1628,7 @@ def finalize(): llab = sl.cut(llabels, box) if isinstance(mask, np.ndarray): llab = np.where(mask, llab, 0) - linelabels0 = np.zeros(llabels.max()+1, dtype=np.bool) + linelabels0 = np.zeros(llabels.max()+1, dtype=bool) linelabels0[linelabels] = True llab *= linelabels0[llab] newregion = rlab.max()+1 @@ -1388,12 +1679,6 @@ def finalize(): sepm = sl.cut(sepmask, box) if isinstance(mask, np.ndarray): sepm = np.where(mask, sepm, 1) - if isinstance(rlabels, np.ndarray): - # treat existing regions like separators - rlab = sl.cut(rlabels, box) - if isinstance(mask, np.ndarray): - rlab = np.where(mask, rlab, 0) - sepm = np.where(rlab, 1, sepm) # provide `partitions` for next step partitions, npartitions = 1-sepm, 1 new_partition_type = None @@ -1403,31 +1688,35 @@ def finalize(): # try to apply in this cut like another separator partitions, npartitions = morph.label(1-sepm) if npartitions > 1: - # first, delete partitions that have no significant line labels - splitmap = np.zeros(len(objects)+1, dtype=np.int) - for label in range(1, npartitions+1): - linecounts = np.bincount(lbin[partitions==label], minlength=len(objects)) + # delete partitions that have no significant line labels, + # merge partitions that share any significant line labels + splitmap = np.zeros((len(objects), npartitions), dtype=bool) + for label in range(npartitions): + linecounts = np.bincount(lbin[partitions==label+1], minlength=len(objects)) linecounts[0] = 0 # without bg # get significant line labels for this partition # (but keep insignificant non-empty labels if complete) mincounts = np.minimum(min_line * scale, np.maximum(1, bincounts)) linelabels = np.nonzero(linecounts >= mincounts)[0] if linelabels.size: - splitmap[linelabels] = label - if debug: LOG.debug(' sepmask partition %d: %s', label, str(linelabels)) + splitmap[linelabels, label] = True + if debug: LOG.debug(' sepmask partition %d: %s', label+1, str(linelabels)) else: - partitions[partitions==label] = 0 - # second, merge partitions that share any significant line labels - for label1 in range(1, npartitions+1): - if not np.any(splitmap == label1): + partitions[partitions==label+1] = 0 + if isinstance(rlabels, np.ndarray): + # keep existing regions in distinct partitions if possible + rlab = sl.cut(rlabels, box) + if isinstance(mask, np.ndarray): + rlab = np.where(mask, rlab, 0) + splitmap[np.unique(lbin[rlab>0])] = False + mergemap = np.arange(npartitions + 1) + for line in splitmap: + if not np.any(line): continue - for label2 in range(label1+1, npartitions+1): - if not np.any(splitmap == label2): - continue - if np.any((splitmap == label1) & (splitmap == label2)): - splitmap[splitmap == label2] = label1 - partitions[partitions==label2] = label1 - npartitions = len(np.setdiff1d(np.unique(splitmap), [0])) + parts = np.flatnonzero(line)+1 + mergemap[parts] = mergemap[parts[0]] + partitions = mergemap[partitions] + npartitions = len(np.setdiff1d(np.unique(mergemap), [0])) new_partition_type = 'splitmask' if debug: LOG.debug(' %d sepmask partitions after filtering and merging', npartitions) if partition_type != 'topological': @@ -1435,10 +1724,16 @@ def finalize(): # get current slice's line labels def find_topological(): # run only if needed (no other partition/slicing possible) - nonlocal partitions, npartitions, new_partition_type + nonlocal sepm, partitions, npartitions, new_partition_type llab = sl.cut(llabels, box) if isinstance(mask, np.ndarray): llab = np.where(mask, llab, 0) + if isinstance(rlabels, np.ndarray): + # treat existing 
regions like separators + rlab = sl.cut(rlabels, box) + if isinstance(mask, np.ndarray): + rlab = np.where(mask, rlab, 0) + sepm = np.where(rlab, 1, sepm) obj = [sl.intersect(o, box) for o in objects] # get current slice's foreground bin = sl.cut(binary, box) @@ -1453,7 +1748,7 @@ def find_topological(): seplabs, counts = np.unique(seplab * bin, return_counts=True) kept = np.in1d(seplab.ravel(), seplabs[counts > scale * min_line]) seplab = seplab * kept.reshape(*seplab.shape) - DSAVE('seplab', seplab) + #DSAVE('seplab', seplab) sepobj = morph.find_objects(seplab) if not len(sepobj): return @@ -1462,8 +1757,8 @@ def find_topological(): linelabels = np.setdiff1d(np.unique(lbin), [0]) nlines = linelabels.max() + 1 # find pairs of lines above each other with a separator next to them - leftseps = np.zeros((nlines, nseps), np.bool) - rghtseps = np.zeros((nlines, nseps), np.bool) + leftseps = np.zeros((nlines, nseps), bool) + rghtseps = np.zeros((nlines, nseps), bool) for line in linelabels: for i, sep in enumerate(sepobj): if sep is None: @@ -1484,7 +1779,7 @@ def find_topological(): if not np.any(trueseps): return if debug: LOG.debug("trueseps: %s", str(trueseps)) - neighbours = np.zeros((nlines, nlines), np.bool) + neighbours = np.zeros((nlines, nlines), bool) for i in linelabels: for j in linelabels[i+1:]: if sl.yoverlap_rel(obj[i], obj[j]) > 0.5: @@ -1500,7 +1795,7 @@ def find_topological(): # group neighbours by adjacency (i.e. put any contiguous pairs # of such line labels into the same group) nlabels = llab.max() + 1 - splitmap = np.zeros(nlabels, dtype=np.int) + splitmap = np.zeros(nlabels, dtype=int) for i, j in zip(*neighbours.nonzero()): if splitmap[i] > 0: splitmap[j] = splitmap[i] @@ -1588,8 +1883,8 @@ def find_topological(): if not gaps.shape[0]: continue for start, stop, height in sorted(zip( - props['left_ips'].astype(np.int), - props['right_ips'].astype(np.int), + props['left_ips'].astype(int), + props['right_ips'].astype(int), props['peak_heights']), key=lambda x: x[2]): if is_horizontal: llab[box[0].start+int(scale/2):box[0].stop-int(scale/2),box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) @@ -1741,7 +2036,7 @@ def find_topological(): npartitions > len(gaps)+1 or # partitions without the cut still score better than after sum(map(sl.height if prefer_vertical else sl.width, - (morph.find_objects(partitions)))) > np.max( + filter(None, morph.find_objects(partitions)))) > np.max( partitionscores, initial=0))): # continue on each partition by suppressing the others, respectively order = morph.reading_order(partitions,rl,bt) @@ -1808,3 +2103,16 @@ def find_topological(): # rlabels[region_hull] = region # DSAVE('rlabels_closed', rlabels) return rlabels + +def determine_zoom(logger: logging.Logger, page_id: Optional[str], dpi: float, page_image_info: OcrdExif) -> float: + if dpi > 0: + zoom = 300.0/dpi + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi *= 2.54 + logger.info(f"Page '{page_id}' uses {dpi} DPI.") + zoom = 300.0/dpi + else: + zoom = 1 + return zoom diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 61a77141..eaed74df 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,38 +1,22 @@ from __future__ import absolute_import +from typing import Optional +from logging import Logger -import os.path +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage +from ocrd import Processor, OcrdPageResult, 
OcrdPageResultImage -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType -) -from ocrd import Processor - -from .. import get_ocrd_tool -from .common import ( - # binarize, - remove_noise) - -TOOL = 'ocrd-cis-ocropy-denoise' +from .common import determine_zoom, remove_noise class OcropyDenoise(Processor): + @property + def executable(self): + return 'ocrd-cis-ocropy-denoise' - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyDenoise, self).__init__(*args, **kwargs) - - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested ``level-of-operation``. @@ -50,88 +34,55 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropyDenoise') level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, - feature_selector='binarized' if level == 'page' else '') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 - - if level == 'page': - self.process_segment(page, page_image, page_xywh, zoom, - input_file.pageId, file_id) - else: - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh, - feature_selector='binarized' if level == 'region' else '') - if level == 'region': - self.process_segment(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue - lines = region.get_TextLine() - if not lines: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, - feature_selector='binarized') - self.process_segment(line, line_image, line_xywh, zoom, - input_file.pageId, - file_id + '_' + region.id + '_' + line.id) - - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - 
file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, file_id): - LOG = getLogger('processor.OcropyDenoise') + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, + feature_selector='binarized' if level == 'page' else '') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + if level == 'page': + image = self.process_segment(page, page_image, page_xywh, zoom, page_id) + if image: + result.images.append(image) + else: + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning(f'Page "{page_id}" contains no text regions') + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, + feature_selector='binarized' if level == 'region' else '') + if level == 'region': + file_id = f"{page_id}_{region.id}" + image = self.process_segment(region, region_image, region_xywh, zoom, file_id) + if image: + result.images.append(image) + continue + lines = region.get_TextLine() + if not lines: + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh, feature_selector='binarized') + file_id = f"{page_id}_{region.id}_{line.id}" + image = self.process_segment(line, line_image, line_xywh, zoom, file_id) + if image: + result.images.append(image) + return result + + def process_segment(self, segment, segment_image, segment_xywh, zoom, file_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: - LOG.warning("Skipping '%s' with zero size", file_id) - return - LOG.info("About to despeckle '%s'", file_id) - bin_image = remove_noise(segment_image, - maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt - # update METS (add the image file): - file_path = self.workspace.save_image_file( - bin_image, - file_id + '.IMG-DESPECK', - page_id=page_id, - file_grp=self.output_file_grp) + self.logger.warning(f"Skipping '{segment.id}' with zero size") + return None + self.logger.info(f"About to despeckle '{segment.id}'") + bin_image = remove_noise( + segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=segment_xywh['features'] + ',despeckled')) + alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') + suffix = f"{file_id}.IMG-DESPECK" + segment.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index aabbce3e..b02c69d5 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,29 +1,13 @@ from __future__ import absolute_import +from typing import Optional +from logging import Logger -import os.path +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, PageType +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage -from ocrd_utils 
import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - PageType, - to_xml, AlternativeImageType -) -from ocrd import Processor - -from .. import get_ocrd_tool from . import common -from .common import ( - pil2array -) - -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) - -TOOL = 'ocrd-cis-ocropy-deskew' +from .common import pil2array def deskew(pil_image, maxskew=2): array = pil2array(pil_image) @@ -31,17 +15,14 @@ def deskew(pil_image, maxskew=2): return angle class OcropyDeskew(Processor): + @property + def executable(self): + return 'ocrd-cis-ocropy-deskew' - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][TOOL] - kwargs['version'] = ocrd_tool['version'] - super(OcropyDeskew, self).__init__(*args, **kwargs) - - def process(self): - """Deskew the regions of the workspace. + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + """Deskew the pages or regions of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the TextRegion level. Next, for each file, crop each region image according to the layout @@ -56,97 +37,71 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropyDeskew') level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id, + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_coords, _ = self.workspace.image_from_page( + page, page_id, + # image must not have been rotated already, + # (we will overwrite @orientation anyway,) + # abort if no such image can be produced: + feature_filter='deskewed' if level == 'page' else '') + if level == 'page': + image = self._process_segment(page, page_image, page_coords, "page '%s'" % page_id, page_id) + if image: + result.images.append(image) + return result + if level == 'table': + regions = page.get_TableRegion() + else: # region + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + # process region: + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, # image must not have been rotated already, # (we will overwrite @orientation anyway,) # abort if no such image can be produced: - feature_filter='deskewed' if level == 'page' else '') - if level == 'page': - self._process_segment(page, page_image, page_coords, - "page '%s'" % page_id, input_file.pageId, - file_id) - else: - if level == 'table': - regions = page.get_TableRegion() - else: # region - regions = 
page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) - for region in regions: - # process region: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, - # image must not have been rotated already, - # (we will overwrite @orientation anyway,) - # abort if no such image can be produced: - feature_filter='deskewed') - self._process_segment(region, region_image, region_coords, - "region '%s'" % region.id, input_file.pageId, - file_id + '_' + region.id) - - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id, file_id): - LOG = getLogger('processor.OcropyDeskew') + feature_filter='deskewed') + image = self._process_segment(region, region_image, region_coords, f"region '{region.id}'", page_id) + if image: + result.images.append(image) + return result + + def _process_segment( + self, segment, segment_image, segment_coords, segment_id, page_id + ) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: - LOG.warning("Skipping %s with zero size", segment_id) - return - angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image - LOG.info("About to deskew %s", segment_id) - angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied + self.logger.warning("Skipping %s with zero size", segment_id) + return None + angle0 = segment_coords['angle'] # deskewing (w.r.t. 
top image) already applied to segment_image + self.logger.info(f"About to deskew {segment_id}") + angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) - orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] - segment.set_orientation(orientation) # also removes all deskewed AlternativeImages - LOG.info("Found angle for %s: %.1f", segment_id, angle) + orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] + segment.set_orientation(orientation) # also removes all deskewed AlternativeImages + self.logger.info(f"Found angle for {segment_id}: {angle:.1f}") # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( - segment, page_id, - fill='background', transparency=True) + segment, page_id, fill='background', transparency=True) + suffix = '.IMG-DESKEW' else: segment_image, segment_coords = self.workspace.image_from_segment( - segment, segment_image, segment_coords, - fill='background', transparency=True) + segment, segment_image, segment_coords, fill='background', transparency=True) + suffix = segment.id + '.IMG-DESKEW' if not angle: # zero rotation does not change coordinates, # but assures consuming processors that the # workflow had deskewing segment_coords['features'] += ',deskewed' - # update METS (add the image file): - file_path = self.workspace.save_image_file( - segment_image, - file_id + '.IMG-DESKEW', - page_id=page_id, - file_grp=self.output_file_grp) # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=segment_coords['features'])) + alternative = AlternativeImageType(comments=segment_coords['features']) + segment.add_AlternativeImage(alternative) + return OcrdPageResultImage(segment_image, suffix, alternative) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index dc083eaf..a0d0ea5c 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,30 +1,14 @@ from __future__ import absolute_import - -import os.path +from logging import Logger +from typing import Optional import numpy as np -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, -) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType -) from ocrd import Processor -from ocrd_utils import MIMETYPE_PAGE +from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage -from .. 
import get_ocrd_tool from .ocrolib import lineest -from .common import ( - pil2array, array2pil, - check_line, -) - -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) - -TOOL = 'ocrd-cis-ocropy-dewarp' +from .common import array2pil, check_line, determine_zoom, pil2array class InvalidLine(Exception): """Line image does not allow dewarping and should be ignored.""" @@ -37,27 +21,27 @@ def dewarp(image, lnorm, check=True, max_neighbour=0.02, zoom=1.0): if not image.width or not image.height: raise InvalidLine('image size is zero') line = pil2array(image) - + if np.prod(line.shape) == 0: raise InvalidLine('image dimensions are zero') if np.amax(line) == np.amin(line): raise InvalidLine('image is blank') - - temp = np.amax(line)-line # inverse, zero-closed + + temp = np.amax(line) - line # inverse, zero-closed if check: report = check_line(temp, zoom=zoom) if report: raise InadequateLine(report) - - temp = temp * 1.0 / np.amax(temp) # normalized + + temp = temp * 1.0 / np.amax(temp) # normalized if check: report = lnorm.check(temp, max_ignore=max_neighbour) if report: raise InvalidLine(report) - lnorm.measure(temp) # find centerline + lnorm.measure(temp) # find centerline line = lnorm.dewarp(line, cval=np.amax(line)) - + return array2pil(line) # pad with white above and below (as a fallback for dewarp) @@ -69,32 +53,14 @@ def padvert(image, range_): return array2pil(line) class OcropyDewarp(Processor): + @property + def executable(self): + return 'ocrd-cis-ocropy-dewarp' - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyDewarp, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() - - def setup(self): - # defaults from ocrolib.lineest: - self.lnorm = lineest.CenterNormalizer( - params=(self.parameter['range'], - self.parameter['smoothness'], - # let's not expose this for now - # (otherwise we must explain mutual - # dependency between smoothness - # and extra params) - 0.3)) - self.logger = getLogger('processor.OcropyDewarp') - - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the TextLine level. Next, get each line image according to the layout annotation (from @@ -110,81 +76,49 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id) - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 - - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) - - lines = region.get_TextLine() - if not lines: - self.logger.warning('Region %s contains no text lines', region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh) - - self.logger.info("About to dewarp page '%s' region '%s' line '%s'", - page_id, region.id, line.id) - try: - dew_image = dewarp(line_image, self.lnorm, check=True, - max_neighbour=self.parameter['max_neighbour'], - zoom=zoom) - except InvalidLine as err: - self.logger.error('cannot dewarp line "%s": %s', line.id, err) - continue - except InadequateLine as err: - self.logger.warning('cannot dewarp line "%s": %s', line.id, err) - # as a fallback, simply pad the image vertically - # (just as dewarping would do on average, so at least - # this line has similar margins as the others): - dew_image = padvert(line_image, self.parameter['range']) - # update METS (add the image file): - file_path = self.workspace.save_image_file( - dew_image, - file_id + '_' + region.id + '_' + line.id + '.IMG-DEWARP', - page_id=input_file.pageId, - file_grp=self.output_file_grp) - # update PAGE (reference the image file): - alternative_image = line.get_AlternativeImage() - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=line_xywh['features'] + ',dewarped')) - - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + # defaults from ocrolib.lineest: + lnorm = lineest.CenterNormalizer( + params=(self.parameter['range'], + self.parameter['smoothness'], + # let's not expose this for now + # (otherwise we must explain mutual + # dependency between smoothness + # and extra params) + 0.3)) + + regions = page.get_AllRegions(classes=['Text'], 
order='reading-order') + if not regions: + self.logger.warning(f'Page "{page_id}" contains no text regions') + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) + lines = region.get_TextLine() + if not lines: + self.logger.warning(f'Region {region.id} contains no text lines') + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment(line, region_image, region_xywh) + self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") + try: + dew_image = dewarp( + line_image, lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) + except (InvalidLine, AssertionError) as err: + self.logger.error(f'Cannot dewarp line "{line.id}": {err}') + continue + except InadequateLine as err: + self.logger.warning(f'Cannot dewarp line "{line.id}": {err}') + # as a fallback, simply pad the image vertically + # (just as dewarping would do on average, so at least + # this line has margins similar to the others): + dew_image = padvert(line_image, self.parameter['range']) + # update PAGE (reference the image file): + alt_image = AlternativeImageType(comments=line_xywh['features'] + ',dewarped') + line.add_AlternativeImage(alt_image) + suffix = f"{region.id}_{line.id}.IMG-DEWARP" + result.images.append(OcrdPageResultImage(dew_image, suffix, alt_image)) + return result diff --git a/ocrd_cis/ocropy/ocrolib/common.py b/ocrd_cis/ocropy/ocrolib/common.py index 1c0c3208..6741a676 100644 --- a/ocrd_cis/ocropy/ocrolib/common.py +++ b/ocrd_cis/ocropy/ocrolib/common.py @@ -445,6 +445,9 @@ class names that have changed.""" LOG.info("# loading object '%s'", fname) if zip==0 and fname.endswith(".gz"): zip = 1 + # most models will have been pickled with ocrolib at top level + # we therefore need to add ocrd_cis.ocropy to the search path + sys.path.append(os.path.dirname(os.path.dirname(__file__))) if zip>0: with gzip.GzipFile(fname,"rb") as stream: #with os.popen("gunzip < '%s'"%fname,"rb") as stream: diff --git a/ocrd_cis/ocropy/ocrolib/lineest.py b/ocrd_cis/ocropy/ocrolib/lineest.py index 669b12ca..392c7e4a 100644 --- a/ocrd_cis/ocropy/ocrolib/lineest.py +++ b/ocrd_cis/ocropy/ocrolib/lineest.py @@ -31,7 +31,7 @@ def check(self,line, max_ignore=0.02): #DSAVE('lineest check 1 dilated', smoothed + 0.5*line) smoothed = filters.gaussian_filter(smoothed, (1, h//10), mode='constant') # 2 #DSAVE('lineest check 2 smoothed', smoothed + 0.5*line) - smoothed = np.array(smoothed > np.median(smoothed), dtype=np.float) # 3 # or 0.05 instead of median? + smoothed = np.array(smoothed > np.median(smoothed), dtype=float) # 3 # or 0.05 instead of median?
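The ``np.float`` → ``float`` changes here (and the ``np.bool``/``np.int`` changes elsewhere in this patch) track NumPy's deprecation policy: those aliases of the Python builtins were deprecated in NumPy 1.20 and removed in 1.24, so the old code raises ``AttributeError`` on current NumPy. The builtin yields the same dtype, as this sketch shows:

    import numpy as np

    # builtin float maps to float64, exactly what np.float used to alias
    mask = np.array([0.1, 0.9]) > 0.5
    as_float = np.array(mask, dtype=float)
    assert as_float.dtype == np.float64
    assert np.array([True], dtype=bool).dtype == np.bool_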
#DSAVE('lineest check 3 thresholded', smoothed + 0.5*line) smoothed = filters.minimum_filter(smoothed, (2, h//5)) # 4: undo 1/2 #DSAVE('lineest check 4 eroded', smoothed + 0.5*line) @@ -75,7 +75,7 @@ def measure(self,line): plt.plot(self.center) plt.ginput(1,1000) def dewarp(self,img,cval=0,dtype=np.dtype('f')): - assert img.shape==self.shape + assert img.shape==self.shape, f"input shape {img.shape} deviates from measured shape {self.shape}" h,w = img.shape # The actual image img is embedded into a larger image by # adding vertical space on top and at the bottom (padding) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index a0170c43..4b626e83 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -21,7 +21,8 @@ def label(image,**kw): """ # default connectivity in OpenCV: 8 (which is equivalent to...) # default connectivity in scikit-image: 2 - n, labels = cv2.connectedComponents(image.astype(uint8), connectivity=4) + # connectivity=4 crashes (segfaults) OpenCV#21366 + n, labels = cv2.connectedComponents(image.astype(uint8)) #n, labels = cv2.connectedComponentsWithAlgorithm(image.astype(uint8), connectivity=4, ltype=2, ccltype=cv2.CCL_DEFAULT) return labels, n-1 # try: return measurements.label(image,**kw) @@ -169,6 +170,20 @@ def rg_closing(image,size,origin=0): # image = r_dilation(image,size,origin=0) # return r_erosion(image,size,origin=-1) +@checks(GRAYSCALE,ABINARY2) +def rg_reconstruction(image,mask,step=1,maxsteps=None): + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2*step+1,2*step+1)) + dilated = image + while maxsteps is None or maxsteps > 0: + dilated = cv2.dilate(src=dilated, kernel=kernel) + dilated = np.where(mask, dilated, image) + # did result change? + if (image == dilated).all(): + return dilated + if maxsteps: + maxsteps -= step + return dilated + @checks(SEGMENTATION) def showlabels(x,n=7): import matplotlib.pyplot as plt @@ -328,7 +343,7 @@ def select_regions(binary,f,min=0,nbest=100000): return keep[labels] @checks(SEGMENTATION) -def all_neighbors(image, dist=1, bg=NaN): +def all_neighbors(image, dist=1, bg=float('nan')): """Given an image with labels, find all pairs of labels that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``.""" q = 100000 @@ -336,8 +351,8 @@ def all_neighbors(image, dist=1, bg=NaN): assert amin(image)>=0 u = unique(q*image+shift(image,(dist,0),order=0,cval=bg)) d = unique(q*image+shift(image,(-dist,0),order=0,cval=bg)) - l = unique(q*image+shift(image,(dist,dist),order=0,cval=bg)) - r = unique(q*image+shift(image,(-dist,dist),order=0,cval=bg)) + l = unique(q*image+shift(image,(0,dist),order=0,cval=bg)) + r = unique(q*image+shift(image,(0,-dist),order=0,cval=bg)) all = unique(r_[u,d,l,r]) all = all[all!=bg] all = c_[all//q,all%q] @@ -414,7 +429,7 @@ def reading_order(seg,rl=False,bt=False): segmap[1:] = 1 return segmap def pos(f,l): - return array([f(x) if x else nan for x in l]) + return array([f(x) if x else float('nan') for x in l]) ys = pos(sl.ycenter,objects) yorder = argsort(ys)[::-1 if bt else 1] groups = [[yorder[0]]] diff --git a/ocrd_cis/ocropy/ocrolib/time_morphology.py b/ocrd_cis/ocropy/ocrolib/time_morphology.py index 51a8e406..2e241d94 100644 --- a/ocrd_cis/ocropy/ocrolib/time_morphology.py +++ b/ocrd_cis/ocropy/ocrolib/time_morphology.py @@ -29,10 +29,10 @@ def cv_contours(bin): return zip((contour[:,0,::-1], cv2.contourArea(contour)) for contour in contours) def rb_opening(bin, size): - return 
filters.uniform_filter(filters.uniform_filter(bin, size, np.float, mode='constant', cval=1) == 1, size, np.float, origin=-1) > 1e-7 + return filters.uniform_filter(filters.uniform_filter(bin, size, float, mode='constant', cval=1) == 1, size, float, origin=-1) > 1e-7 def rb_closing(bin, size): - return filters.uniform_filter(filters.uniform_filter(bin, size, np.float) > 1e-7, size, mode='constant', cval=1, origin=-1) == 1 + return filters.uniform_filter(filters.uniform_filter(bin, size, float) > 1e-7, size, mode='constant', cval=1, origin=-1) == 1 def r_closing(bin, size): return filters.minimum_filter(filters.maximum_filter(bin, size), size, origin=-1) diff --git a/ocrd_cis/ocropy/ocropus_rtrain.py b/ocrd_cis/ocropy/ocropus_rtrain.py index fc34ad20..b1469e42 100644 --- a/ocrd_cis/ocropy/ocropus_rtrain.py +++ b/ocrd_cis/ocropy/ocropus_rtrain.py @@ -45,7 +45,7 @@ def resize_keep_ratio(image, baseheight): baseheight = 48 hpercent = (baseheight / float(image.size[1])) wsize = int((float(image.size[0] * float(hpercent)))) - image = image.resize((wsize, baseheight), Image.ANTIALIAS) + image = image.resize((wsize, baseheight), Image.LANCZOS) return image # make sure an output file has been set diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index a36fba2d..55d91cc5 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,40 +1,27 @@ from __future__ import absolute_import -import os.path +from logging import Logger +from sys import exit +from typing import Any, Optional +from os import access, R_OK +from os.path import abspath, dirname, isfile, join import numpy as np from PIL import Image -import Levenshtein - -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - coordinates_for_segment, - polygon_from_bbox, - points_from_polygon, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, TextEquivType, - CoordsType, GlyphType, WordType -) -from ocrd import Processor - -from .. 
import get_ocrd_tool +from rapidfuzz.distance import Levenshtein + +from ocrd_utils import coordinates_for_segment, points_from_polygon, polygon_from_bbox +from ocrd_models.ocrd_page import CoordsType, GlyphType, OcrdPage, TextEquivType, WordType +from ocrd import Processor, OcrdPageResult + +from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange -from .common import ( - pil2array, - check_line -) -TOOL = 'ocrd-cis-ocropy-recognize' def resize_keep_ratio(image, baseheight=48): scale = baseheight / image.height wsize = round(image.width * scale) - image = image.resize((wsize, baseheight), Image.ANTIALIAS) + image = image.resize((wsize, baseheight), Image.LANCZOS) return image, scale # from ocropus-rpred process1, but without input files and without lineest/dewarping @@ -58,8 +45,8 @@ def recognize(image, pad, network, check=True): pred = network.predictString(line) # getting confidence - result = lstm.translate_back(network.outputs, pos=1) - scale = len(raw_line.T)*1.0/(len(network.outputs)-2*pad) + result = lstm.translate_back(network.outputs, pos=1) # raw positions + scale = len(raw_line.T) * 1.0 / (len(network.outputs) - 2 * pad) clist = [] rlist = [] @@ -69,7 +56,7 @@ def recognize(image, pad, network, check=True): if c != 0: confid = network.outputs[r, c] c = network.l2s([c]) - r = (r-pad)*scale + r = (r - pad) * scale confidlist.append(confid) clist.append(c) @@ -79,20 +66,15 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): + network: Any + pad: int + + @property + def executable(self): + return 'ocrd-cis-ocropy-recognize' - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - self.pad = 16 # ocropus-rpred default - self.network = None # set in process - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyRecognize, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() - def setup(self): - self.logger = getLogger('processor.OcropyRecognize') + self.pad = 16 # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) for x in self.network.walk(): @@ -102,31 +84,37 @@ def setup(self): x.allocate(5000) def get_model(self): - """Search for the model file. First checks if - parameter['model'] is a valid readeable file and returns it. - If not, it checks if the model can be found in the + """Search for the model file. First checks if parameter['model'] can + be resolved with OcrdResourceManager to a valid readable file and + returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" - canread = lambda p: os.path.isfile(p) and os.access(p, os.R_OK) - model = self.parameter['model'] - if canread(model): - return model - ocropydir = os.path.dirname(os.path.abspath(__file__)) - path = os.path.join(ocropydir, 'models', model) - if canread(path): - return path - return model - - def process(self): - """Recognize lines / words / glyphs of the workspace. 
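A note on the ``import Levenshtein`` → ``from rapidfuzz.distance import Levenshtein`` swap at the top of this file's changes: only ``distance()`` is needed for the CER bookkeeping below, and rapidfuzz exposes it with the same call shape, so the replacement is drop-in. A minimal sketch of the computation it feeds (strings here invented for illustration):

    from rapidfuzz.distance import Levenshtein

    # edit distance between OCR output and ground truth, aggregated into CER
    ocr, gt = 'Tbe quick brown fox', 'The quick brown fox'
    edits = Levenshtein.distance(ocr, gt)
    cer = edits / len(gt)  # one substitution over 19 characters
    assert edits == 1 and round(cer, 3) == 0.053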
- - Open and deserialise each PAGE input file and its respective image, + canread = lambda p: isfile(p) and access(p, R_OK) + p_model = self.parameter['model'] + try: + model = self.resolve_resource(p_model) + if canread(model): + return model + except SystemExit: + ocropydir = dirname(abspath(__file__)) + path = join(ocropydir, 'models', p_model) + self.logger.info(f"Failed to resolve model with OCR-D/core mechanism, trying {path}") + if canread(path): + return path + self.logger.error( + f"Could not find model {p_model}. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {p_model}'") + exit(1) + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + """Recognize lines / words / glyphs of a page. + + Open and deserialize the PAGE input file and its respective image, then iterate over the element hierarchy down to the requested ``textequiv_level``. If any layout annotation below the line level already exists, then remove it (regardless of ``textequiv_level``). - Set up Ocropy to recognise each text line (via coordinates into + Set up Ocropy to recognize each text line (via coordinates into the higher-level image, or from the alternative image; the image - must have been binarised/grayscale-normalised, deskewed and dewarped + must have been binarized/grayscale-normalized, deskewed and dewarped already). Rescale and pad the image, then recognize. Create new elements below the line level, if necessary. @@ -139,105 +127,80 @@ def process(self): Levenshtein distance. Aggregate these scores for each file and print the line-wise and the total character error rates (CER). - Produce a new output file by serialising the resulting hierarchy. + Return the resulting OcrdPage. """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - maxlevel = self.parameter['textequiv_level'] - - # self.logger.info("Using model %s in %s for recognition", model) - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id) - - self.logger.info("Recognizing text in page '%s'", page_id) - # region, line, word, or glyph level: - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning("Page '%s' contains no text regions", page_id) - self.process_regions(regions, maxlevel, page_image, page_coords) - - # update METS (add the PAGE file): - file_id = make_file_id(input_file, self.output_file_grp) - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def process_regions(self, regions, maxlevel, page_image, page_coords): + max_level = self.parameter['textequiv_level'] + assert self.workspace + self.logger.debug(f'Max level: "{max_level}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + + page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id) +
self.logger.info(f"Recognizing text in page '{page_id}'") + # region, line, word, or glyph level: + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning(f"Page '{page_id}' contains no text regions") + self.process_regions(regions, max_level, page_image, page_xywh) + return OcrdPageResult(pcgts) + + def process_regions(self, regions, maxlevel, page_image, page_xywh): edits = 0 lengs = 0 for region in regions: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords) - - self.logger.info("Recognizing text in region '%s'", region.id) + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) + self.logger.info(f"Recognizing text in region '{region.id}'") textlines = region.get_TextLine() if not textlines: - self.logger.warning("Region '%s' contains no text lines", region.id) + self.logger.warning(f"Region '{region.id}' contains no text lines") else: - edits_, lengs_ = self.process_lines(textlines, maxlevel, region_image, region_coords) + edits_, lengs_ = self.process_lines(textlines, maxlevel, region_image, region_xywh) edits += edits_ lengs += lengs_ # update region text by concatenation for consistency - region_unicode = u'\n'.join(line.get_TextEquiv()[0].Unicode - if line.get_TextEquiv() - else u'' for line in textlines) + region_unicode = u'\n'.join( + line.get_TextEquiv()[0].Unicode if line.get_TextEquiv() else u'' for line in textlines) region.set_TextEquiv([TextEquivType(Unicode=region_unicode)]) if lengs > 0: self.logger.info('CER: %.1f%%', 100.0 * edits / lengs) - def process_lines(self, textlines, maxlevel, region_image, region_coords): + def process_lines(self, textlines, maxlevel, region_image, region_xywh): edits = 0 lengs = 0 for line in textlines: - line_image, line_coords = self.workspace.image_from_segment( - line, region_image, region_coords) - - self.logger.info("Recognizing text in line '%s'", line.id) + line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh) + self.logger.info(f"Recognizing text in line '{line.id}'") if line.get_TextEquiv(): linegt = line.TextEquiv[0].Unicode else: linegt = '' - self.logger.debug("GT '%s': '%s'", line.id, linegt) + self.logger.debug(f"GT '{line.id}': '{linegt}'") # remove existing annotation below line level: line.set_TextEquiv([]) line.set_Word([]) if line_image.size[1] < 16: - self.logger.debug("ERROR: bounding box is too narrow at line %s", line.id) + self.logger.debug(f"Error: bounding box is too narrow at line {line.id}") continue # resize image to 48 pixel height final_img, scale = resize_keep_ratio(line_image) # process ocropy: try: - linepred, clist, rlist, confidlist = recognize( - final_img, self.pad, self.network, check=True) + linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug('error processing line "%s": %s', line.id, err) + self.logger.debug(f'Error processing line "{line.id}": {str(err) or err.__class__.__name__}') continue - self.logger.debug("OCR '%s': '%s'", line.id, linepred) + self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) lengs += len(linegt) words = [x.strip() for x in linepred.split(' ') if x.strip()] - word_r_list = [[0]] # r-positions of every glyph in every word - word_conf_list = [[]] # confidences of every glyph in every word + word_r_list = [[0]] # r-positions of every glyph in every word + word_conf_list = [[]] # 
confidences of every glyph in every word if words != []: w_no = 0 found_char = False @@ -246,12 +209,10 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): found_char = True word_conf_list[w_no].append(confidlist[i]) word_r_list[w_no].append(rlist[i]) - if c == ' ' and found_char: if i == 0: word_r_list[0][0] = rlist[i] - - elif i+1 <= len(clist)-1 and clist[i+1] != ' ': + elif i + 1 <= len(clist) - 1 and clist[i + 1] != ' ': word_conf_list.append([]) word_r_list.append([rlist[i]]) w_no += 1 @@ -260,44 +221,38 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): word_r_list = [[0, line_image.width]] # conf for each word - wordsconf = [(min(x)+max(x))/2 for x in word_conf_list] + wordsconf = [(min(x) + max(x)) / 2 for x in word_conf_list] # conf for the line - line_conf = (min(wordsconf) + max(wordsconf))/2 + line_conf = (min(wordsconf) + max(wordsconf)) / 2 # line text - line.add_TextEquiv(TextEquivType( - Unicode=linepred, conf=line_conf)) + line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf)) if maxlevel in ['word', 'glyph']: for word_no, word_str in enumerate(words): word_points = points_from_polygon( coordinates_for_segment( np.array(polygon_from_bbox( - word_r_list[word_no][0] / scale, - 0, - word_r_list[word_no][-1] / scale, - 0 + line_image.height)), + word_r_list[word_no][0] / scale,0, + word_r_list[word_no][-1] / scale, 0 + line_image.height)), line_image, line_coords)) word_id = '%s_word%04d' % (line.id, word_no) word = WordType(id=word_id, Coords=CoordsType(word_points)) line.add_Word(word) - word.add_TextEquiv(TextEquivType( - Unicode=word_str, conf=wordsconf[word_no])) + word.add_TextEquiv(TextEquivType(Unicode=word_str, conf=wordsconf[word_no])) if maxlevel == 'glyph': for glyph_no, glyph_str in enumerate(word_str): glyph_points = points_from_polygon( coordinates_for_segment( np.array(polygon_from_bbox( - word_r_list[word_no][glyph_no] / scale, - 0, - word_r_list[word_no][glyph_no+1] / scale, - 0 + line_image.height)), + word_r_list[word_no][glyph_no] / scale, 0, + word_r_list[word_no][glyph_no + 1] / scale, 0 + line_image.height)), line_image, line_coords)) glyph_id = '%s_glyph%04d' % (word.id, glyph_no) glyph = GlyphType(id=glyph_id, Coords=CoordsType(glyph_points)) word.add_Glyph(glyph) - glyph.add_TextEquiv(TextEquivType( - Unicode=glyph_str, conf=word_conf_list[word_no][glyph_no])) + glyph.add_TextEquiv( + TextEquivType(Unicode=glyph_str, conf=word_conf_list[word_no][glyph_no])) return edits, lengs diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index d9d661b2..0fb133c0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,36 +1,29 @@ from __future__ import absolute_import -import os.path +from typing import Optional +from logging import Logger + import numpy as np -from skimage import draw -from shapely.geometry import Polygon, asPolygon, LineString +from skimage import draw, segmentation +from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from shapely.ops import unary_union -import alphashape -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, PageType -) -from ocrd import Processor from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, coordinates_for_segment, points_from_polygon, polygon_from_points, transform_coordinates, - MIMETYPE_PAGE ) +from ocrd_models.ocrd_page import BaselineType, PageType, OcrdPage +from ocrd 
import Processor, OcrdPageResult -from .. import get_ocrd_tool from .ocrolib import midrange, morph from .common import ( pil2array, odd, - # DSAVE, + DSAVE, + determine_zoom, # binarize, check_page, check_region, @@ -41,23 +34,21 @@ masks2polygons, polygon_for_parent, make_valid, - make_intersection + make_intersection, + join_baselines, + join_polygons, + diff_polygons ) -TOOL = 'ocrd-cis-ocropy-resegment' - class OcropyResegment(Processor): + @property + def executable(self): + return 'ocrd-cis-ocropy-resegment' - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] - kwargs['version'] = self.ocrd_tool['version'] - super().__init__(*args, **kwargs) - - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the line level. Next, get the page image according to the layout annotation (from @@ -96,7 +87,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropyResegment') # This makes best sense for bad/coarse line segmentation, like current GT # or as postprocessing for bbox-only steps like Tesseract. # Most notably, it can convert rectangles to polygons (polygonalization), @@ -107,87 +97,56 @@ def process(self): # accuracy crucially depends on a good estimate of the images' # pixel density (at least if source input is not 300 DPI). level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for n, input_file in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) + pcgts = input_pcgts[0] + page = pcgts.get_Page() - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID - page = pcgts.get_Page() + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi + ignore = (page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_ChartRegion() + + page.get_MapRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_SeparatorRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning(f'Page "{page_id}" contains no text regions') + elif level == 'page': + lines = [line for region in regions + for line in region.get_TextLine()] + if lines: + self._process_segment(page, page_image, page_coords, 
page_id, zoom, lines, ignore) else: - zoom = 1 - - ignore = (page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_GraphicRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_SeparatorRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - regions = page.get_AllRegions(classes=['Text']) - if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) - elif level == 'page': - lines = [line for region in regions - for line in region.get_TextLine()] + self.logger.warning(f'Page "{page_id}" contains no text regions with lines', ) + else: + for region in regions: + lines = region.get_TextLine() if lines: - self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - LOG.warning('Page "%s" contains no text regions with lines', page_id) - else: - for region in regions: - lines = region.get_TextLine() - if lines: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) - else: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) - - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') + return OcrdPageResult(pcgts) def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): - LOG = getLogger('processor.OcropyResegment') threshold = self.parameter['min_fraction'] - margin = self.parameter['extend_margins'] method = self.parameter['method'] + maxdist = self.parameter['spread'] / zoom * 300 / 72 # in pt # prepare line segmentation parent_array = pil2array(parent_image) #parent_array, _ = common.binarize(parent_array, maxskew=0) # just in case still raw - parent_bin = np.array(parent_array <= midrange(parent_array), np.bool) - ignore_bin = np.ones_like(parent_bin, np.bool) + parent_bin = np.array(parent_array <= midrange(parent_array), bool) + ignore_bin = np.ones_like(parent_bin, bool) if isinstance(parent, PageType): tag = 'page' fullpage = True @@ -197,55 +156,53 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l fullpage = False report = check_region(parent_bin, zoom) if report: - LOG.warning('Invalid %s "%s": %s', tag, - page_id if fullpage else parent.id, report) + self.logger.warning(f'Invalid {tag} "{page_id if fullpage else parent.id}": {report}') return # get existing line labels: - line_labels = np.zeros_like(parent_bin, np.bool) + line_labels = np.zeros_like(parent_bin, bool) line_labels = np.tile(line_labels[np.newaxis], (len(lines), 1, 1)) line_polygons = [] - for i, segment in enumerate(lines): - segment_polygon = 
coordinates_of_segment(segment, parent_image, parent_coords) - segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) - line_polygons.append(prep(segment_polygon)) - segment_polygon = np.array(segment_polygon.exterior, np.int)[:-1] - # draw.polygon: If any segment_polygon lies outside of parent + for i, line in enumerate(lines): + if self.parameter['baseline_only'] and line.Baseline: + line_base = baseline_of_segment(line, parent_coords) + line_poly = polygon_from_baseline(line_base, 30 / zoom) + else: + line_poly = coordinates_of_segment(line, parent_image, parent_coords) + line_poly = make_valid(Polygon(line_poly)) + line_polygons.append(line_poly) + line_polygons = list(map(prep, line_polygons)) + for i, line_polygon in enumerate(line_polygons): + polygon = np.array(line_polygon.context.exterior.coords, int)[:-1] + # draw.polygon: If any line_polygon lies outside of parent # (causing negative/above-max indices), either fully or partially, # then this will silently ignore them. The caller does not need # to concern herself with this. - segment_y, segment_x = draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - parent_bin.shape) - line_labels[i, segment_y, segment_x] = True + line_y, line_x = draw.polygon(polygon[:, 1], polygon[:, 0], parent_bin.shape) + line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines - for i, segment in enumerate(set(line.parent_object_ for line in lines)): - LOG.debug('unmasking area of text region "%s" for "%s"', - segment.id, page_id if fullpage else parent.id) - segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) - segment_polygon = np.array(segment_polygon.exterior, np.int)[:-1] - ignore_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - parent_bin.shape)] = False + for i, region in enumerate(set(line.parent_object_ for line in lines)): + self.logger.debug(f'Unmasking area of text region "{region.id}" for "{page_id if fullpage else parent.id}"') + region_polygon = coordinates_of_segment(region, parent_image, parent_coords) + region_polygon = make_valid(Polygon(region_polygon)) + region_polygon = np.array(region_polygon.exterior.coords, int)[:-1] + ignore_bin[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): - LOG.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], - segment.id, page_id if fullpage else parent.id) + self.logger.debug(f'Masking area of {type(segment).__name__[:-4]} "{segment.id}" for ' + f'"{page_id if fullpage else parent.id}"') segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - ignore_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - parent_bin.shape)] = True + ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': - LOG.debug('calculating connected component and distance transforms for "%s"', parent.id) + self.logger.debug(f'Calculating connected component and distance transforms for "{parent.id}"') bin = parent_bin & ~ ignore_bin components, _ = morph.label(bin) # estimate glyph scale (roughly) _, counts = np.unique(components, return_counts=True) if counts.shape[0] > 1: counts = np.sqrt(3 * counts) - scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) - components *= (counts > 15/zoom)[components] - 
LOG.debug("estimated scale: %d", scale) + scale = int(np.median(counts[(5 / zoom < counts) & (counts < 100 / zoom)])) + components *= (counts > 15 / zoom)[components] + self.logger.debug(f"Estimated scale: {scale}") else: scale = 43 if method == 'ccomps': @@ -254,175 +211,203 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for i, label in enumerate(labels): distances[i] = morph.dist_labels(label.astype(np.uint8)) # normalize the distances of all lines so larger ones do not displace smaller ones - distances[i] = distances[i] / distances[i].max() * 255 + if distances[i].any(): + distances[i] = distances[i] / distances[i].max() * 255 # use depth to flatten overlapping lines as seed labels new_labels = np.argmax(distances, axis=0) else: + # 'baseline' new_labels = np.zeros_like(parent_bin, np.uint8) for i, line in enumerate(lines): if line.Baseline is None: - LOG.warning("Skipping '%s' without baseline", line.id) + self.logger.warning(f"Skipping '{line.id}' without baseline") new_labels[line_labels[i]] = i + 1 continue - line_polygon = baseline_of_segment(line, parent_coords) - line_ltr = line_polygon[0,0] < line_polygon[-1,0] - line_polygon = make_valid(join_polygons(LineString(line_polygon).buffer( - # left-hand side if left-to-right, and vice versa - scale * (-1) ** line_ltr, single_sided=True), loc=line.id)) - line_polygon = np.array(line_polygon.exterior, np.int)[:-1] + line_baseline = baseline_of_segment(line, parent_coords) + line_polygon = polygon_from_baseline(line_baseline, maxdist or scale/2) + line_polygon = np.array(line_polygon.exterior.coords, int)[:-1] line_y, line_x = draw.polygon(line_polygon[:, 1], line_polygon[:, 0], parent_bin.shape) new_labels[line_y, line_x] = i + 1 - spread_dist(lines, line_labels, new_labels, parent_bin, components, parent_coords, - scale=scale, loc=parent.id, threshold=threshold) + spread_dist(self.logger, lines, line_labels, new_labels, parent_bin, components, parent_coords, + maxdist=maxdist or scale / 2, loc=parent.id, threshold=threshold) return try: - new_line_labels, _, _, _, _, scale = compute_segmentation( - parent_bin, seps=ignore_bin, zoom=zoom, fullpage=fullpage, - maxseps=0, maxcolseps=len(ignore), maximages=0) + # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) + new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( + parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale / 2, + fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: - LOG.warning('Cannot line-segment %s "%s": %s', - tag, page_id if fullpage else parent.id, err) + self.logger.error(f'Cannot line-segment {tag} "{page_id if fullpage else parent.id}": {err}') return - LOG.info("Found %d new line labels for %d existing lines on %s '%s'", - new_line_labels.max(), len(lines), tag, parent.id) + self.logger.info( + f"Found {new_line_labels.max()} new line labels for {len(lines)} existing lines on {tag} '{parent.id}'") # polygonalize and prepare comparison new_line_polygons, new_line_labels = masks2polygons( - new_line_labels, parent_bin, '%s "%s"' % (tag, parent.id), - min_area=640/zoom/zoom) - # DSAVE('line_labels', [np.mean(line_labels, axis=0), parent_bin]) - # DSAVE('new_line_labels', [new_line_labels, parent_bin], disabled=False) - new_line_polygons = [make_valid(Polygon(line_poly)) - for line_label, line_poly in new_line_polygons] + self.logger, new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', + min_area=640 / zoom / zoom) + 
DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) + DSAVE('new_line_labels', [new_line_labels, parent_bin]) + new_line_polygons, new_baselines = list(zip( + *[(Polygon(poly), LineString(base)) for _, poly, base in new_line_polygons])) or ([], []) # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line - fits_bg = np.zeros((len(new_line_polygons), len(line_polygons)), np.float) - fits_fg = np.zeros((len(new_line_polygons), len(line_polygons)), np.float) + fits_bg = np.zeros((len(new_line_polygons), len(line_polygons)), float) + fits_fg = np.zeros((len(new_line_polygons), len(line_polygons)), float) # ratio of overlap between intersection and existing line - covers_bg = np.zeros((len(new_line_polygons), len(line_polygons)), np.float) - covers_fg = np.zeros((len(new_line_polygons), len(line_polygons)), np.float) + covers_bg = np.zeros((len(new_line_polygons), len(line_polygons)), float) + covers_fg = np.zeros((len(new_line_polygons), len(line_polygons)), float) # compare segmentations, calculating ratios of overlapping fore/background area for i, new_line_poly in enumerate(new_line_polygons): for j, line_poly in enumerate(line_polygons): # too strict: .contains - if line_poly.intersects(new_line_poly): - inter = make_intersection(line_poly.context, new_line_poly) - if not inter: - continue - new_line_mask = (new_line_labels == i+1) & parent_bin - line_mask = line_labels[j] & parent_bin - inter_mask = new_line_mask & line_mask - if (not np.count_nonzero(inter_mask) or + if not line_poly.intersects(new_line_poly): + continue + inter = make_intersection(line_poly.context, new_line_poly) + if not inter: + continue + new_line_mask = (new_line_labels == i + 1) & parent_bin + line_mask = line_labels[j] & parent_bin + inter_mask = new_line_mask & line_mask + if (not np.count_nonzero(inter_mask) or not np.count_nonzero(new_line_mask) or not np.count_nonzero(line_mask)): - continue - intersections[(i, j)] = inter - fits_bg[i, j] = inter.area / new_line_poly.area - covers_bg[i, j] = inter.area / line_poly.context.area - fits_fg[i, j] = np.count_nonzero(inter_mask) / np.count_nonzero(new_line_mask) - covers_fg[i, j] = np.count_nonzero(inter_mask) / np.count_nonzero(line_mask) - # LOG.debug("new %d old %d (%s): %.1f%% / %.1f%% bg, %.1f%% / %.1f%% fg", - # i, j, lines[j].id, - # fits_bg[i,j]*100, covers_bg[i,j]*100, - # fits_fg[i,j]*100, covers_fg[i,j]*100) - # assign new lines to existing lines, if possible - assignments = np.ones(len(new_line_polygons), np.int) * -1 - for i, new_line_poly in enumerate(new_line_polygons): - if not fits_bg[i].any(): - LOG.debug("new line %d fits no existing line's background", i) - continue - if not fits_fg[i].any(): - LOG.debug("new line %d fits no existing line's foreground", i) - continue - fits = (fits_bg[i] > 0.6) & (fits_fg[i] > 0.9) - if not fits.any(): - j = np.argmax(fits_bg[i] * fits_fg[i]) - LOG.debug("best fit '%s' for new line %d fits only %.1f%% bg / %.1f%% fg", - lines[j].id, i, fits_bg[i,j] * 100, fits_fg[i,j] * 100) + continue + intersections[(i, j)] = inter + fits_bg[i, j] = inter.area / new_line_poly.area + covers_bg[i, j] = inter.area / line_poly.context.area + fits_fg[i, j] = np.count_nonzero(inter_mask) / np.count_nonzero(new_line_mask) + covers_fg[i, j] = np.count_nonzero(inter_mask) / np.count_nonzero(line_mask) + # LOG.debug("new %d old %d (%s): %.1f%% / %.1f%% bg, %.1f%% / %.1f%% fg", + # i, j, lines[j].id, + # fits_bg[i,j]*100, covers_bg[i,j]*100, + # 
fits_fg[i,j]*100, covers_fg[i,j]*100) + # assign existing lines to new lines (1:n), if possible + # start from best matches (forced alignment) + dim1 = len(new_line_polygons) + dim2 = len(line_polygons) + idx1 = np.arange(dim1) + idx2 = np.arange(dim2) + keep1 = np.ones(dim1, bool) + keep2 = np.ones(dim2, bool) + assignments = -1 * np.ones(dim1, int) + for _ in range(dim1): + fit_bg_view = fits_bg[np.ix_(keep1, keep2)] + if not fit_bg_view.size: + break + cov_bg_view = covers_bg[np.ix_(keep1, keep2)] + fit_fg_view = fits_fg[np.ix_(keep1, keep2)] + cov_fg_view = covers_fg[np.ix_(keep1, keep2)] + priority = cov_fg_view * cov_bg_view + ind1, ind2 = np.unravel_index(np.argmax(priority, axis=None), priority.shape) + fit_fg = fit_fg_view[ind1, ind2] + fit_bg = fit_bg_view[ind1, ind2] + cov_fg = cov_fg_view[ind1, ind2] + cov_bg = cov_bg_view[ind1, ind2] + # return to full view and assign next + ind1 = idx1[keep1][ind1] + ind2 = idx2[keep2][ind2] + #new_poly = new_line_polygons[ind1] + #poly = line_polygons[ind2] + # assignment must be new + assert assignments[ind1] < 0 + assert keep1[ind1] + assert keep2[ind2] + # minimum threshold + if not (fit_bg > 0.6 and fit_fg > 0.7): + # skip next time + # LOG.debug("match for %s too large: %d%%fg / %d%%bg", lines[ind2].id, fit_fg*100, fit_bg*100) + covers_bg[ind1, ind2] = 0 + covers_fg[ind1, ind2] = 0 continue - covers = covers_bg[i] * covers_fg[i] * fits - j = np.argmax(covers) - line = lines[j] - inter_polygon = intersections[(i,j)] - new_line_polygon = new_line_polygons[i] - new_center = inter_polygon.centroid - center = new_line_polygon.centroid - # FIXME: apply reasonable threshold for centroid distance - LOG.debug("new line for '%s' has centroid distance %.2f", - line.id, center.distance(new_center)) - assignments[i] = j + assignments[ind1] = ind2 + keep1[ind1] = False + #keep2[ind2] = False # validate assignments retain enough area and do not loose unassigned matches - line_polygons = [poly.context.buffer(-margin) for poly in line_polygons] for j, line in enumerate(lines): new_lines = np.nonzero(assignments == j)[0] if not np.prod(new_lines.shape): - LOG.debug("no lines for '%s' match or fit", line.id) + self.logger.debug(f"no lines for '{line.id}' match or fit") continue - covers = np.sum(covers_bg[new_lines,j]) + covers = np.sum(covers_bg[new_lines, j]) if covers < threshold / 3: - LOG.debug("new lines for '%s' only cover %.1f%% bg", - line.id, covers * 100) + self.logger.debug(f"new lines for '{line.id}' only cover {covers * 100:.1f}% bg") continue - covers = np.sum(covers_fg[new_lines,j]) + covers = np.sum(covers_fg[new_lines, j]) if covers < threshold: - LOG.debug("new lines for '%s' only cover %.1f%% fg", - line.id, covers * 100) + self.logger.debug(f"new lines for '{line.id}' only cover {covers * 100:.1f}% fg") continue - looses = (assignments < 0) & (covers_bg[:,j] > 0.1) + looses = (assignments < 0) & (covers_bg[:, j] > 0.1) if looses.any(): - covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) - LOG.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", - line.id, np.count_nonzero(looses), covers * 100) + covers = np.sum(covers_bg[np.nonzero(looses)[0], j]) + self.logger.debug( + f"new lines for '{line.id}' would lose {np.count_nonzero(looses)} non-matching segments " + f"totalling {covers * 100:.1f}% bg") continue line_count = np.count_nonzero(line_labels[j] & parent_bin) new_count = covers * line_count - LOG.debug('Black pixels before/after resegment of line "%s": %d/%d', - line.id, line_count,
new_count) + self.logger.debug(f'Black pixels before/after resegment of line "{line.id}": {line_count}/{new_count}') # combine all assigned new lines to single outline polygon if len(new_lines) > 1: - LOG.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) - new_polygon = join_polygons([intersections[(i, j)] for i in new_lines], loc=line.id) - line_polygons[j] = new_polygon + self.logger.debug(f"joining {len(new_lines)} new line polygons for '{line.id}'") + # intersections[(i, j)] + new_polygon = join_polygons([new_line_polygons[i] for i in new_lines], loc=line.id, scale=scale) + new_baseline = join_baselines( + self.logger, [new_polygon.intersection(new_baselines[i]) for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: - line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], - parent_image, parent_coords) + line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, line.parent_object_) if line_polygon is None: - LOG.warning("Ignoring extant new polygon for line '%s'", line.id) + self.logger.warning(f"Ignoring extant new polygon for line '{line.id}'") return # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) + if new_baseline is not None: + new_baseline = coordinates_for_segment(new_baseline.coords, parent_image, parent_coords) + line.set_Baseline(BaselineType(points=points_from_polygon(new_baseline))) + line_polygons[j] = prep(new_polygon) # now also ensure the assigned lines do not overlap other existing lines for i in new_lines: for otherj in np.nonzero(fits_fg[i] > 0.1)[0]: if j == otherj: continue otherline = lines[otherj] - LOG.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) - other_polygon = diff_polygons(line_polygons[otherj], new_polygon) + self.logger.debug(f"subtracting new '{line.id}' from overlapping '{otherline.id}'") + other_polygon = diff_polygons(line_polygons[otherj].context, new_polygon) if other_polygon.is_empty: continue # convert back to absolute (page) coordinates: - other_polygon = coordinates_for_segment(other_polygon.exterior.coords[:-1], - parent_image, parent_coords) + other_polygon = coordinates_for_segment( + other_polygon.exterior.coords[:-1], parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: - LOG.warning("Ignoring extant new polygon for line '%s'", otherline.id) + self.logger.warning(f"Ignoring extant new polygon for line '{otherline.id}'") continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) -def spread_dist(lines, old_labels, new_labels, binarized, components, coords, - scale=43, loc='', threshold=0.9): + +def spread_dist( + logger: Logger, lines, old_labels, new_labels, binarized, components, coords, maxdist=43, loc='', + threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" - LOG = getLogger('processor.OcropyResegment') - # allocate to connected components consistently (by majority, - # ignoring smallest components like punctuation) - #new_labels = morph.propagate_labels_majority(binarized, new_labels) - new_labels = morph.propagate_labels_majority(components > 0, new_labels) + DSAVE('seeds', [new_labels, (components>0)]) + # allocate to connected components consistently + # (ignoring the smallest components like punctuation) + # but when there are conflicts, meet in 
the middle via watershed + new_labels2 = morph.propagate_labels(components > 0, new_labels, conflict=0) + new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=(components > 0)) + DSAVE('propagated', new_labels2) # dilate/grow labels from connected components against each other and bg - new_labels = morph.spread_labels(new_labels, maxdist=scale/2) + new_labels = morph.spread_labels(new_labels2, maxdist=maxdist) + DSAVE('spread', new_labels) + # now propagate again to catch the smallest components like punctuation + new_labels2 = morph.propagate_labels(binarized, new_labels, conflict=0) + new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=binarized) + DSAVE('propagated-again', [new_labels2, binarized & (new_labels2==0)]) + new_labels = morph.spread_labels(new_labels2, maxdist=maxdist/4) + DSAVE('spread-again', [new_labels, binarized]) # find polygon hull and modify line coords for i, line in enumerate(lines): new_label = new_labels == i + 1 @@ -431,90 +416,50 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, continue count = np.count_nonzero(old_label) if not count: - LOG.warning("skipping zero-area line '%s'", line.id) + logger.warning(f"skipping zero-area line '{line.id}'") continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - LOG.debug("new line for '%s' only covers %.1f%% bg", - line.id, covers * 100) + logger.debug(f"new line for '{line.id}' only covers %.1f%% bg", covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: - LOG.warning("skipping binarizy-empty line '%s'", line.id) + logger.warning(f"skipping binary-empty line '{line.id}'") continue covers = np.count_nonzero(new_label * binarized) / count if covers < threshold: - LOG.debug("new line for '%s' only covers %.1f%% fg", - line.id, covers * 100) + logger.debug(f"new line for '{line.id}' only covers %.1f%% fg", covers * 100) continue - LOG.debug('Black pixels before/after resegment of line "%s": %d/%d', - line.id, count, covers * count) - contours = [contour[:,::-1] # get x,y order again + logger.debug(f'Black pixels before/after resegment of line "{line.id}": {count}/{covers * count}') + contours = [contour[:, :: -1] # get x,y order again for contour, area in morph.find_contours(new_label)] #LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: - LOG.warning("no contours for %s - keeping", line.id) + logger.warning(f"no contours for {line.id} - keeping") continue else: # get alpha shape - poly = join_polygons([make_valid(Polygon(contour)) - for contour in contours], loc=line.id) + poly = join_polygons( + [make_valid(Polygon(contour)) for contour in contours if len(contour) >= 4], + loc=line.id, scale=maxdist) poly = poly.exterior.coords[:-1] polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) if polygon is None: - LOG.warning("Ignoring extant line for %s", line.id) + logger.warning(f"Ignoring extant line for {line.id}") continue line.get_Coords().set_points(points_from_polygon(polygon)) -def diff_polygons(poly1, poly2): - poly = poly1.difference(poly2) - if poly.type == 'MultiPolygon': - poly = poly.convex_hull - if poly.minimum_clearance < 1.0: - poly = asPolygon(np.round(poly.exterior.coords)) - poly = make_valid(poly) - return poly - -def join_polygons(polygons, loc=''): - """construct concave hull (alpha shape) from input polygons""" - # compoundp = unary_union(polygons) - # jointp = 
compoundp.convex_hull - LOG = getLogger('processor.OcropyResegment') - if len(polygons) == 1: - return polygons[0] - # get equidistant list of points along hull - # (otherwise alphashape will jump across the interior) - points = [poly.exterior.interpolate(dist).coords[0] # .xy - for poly in polygons - for dist in np.arange(0, poly.length, 5.0)] - #alpha = alphashape.optimizealpha(points) # too slow - alpha = 0.05 - jointp = alphashape.alphashape(points, alpha) - tries = 0 - # from descartes import PolygonPatch - # import matplotlib.pyplot as plt - while jointp.type in ['MultiPolygon', 'GeometryCollection']: - # plt.figure() - # plt.gca().scatter(*zip(*points)) - # for geom in jointp.geoms: - # plt.gca().add_patch(PolygonPatch(geom, alpha=0.2)) - # plt.show() - alpha *= 0.7 - tries += 1 - if tries > 10: - LOG.warning("cannot find alpha for concave hull on '%s'", loc) - alpha = 0 - jointp = alphashape.alphashape(points, alpha) - if jointp.minimum_clearance < 1.0: - # follow-up calculations will necessarily be integer; - # so anticipate rounding here and then ensure validity - jointp = asPolygon(np.round(jointp.exterior.coords)) - jointp = make_valid(jointp) - return jointp - # zzz should go into core ocrd_utils def baseline_of_segment(segment, coords): line = np.array(polygon_from_points(segment.get_Baseline().points)) line = transform_coordinates(line, coords['transform']) return np.round(line).astype(np.int32) + +# zzz should go into core ocrd_utils +def polygon_from_baseline(baseline, scale): + ltr = baseline[0, 0] < baseline[-1, 0] + # left-hand side if left-to-right, and vice versa + polygon = make_valid(join_polygons( + [LineString(baseline).buffer(scale * (-1) ** ltr, single_sided=True)], scale=scale)) + return polygon diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index b782fdde..493deb30 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,24 +1,37 @@ from __future__ import absolute_import -import os.path +from typing import Optional +from logging import Logger +import itertools + import numpy as np +from scipy.sparse.csgraph import minimum_spanning_tree from skimage import draw from skimage.morphology import convex_hull_image import cv2 -from shapely.geometry import Polygon, asPolygon +from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from shapely.ops import unary_union +from shapely.ops import unary_union, nearest_points +from shapely.validation import explain_validity +from shapely import set_precision -from ocrd_modelfactory import page_from_file +from ocrd_utils import ( + coordinates_of_segment, + coordinates_for_segment, + points_from_polygon, + polygon_from_points, +) from ocrd_models.ocrd_page import ( - to_xml, CoordsType, + CoordsType, TextLineType, TextRegionType, SeparatorRegionType, PageType, - AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd_models.ocrd_page_generateds import ( + BaselineType, TableRegionType, ImageRegionType, RegionRefType, @@ -30,124 +43,216 @@ ReadingOrderType ) from ocrd import Processor -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - coordinates_of_segment, - coordinates_for_segment, - points_from_polygon, - polygon_from_points, - MIMETYPE_PAGE -) +from ocrd.processor import OcrdPageResult, OcrdPageResultImage -from .. 
import get_ocrd_tool from .ocrolib import midrange from .ocrolib import morph from .common import ( pil2array, array2pil, check_page, check_region, + determine_zoom, hmerge_line_seeds, compute_segmentation, lines2regions ) -TOOL = 'ocrd-cis-ocropy-segment' -def masks2polygons(bg_labels, fg_bin, name, min_area=None, simplify=None): +def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, + reorder=True): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, + (optionally) a Numpy array of a scalar field ``baselines``, and a Numpy array of the foreground ``fg_bin``, iterate through all labels (except zero and those labels which do not correspond to any foreground at all) to find - their outer contours. Each contour part which is not too - small and gives a (simplified) polygon of at least 4 points - becomes a polygon. (Thus, labels can be split into multiple - polygons.) + their outer contours and inner baselines. + Each contour part which is not too small and gives a + (simplified) polygon of at least 4 points becomes a polygon. + (Thus, labels can be split into multiple polygons.) Return a tuple: - - these polygons as a list of label, polygon tuples, and + - these polygons as a list of label, polygon, baseline tuples, and - a Numpy array of new background labels for that list. """ - LOG = getLogger('processor.OcropySegment') + # find sharp baseline + if baselines is not None: + def getx(xy): + return xy[0] + + baselines = [LineString(sorted([p[::-1] for p in line], key=getx)).simplify(5) + for line in baselines + if len(line) >= 2] results = list() result_labels = np.zeros_like(bg_labels, dtype=bg_labels.dtype) for label in np.unique(bg_labels): if not label: # ignore if background continue - bg_mask = np.array(bg_labels == label, np.uint8) + bg_mask = np.array(bg_labels == label, bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground - LOG.debug('skipping label %d in %s due to empty fg', - label, name) + logger.debug(f'Skipping label {label} in {name} due to empty fg') continue # simplify to convex hull if simplify is not None: - hull = convex_hull_image(bg_mask).astype(np.uint8) - conflicts = np.setdiff1d((hull>0) * simplify, - (bg_mask>0) * simplify) + hull = convex_hull_image(bg_mask.astype(np.uint8)).astype(bool) + conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): - LOG.debug('Cannot simplify %d: convex hull would create additional intersections %s', - label, str(conflicts)) + logger.debug( + f'Cannot simplify {label}: convex hull would create additional intersections {str(conflicts)}') else: bg_mask = hull - # find outer contour (parts): - contours, _ = cv2.findContours(bg_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + if open_holes: + # def plot_poly(contour, color): + # import matplotlib.pyplot as plt + # from matplotlib.patches import Polygon as PolygonPatch + # plt.figure() + # plt.imshow(fg_bin) + # plt.gca().scatter(*zip(*contour[:,0])) + # plt.gca().add_patch(PolygonPatch(contour[:,0], alpha=0.5, color=color, closed=False)) + # plt.show() + # find outer contour (parts) plus direct holes (if any) + contours = [] + cont, hier = cv2.findContours(bg_mask.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + idx = 0 + while idx >= 0: + contour = cont[idx] + if len(contour) < 3: + idx = hier[0, idx, 0] + continue + #plot_poly(contour, 'red') + idx_hole = hier[0, idx, 2] + while idx_hole >= 0: + hole = 
cont[idx_hole] + if len(hole) < 3: + idx_hole = hier[0, idx_hole, 0] + continue + logger.debug( + f"Label {label} contour {idx} [{len(contour)} pts] has hole {idx_hole} [{len(hole)} pts]") + #plot_poly(hole, 'blue') + # cut child from outside... + # first get nearest point on child + hole_idx = np.argmin([cv2.pointPolygonTest(contour, tuple(pt[0].tolist()), True) + for pt in hole]) + # now get nearest point on parent + # (we cannot use PolygonTest directly, because we must also interpolate + # to prevent crossing edges; at least each 10px) + contour = np.append(contour, contour[0:1], axis=0) + contour2 = np.diff(contour, axis=0) + contourtics = np.maximum(1, np.linalg.norm(contour2, axis=2).astype(int)[:,0] // 10) + interpol = [] + for i, ntics in enumerate(contourtics): + interpol.extend(np.array( + contour[i:i + 1] + + contour2[i:i + 1] * + np.linspace(0, 1, ntics)[:, np.newaxis, np.newaxis], int)) + interpol.append(contour[-1]) + interpol = np.array(interpol) + contourtics = np.insert(np.cumsum(contourtics), 0, 0) + assert np.all(contour == interpol[contourtics]) + interpol_idx = np.linalg.norm(interpol - hole[hole_idx], axis=2).argmin() + contour_idx = np.searchsorted(contourtics, interpol_idx) + if interpol_idx in contourtics: + contour_idx2 = contour_idx + 1 + else: + contour_idx2 = contour_idx + if contour_idx2 >= len(contour): + contour_idx2 = 0 + cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx + 1] + if interpol_idx == 0: + diff1 = (interpol[-1:] - cispoint1) // 5 + else: + diff1 = (interpol[interpol_idx - 1: interpol_idx] - cispoint1) // 5 + if interpol_idx + 1 >= len(interpol): + diff2 = (interpol[0:1] - cispoint2) // 5 + else: + diff2 = (interpol[interpol_idx + 1: interpol_idx + 2] - cispoint2) // 5 + cispoint1 = cispoint1 + diff1 + cispoint2 = cispoint2 + diff2 + logger.debug(f"Stitching at interpolation pos {interpol_idx} hole pos {hole_idx}") + # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) + # (this works, because inner contours have inverse direction) + contour = np.concatenate( + [contour[:contour_idx], cispoint1, + hole[hole_idx:], hole[:hole_idx], + cispoint2, contour[contour_idx:]]) + #plot_poly(contour, 'green') + idx_hole = hier[0, idx_hole, 0] + #plot_poly(contour, 'red') + logger.debug(f"Adding label {label} contour {idx} [{len(contour)} pts]") + contours.append(contour) + idx = hier[0, idx, 0] + else: + # find outer contour (parts): + contours, _ = cv2.findContours(bg_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # determine areas of parts: areas = [cv2.contourArea(contour) for contour in contours] total_area = sum(areas) if not total_area: # ignore if too small continue - # sort contours in reading order + # redraw label array contour_labels = np.zeros_like(bg_mask, np.uint8) for i, contour in enumerate(contours): - cv2.drawContours(contour_labels, contours[i:i+1], -1, i+1, cv2.FILLED) - order = np.argsort(morph.reading_order(contour_labels)[1:]) + cv2.drawContours(contour_labels, contours, i, i+1, cv2.FILLED) + if reorder: + # sort contours in reading order + order = np.argsort(morph.reading_order(contour_labels)[1:]) + else: + order = range(len(contours)) # convert to polygons for i in order: contour = contours[i] area = areas[i] if min_area and area < min_area and area / total_area < 0.1: - LOG.warning('Label %d contour %d is too small (%d/%d) in %s', - label, i, area, total_area, name) + logger.warning(f'Label {label} contour {i} is too small ({area}/{total_area}) in {name}') 
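The hole-opening branch above leans on OpenCV's two-level contour hierarchy (RETR_CCOMP), where each top-level entry is an outer contour and its children are its direct holes. A minimal standalone sketch of that hierarchy walk, with illustrative names (not part of ocrd_cis):

import cv2
import numpy as np

def outer_contours_with_holes(mask):
    """Pair each outer contour of a binary mask with its direct holes."""
    cont, hier = cv2.findContours(mask.astype(np.uint8),
                                  cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    if hier is None:
        return []  # no foreground at all
    results = []
    idx = 0
    while idx >= 0:  # iterate over top-level (outer) contours
        holes = []
        idx_hole = hier[0, idx, 2]  # first child (hole), if any
        while idx_hole >= 0:
            holes.append(cont[idx_hole])
            idx_hole = hier[0, idx_hole, 0]  # next sibling hole
        results.append((cont[idx], holes))
        idx = hier[0, idx, 0]  # next sibling outer contour
    return results

The stitching step then cuts each outer contour open at the point nearest to its hole and splices the hole's vertices in; since inner contours run in the opposite orientation, the joined ring stays consistently oriented.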
continue # simplify shape: # can produce invalid (self-intersecting) polygons: #polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y - polygon = contour[:, 0, ::] # already ordered x,y + polygon = contour[:, 0, ::] # already ordered x,y # simplify and validate: polygon = Polygon(polygon) - for tolerance in range(2, int(area)): - polygon = polygon.simplify(tolerance) - if polygon.is_valid: - break - polygon = polygon.exterior.coords[:-1] # keep open - if len(polygon) < 4: - LOG.warning('Label %d contour %d has less than 4 points for %s', - label, i, name) + if not polygon.is_valid: + #logger.debug(polygon.wkt) + logger.debug(explain_validity(polygon)) + polygon = make_valid(polygon) + if not polygon.is_valid: + #LOG.debug(polygon.wkt) + logger.warning(explain_validity(polygon)) + poly = polygon.exterior.coords[:-1] # keep open + if len(poly) < 4: + logger.warning(f'Label {label} contour {i} for {name} has less than 4 points') continue - results.append((label, polygon)) - result_labels[contour_labels == i+1] = len(results) + # get baseline segments intersecting with this line mask + # and concatenate them from left to right + if baselines is not None: + base = join_baselines( + logger, + [baseline.intersection(polygon) for baseline in baselines if baseline.intersects(polygon)], name) + if base is not None: + base = base.coords + else: + base = None + results.append((label, poly, base)) + result_labels[contour_labels == i + 1] = len(results) return results, result_labels -class OcropySegment(Processor): - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropySegment, self).__init__(*args, **kwargs) +class OcropySegment(Processor): + @property + def executable(self): + return 'ocrd-cis-ocropy-segment' - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or regions into lines. - - Open and deserialise PAGE input files and their respective images, + + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested level. - + + \b Depending on ``level-of-operation``, consider existing segments: - If ``overwrite_separators=True`` on ``page`` level, then delete any SeparatorRegions. @@ -160,39 +265,41 @@ def process(self): - If ``overwrite_order=True`` on ``page`` or ``table`` level, then delete the reading order OrderedGroup entry corresponding to the (page/table) segment. - + Next, get each element image according to the layout annotation (from the alternative image of the page/region, or by cropping via coordinates into the higher-level image) in binarized form, and represent it as an array with non-text regions and (remaining) text neighbours suppressed. - + + \b Then compute a text line segmentation for that array (as a label mask). When ``level-of-operation`` is ``page`` or ``table``, this also entails detecting - up to ``maximages`` large foreground images, - - up to ``maxseps`` foreground h/v-line separators and + - up to ``maxseps`` foreground line separators and - up to ``maxcolseps`` background column separators before text line segmentation itself, as well as aggregating text lines to text regions afterwards. 
- + Text regions are detected via a hybrid variant recursive X-Y cut algorithm (RXYC): RXYC partitions the binarized image in top-down manner by detecting horizontal or vertical gaps. This implementation uses the bottom-up text line segmentation to guide the search, and also uses both pre-existing and newly detected separators to alternatively partition the respective boxes into non-rectangular parts. - + During line segmentation, suppress the foreground of all previously annotated regions (of any kind) and lines, except if just removed due to ``overwrite``. During region aggregation however, combine the existing separators with the new-found separators to guide the column search. - + All detected segments (both text line and text region) are sorted according to their reading order (assuming a top-to-bottom, left-to-right ordering). When ``level-of-operation`` is ``page``, prefer vertical (column-first) succession of regions. When it is ``table``, prefer horizontal (row-first) succession of cells. - + + \b Then for each resulting segment label, convert its background mask into polygon outlines by finding the outer contours consistent with the element's polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: @@ -204,10 +311,9 @@ def process(self): - If it is ``page``, then append the new lines to their respective regions, and append the new regions to the page. (Also, create an OrderedGroup for it in the ReadingOrder.) - + Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropySegment') # FIXME: allow passing a-priori info on reading order / textline order # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture # of different scripts; also, vertical writing needs internal rotation @@ -218,222 +324,196 @@ def process(self): overwrite_order = self.parameter['overwrite_order'] oplevel = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - # TODO: also allow grayscale_normalized (try/except?) 
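For orientation, a toy recursive X-Y cut over a binary Numpy image is sketched below; the actual partitioning in lines2regions is far more involved (it is seeded by the bottom-up line labels and constrained by detected separators), so treat this purely as an illustration of the top-down idea the docstring refers to:

import numpy as np

def xycut(bin_img, min_gap=10, boxes=None, y0=0, x0=0):
    """Recursively split at the widest all-background gap of rows/columns."""
    if boxes is None:
        boxes = []
    if not bin_img.any():
        return boxes  # nothing but background here
    for axis in (0, 1):  # 0: cut between rows, 1: cut between columns
        prof = bin_img.sum(axis=1 - axis)  # foreground per row/column
        gaps = np.flatnonzero(prof == 0)
        runs = np.split(gaps, np.flatnonzero(np.diff(gaps) > 1) + 1) if gaps.size else []
        # keep only gaps strictly inside the box and wide enough to cut
        runs = [r for r in runs if r.size >= min_gap and r[0] > 0 and r[-1] < prof.size - 1]
        if runs:
            cut = max(runs, key=len)
            lo, hi = cut[0], cut[-1] + 1
            if axis == 0:
                xycut(bin_img[:lo], min_gap, boxes, y0, x0)
                xycut(bin_img[hi:], min_gap, boxes, y0 + hi, x0)
            else:
                xycut(bin_img[:, :lo], min_gap, boxes, y0, x0)
                xycut(bin_img[:, hi:], min_gap, boxes, y0, x0 + hi)
            return boxes
    boxes.append((y0, x0, y0 + bin_img.shape[0], x0 + bin_img.shape[1]))
    return boxes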
- page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 - - # aggregate existing regions so their foreground can be ignored - ignore = (page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_GraphicRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - if oplevel == 'page' and overwrite_separators: - page.set_SeparatorRegion([]) - else: - ignore.extend(page.get_SeparatorRegion()) - # prepare reading order - reading_order = dict() - ro = page.get_ReadingOrder() - if ro: - rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() - if rogroup: - page_get_reading_order(reading_order, rogroup) - - # get segments to process / overwrite - if oplevel == 'page': - ignore.extend(page.get_TableRegion()) - regions = list(page.get_TextRegion()) - if regions: - # page is already region-segmented - if overwrite_regions: - LOG.info('removing existing TextRegions in page "%s"', page_id) - # we could remove all other region types as well, - # but this is more flexible (for workflows with - # specialized separator/image/table detectors): - page.set_TextRegion([]) - page.set_ReadingOrder(None) - ro = None - else: - LOG.warning('keeping existing TextRegions in page "%s"', page_id) - ignore.extend(regions) - # create reading order if necessary - if not ro or overwrite_order: - ro = ReadingOrderType() - page.set_ReadingOrder(ro) - rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() - if not rogroup: - # new top-level group - rogroup = OrderedGroupType(id="reading-order") - ro.set_OrderedGroup(rogroup) - # go get TextRegions with TextLines (and SeparatorRegions): - self._process_element(page, ignore, page_image, page_coords, - page_id, file_id, - input_file.pageId, zoom, rogroup=rogroup) - if (not rogroup.get_RegionRefIndexed() and + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + # TODO: also allow grayscale_normalized (try/except?) 
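The removed inline DPI handling above is what the new determine_zoom helper (imported from .common) encapsulates; reconstructed from that logic, it amounts to roughly the following (the actual helper may differ in detail):

def determine_zoom(logger, page_id, dpi_param, page_image_info):
    """Zoom factor relative to the 300 DPI the heuristics are tuned for."""
    if dpi_param > 0:
        return 300.0 / dpi_param
    if page_image_info.resolution != 1:
        dpi = page_image_info.resolution
        if page_image_info.resolutionUnit == 'cm':
            dpi *= 2.54  # pixels per cm to pixels per inch
        logger.info('Page "%s" uses %f DPI', page_id, dpi)
        return 300.0 / dpi
    return 1.0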
+ page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + # aggregate existing regions so their foreground can be ignored + ignore = (page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_ChartRegion() + + page.get_MapRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + if oplevel == 'page' and overwrite_separators: + page.set_SeparatorRegion([]) + else: + ignore.extend(page.get_SeparatorRegion()) + # prepare reading order + reading_order = dict() + ro = page.get_ReadingOrder() + if ro: + rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() + if rogroup: + page_get_reading_order(reading_order, rogroup) + + # get segments to process / overwrite + if oplevel == 'page': + ignore.extend(page.get_TableRegion()) + regions = list(page.get_TextRegion()) + if regions: + # page is already region-segmented + if overwrite_regions: + self.logger.info(f'Removing existing TextRegions in page "{page_id}"', ) + # we could remove all other region types as well, + # but this is more flexible (for workflows with + # specialized separator/image/table detectors): + page.set_TextRegion([]) + page.set_ReadingOrder(None) + ro = None + else: + self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"', ) + ignore.extend(regions) + # create reading order if necessary + if not ro or overwrite_order: + ro = ReadingOrderType() + page.set_ReadingOrder(ro) + rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() + if not rogroup: + # new top-level group + rogroup = OrderedGroupType(id="reading-order") + ro.set_OrderedGroup(rogroup) + if (not rogroup.get_RegionRefIndexed() and not rogroup.get_OrderedGroupIndexed() and not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup - ro.set_OrderedGroup(None) - elif oplevel == 'table': - ignore.extend(page.get_TextRegion()) - regions = list(page.get_TableRegion()) - if not regions: - LOG.warning('Page "%s" contains no table regions', page_id) - for region in regions: - subregions = region.get_TextRegion() - if subregions: - # table is already cell-segmented - if overwrite_regions: - LOG.info('removing existing TextRegions in table "%s"', region.id) - region.set_TextRegion([]) - roelem = reading_order.get(region.id) - # replace by empty group with same index and ref - # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(roelem) - else: - LOG.warning('skipping table "%s" with existing TextRegions', region.id) - continue - # TODO: also allow grayscale_normalized (try/except?) 
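Taken in isolation, the reading-order bootstrap above (for a page without a usable ReadingOrder) comes down to the following sketch, using the same generated PAGE types; the bare PageType here is only a stand-in for the page being processed:

from ocrd_models.ocrd_page import PageType
from ocrd_models.ocrd_page_generateds import (
    ReadingOrderType, OrderedGroupType, RegionRefIndexedType)

page = PageType()  # stand-in for the current page
ro = ReadingOrderType()
page.set_ReadingOrder(ro)
rogroup = OrderedGroupType(id="reading-order")
ro.set_OrderedGroup(rogroup)
index = 0
for region in page.get_TextRegion():  # regions appended in detection order
    rogroup.add_RegionRefIndexed(
        RegionRefIndexedType(regionRef=region.id, index=index))
    index += 1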
- region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - # ignore everything but the current table region - subignore = regions + ignore - subignore.remove(region) - # create reading order group if necessary - roelem = reading_order.get(region.id) - if not roelem: - LOG.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", - page_id, region.id, "no target to add cells to") - elif overwrite_order: - # replace by empty ordered group with same (index and) ref + # schema forbids empty OrderedGroup + ro.set_OrderedGroup(None) + # go get TextRegions with TextLines (and SeparatorRegions): + image = self._process_element(page, ignore, page_image, page_coords, zoom=zoom, rogroup=rogroup) + if image: + result.images.append(image) + return result + + if oplevel == 'table': + ignore.extend(page.get_TextRegion()) + regions = list(page.get_TableRegion()) + if not regions: + self.logger.warning(f'Page "{page_id}" contains no table regions') + for region in regions: + subregions = region.get_TextRegion() + if subregions: + # table is already cell-segmented + if overwrite_regions: + self.logger.info(f'Removing existing TextRegions in table "{region.id}"') + region.set_TextRegion([]) + roelem = reading_order.get(region.id) + # replace by empty group with same index and ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem) - reading_order[region.id] = roelem - elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - LOG.warning("Page '%s' table region '%s' already has an ordered group (%s)", - page_id, region.id, "cells will be appended") - elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - LOG.warning("Page '%s' table region '%s' already has an unordered group (%s)", - page_id, region.id, "cells will not be appended") - roelem = None + reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: - # replace regionRef(Indexed) by group with same index and ref - # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem) - reading_order[region.id] = roelem - # go get TextRegions with TextLines (and SeparatorRegions) - self._process_element(region, subignore, region_image, region_coords, - region.id, file_id + '_' + region.id, - input_file.pageId, zoom, rogroup=roelem) - else: # 'region' - regions = list(page.get_TextRegion()) - # besides top-level text regions, line-segment any table cells, - # and for tables without any cells, add a pseudo-cell - for region in page.get_TableRegion(): - subregions = region.get_TextRegion() - if subregions: - regions.extend(subregions) + self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') + continue + # TODO: also allow grayscale_normalized (try/except?) 
+ region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + # ignore everything but the current table region + subignore = regions + ignore + subignore.remove(region) + # create reading order group if necessary + roelem = reading_order.get(region.id) + if not roelem: + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' is not referenced in reading order " + f"(no target to add cells to)") + elif overwrite_order: + # replace by empty ordered group with same (index and) ref + # (which can then take the cells as subregions) + roelem = page_subgroup_in_reading_order(self.logger, roelem) + reading_order[region.id] = roelem + elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' already has an ordered group " + f"(cells will be appended)") + elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' already has an unordered group " + f"(cells will not be appended)") + roelem = None + else: + # replace regionRef(Indexed) by group with same index and ref + # (which can then take the cells as subregions) + roelem = page_subgroup_in_reading_order(self.logger, roelem) + reading_order[region.id] = roelem + # go get TextRegions with TextLines (and SeparatorRegions) + image = self._process_element( + region, subignore, region_image, region_coords, zoom=zoom, rogroup=roelem) + if image: + result.images.append(image) + else: # 'region' + regions = list(page.get_TextRegion()) + # besides top-level text regions, line-segment any table cells, + # and for tables without any cells, add a pseudo-cell + for region in page.get_TableRegion(): + subregions = region.get_TextRegion() + if subregions: + regions.extend(subregions) + else: + subregion = TextRegionType( + id=f'{region.id}_text', Coords=region.get_Coords(), parent_object_=region) + region.add_TextRegion(subregion) + regions.append(subregion) + if not regions: + self.logger.warning(f'Page "{page_id}" contains no text regions') + for region in regions: + if region.get_TextLine(): + if overwrite_lines: + self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') + region.set_TextLine([]) else: - subregion = TextRegionType(id=region.id + '_text', - Coords=region.get_Coords(), - # as if generated from parser: - parent_object_=region) - region.add_TextRegion(subregion) - regions.append(subregion) - if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) - for region in regions: - if region.get_TextLine(): - if overwrite_lines: - LOG.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) - region.set_TextLine([]) - else: - LOG.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) - ignore.extend(region.get_TextLine()) - # TODO: also allow grayscale_normalized (try/except?) 
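The ignore list collected here is later rasterized in _process_element (below) so that the foreground of already-annotated segments can be suppressed during segmentation; a minimal version of that rasterization, with illustrative names:

import numpy as np
from skimage import draw

def rasterize_segments(shape, polygons):
    """Label image from (x, y) polygon arrays; 0 remains background."""
    labels = np.zeros(shape, int)
    for i, poly in enumerate(polygons):
        # draw.polygon takes row (y) then column (x) coordinates
        rr, cc = draw.polygon(poly[:, 1], poly[:, 0], shape)
        labels[rr, cc] = i + 1  # 1-based, so labels map back to segments
    return labels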
- region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - # if the region images have already been clipped against their neighbours specifically, - # then we don't need to suppress all neighbours' foreground generally here - if 'clipped' in region_coords['features'].split(','): - ignore = [] - # go get TextLines - self._process_element(region, ignore, region_image, region_coords, - region.id, file_id + '_' + region.id, - input_file.pageId, zoom) - - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): + self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') + ignore.extend(region.get_TextLine()) + # TODO: also allow grayscale_normalized (try/except?) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + # if the region images have already been clipped against their neighbours specifically, + # then we don't need to suppress all neighbours' foreground generally here + if 'clipped' in region_coords['features'].split(','): + ignore = [] + # go get TextLines + image = self._process_element(region, ignore, region_image, region_coords, zoom=zoom) + if image: + result.images.append(image) + return result + + def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=None) -> Optional[OcrdPageResultImage]: """Add PAGE layout elements by segmenting an image. Given a PageType, TableRegionType or TextRegionType ``element``, and a corresponding binarized PIL.Image object ``image`` with coordinate metadata ``coords``, run line segmentation with Ocropy. - + If operating on the full page (or table), then also detect horizontal and vertical separators, and aggregate the lines into text regions afterwards. - + Add the resulting sub-segments to the parent ``element``. - + If ``ignore`` is not empty, then first suppress all foreground components in any of those segments' coordinates during segmentation, and if also in full page/table mode, then combine all separators among them with the newly detected separators to guide region segmentation. 
""" - LOG = getLogger('processor.OcropySegment') if not image.width or not image.height: - LOG.warning("Skipping '%s' with zero size", element_id) - return + self.logger.warning(f"Skipping '{element.id}' with zero size") + return None element_array = pil2array(image) - element_bin = np.array(element_array <= midrange(element_array), np.bool) - sep_bin = np.zeros_like(element_bin, np.bool) - ignore_labels = np.zeros_like(element_bin, np.int) + element_bin = np.array(element_array <= midrange(element_array), bool) + sep_bin = np.zeros_like(element_bin, bool) + ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - LOG.debug('masking foreground of %s "%s" for "%s"', - type(segment).__name__[:-4], segment.id, element_id) + self.logger.debug( + f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element.id}"') # mark these segments (e.g. separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -444,17 +524,16 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # negative/above-max indices), either fully or partially, # then this will silently ignore them. The caller does # not need to concern herself with this. + sp_row = segment_polygon[:, 1] + sp_col = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): - sep_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - sep_bin.shape)] = True - ignore_labels[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - ignore_labels.shape)] = i+1 # mapped back for RO + sep_bin[draw.polygon(sp_row, sp_col, sep_bin.shape)] = True + ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i + 1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True report = check_page(element_bin, zoom) + suffix = '.IMG-CLIP' elif isinstance(element, TableRegionType) or ( # sole/congruent text region of a table region? 
element.id.endswith('_text') and @@ -462,39 +541,39 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'table' fullpage = True report = check_region(element_bin, zoom) + suffix = f"{element.id}.IMG-CLIP" else: element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - LOG.info('computing line segmentation for %s "%s"', element_name, element_id) + suffix = f"{element.id}.IMG-CLIP" + element_name_id = f'{element_name} "{element.id}"' + self.logger.info(f'Computing line segmentation for {element_name_id}') # TODO: we should downscale if DPI is large enough to save time try: if report: raise Exception(report) - line_labels, hlines, vlines, images, colseps, scale = compute_segmentation( + line_labels, baselines, seplines, images, colseps, scale = compute_segmentation( # suppress separators and ignored regions for textline estimation # but keep them for h/v-line detection (in fullpage mode): - element_bin, seps=(sep_bin+ignore_labels)>0, + element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread']/zoom*300/72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], maximages=self.parameter['maximages'] if element_name != 'table' else 0, - csminheight=self.parameter['csminheight'], - hlminwidth=self.parameter['hlminwidth']) + csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - LOG.error('Cannot line-segment region "%s": %s', element_id, err) + self.logger.error(f'Cannot line-segment region "{element.id}": {err}') # as a fallback, add a single text line comprising the whole region: - element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) + element.add_TextLine(TextLineType(id=f"{element.id}_line", Coords=element.get_Coords())) else: - LOG.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) - return + self.logger.error(f'Cannot line-segment {element_name_id}: {err}') + return None - LOG.info('Found %d text lines for %s "%s"', - len(np.unique(line_labels)) - 1, - element_name, element_id) + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name_id}') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -503,32 +582,29 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # i.e. 
identical line and region labels # to detect their reading order among the others # (these cannot be split or grouped together with other regions) - line_labels = np.where(line_labels, line_labels+len(ignore), ignore_labels) + line_labels = np.where(line_labels, line_labels + len(ignore), ignore_labels) # suppress separators/images in fg and try to use for partitioning slices - sepmask = np.maximum(np.maximum(hlines, vlines), - np.maximum(sep_bin, images)) + sepmask = np.maximum(sep_bin, np.maximum(seplines > 0, images > 0)) region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - LOG.info('Found %d text regions for %s "%s"', - len(np.unique(region_labels)) - 1, - element_name, element_id) + self.logger.info( + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name_id}') except Exception as err: - LOG.error('Cannot region-segment %s "%s": %s', - element_name, element_id, err) + self.logger.error(f'Cannot region-segment {element_name_id}: {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) - + # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): index = 0 - # start counting from largest existing index + # start counting from the largest existing index for elem in (rogroup.get_RegionRefIndexed() + rogroup.get_OrderedGroupIndexed() + rogroup.get_UnorderedGroupIndexed()): @@ -540,7 +616,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_no = 0 for region_label in np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -549,13 +625,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - "region label %d has both existing regions and new lines (%s)" % ( - region_label, str(region_line_labels0)) + (f'Region label "{region_label}" has both existing regions and new lines ' + f'({str(region_line_labels0)})') region = ignore[region_line_labels0[0] - 1] - if rogroup and region.parent_object_ == element and not isinstance(region, SeparatorRegionType): + if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - LOG.debug('Region label %d is for ignored region "%s"', - region_label, region.id) + self.logger.debug(f'Region label "{region_label}" is for ignored region "{region.id}"') continue # normal case: new lines inside new regions # remove binary-empty labels, and re-order locally @@ -563,160 +638,139 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, - seps=np.maximum(sepmask, 
colseps)) + region_line_labels = hmerge_line_seeds( + element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(region_mask * region_label, element_bin, - '%s "%s"' % (element_name, element_id), - min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons( + self.logger, region_mask * region_label, None, element_bin, name=element_name_id, + min_area=6000 / zoom / zoom, simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(region_line_labels, element_bin, - 'region "%s"' % element_id, - min_area=640/zoom/zoom) + lines, _ = masks2polygons( + self.logger, region_line_labels, baselines, element_bin, name=f'region "{element.id}"', + min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) - line_polys = [Polygon(polygon) for _, polygon in lines] - for _, region_polygon in regions: + line_polys = [Polygon(polygon) for _, polygon, _ in lines] + for _, region_polygon, _ in regions: region_poly = prep(Polygon(region_polygon)) # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(region_polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for region label %d', region_label) + self.logger.warning(f'Ignoring extant region contour for region label {region_label}') continue # annotate result: region_no += 1 - region_id = element_id + "_region%04d" % region_no - LOG.debug('Region label %d becomes ID "%s"', region_label, region_id) - region = TextRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon))) + region_id = f"{element.id}_region%04d" % region_no + self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') + region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue - line_label, line_polygon = lines[i] + line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(line_polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, region) if line_polygon is None: - LOG.warning('Ignoring extant line contour for region label %d line label %d', - region_label, line_label) + self.logger.warning( + f'Ignoring extant line contour for region label {region_label} line label {line_label}') continue # annotate result: line_no += 1 - line_id = region_id + "_line%04d" % line_no - LOG.debug('Line label %d becomes ID "%s"', line_label, line_id) - line = TextLineType( - id=line_id, Coords=CoordsType( - points=points_from_polygon(line_polygon))) + line_id = f"{region_id}_line%04d" % line_no + self.logger.debug(f'Line label {line_label} becomes ID "{line_id}"') + line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) + if line_baseline: + line_baseline = coordinates_for_segment(line_baseline, image, coords) + line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) region.add_TextLine(line) # if the region has received text lines, keep it if 
region.get_TextLine(): element.add_TextRegion(region) - LOG.info('Added region "%s" with %d lines for %s "%s"', - region_id, line_no, element_name, element_id) + self.logger.info(f'Added region "{region_id}" with {line_no} lines for {element_name_id}') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - image_labels, num_images = morph.label(images) - LOG.info('Found %d large non-text/image regions for %s "%s"', - num_images, element_name, element_id) + self.logger.info(f'Found {images.max()} large image regions for {element_name_id}') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(image_labels, element_bin, - '%s "%s"' % (element_name, element_id)) - for image_label, polygon in image_polygons: + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, name=element_name_id) + for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for image label %d', image_label) + self.logger.warning(f'Ignoring extant region contour for image label {image_label}') continue region_no += 1 # annotate result: - region_id = element_id + "_image%04d" % region_no + region_id = f"{element.id}_image%04d" % region_no element.add_ImageRegion(ImageRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon)))) - # split rulers into separator regions: - hline_labels, num_hlines = morph.label(hlines) - vline_labels, num_vlines = morph.label(vlines) - LOG.info('Found %d/%d h/v-lines for %s "%s"', - num_hlines, num_vlines, element_name, element_id) + id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) + # split detected separator labels into separator regions: + self.logger.info(f'Found {seplines.max()} separators for {element_name_id}') # find contours around region labels (can be non-contiguous): - hline_polygons, _ = masks2polygons(hline_labels, element_bin, - '%s "%s"' % (element_name, element_id)) - vline_polygons, _ = masks2polygons(vline_labels, element_bin, - '%s "%s"' % (element_name, element_id)) - for _, polygon in hline_polygons + vline_polygons: + sep_polygons, _ = masks2polygons( + self.logger, seplines, None, element_bin, name=element_name_id, open_holes=True, reorder=False) + for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for separator') + self.logger.warning(f'Ignoring extant region contour for separator {sep_label}') continue # annotate result: region_no += 1 - region_id = element_id + "_sep%04d" % region_no + region_id = f"{element.id}_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon)))) + id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image - element_array[sepmask] = np.amax(element_array) # clip to white/bg + element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) - 
file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', - page_id=page_id, - file_grp=self.output_file_grp) - element.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=coords['features'] + ',clipped')) + image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') + element.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_clipped, suffix, image_ref) else: # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) - region_mask = np.zeros_like(element_bin, np.bool) - region_mask[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - region_mask.shape)] = True + region_mask = np.zeros_like(element_bin, bool) + region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(line_labels, element_bin, - 'region "%s"' % element_id, - min_area=640/zoom/zoom) + line_polygons, _ = masks2polygons( + self.logger, line_labels, baselines, element_bin, + name=f'region "{element.id}"', min_area=640 / zoom / zoom) line_no = 0 - for line_label, polygon in line_polygons: + for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, element) if line_polygon is None: - LOG.warning('Ignoring extant line contour for line label %d', - line_label) + self.logger.warning(f'Ignoring extant line contour for line label {line_label}') continue # annotate result: line_no += 1 - line_id = element_id + "_line%04d" % line_no - element.add_TextLine(TextLineType( - id=line_id, Coords=CoordsType( - points=points_from_polygon(line_polygon)))) + line_id = f"{element.id}_line%04d" % line_no + line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) + if baseline: + line_baseline = coordinates_for_segment(baseline, image, coords) + line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) + element.add_TextLine(line) if not sep_bin.any(): - return # no derived image + return None # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) - file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', - page_id=page_id, - file_grp=self.output_file_grp) - # update PAGE (reference the image file): - element.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=coords['features'] + ',clipped')) + image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') + element.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_clipped, suffix, image_ref) def polygon_for_parent(polygon, parent): """Clip polygon to parent polygon range. - + (Should be moved to ocrd_utils.coordinates_for_segment.) 
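The clipping strategy implemented below (together with make_intersection) can be tried standalone with shapely (assuming shapely >= 2; the coordinates are made up):

from shapely.geometry import Polygon
from shapely.ops import unary_union

child = Polygon([(0, 0), (120, 0), (120, 50), (0, 50)])
parent = Polygon([(10, -10), (100, -10), (100, 100), (10, 100)])
inter = child.intersection(parent)
if inter.geom_type == 'GeometryCollection':
    # heterogeneous result: drop zero-area pieces (Points, LineStrings)
    inter = unary_union([geom for geom in inter.geoms if geom.area > 0])
print(inter.bounds)  # (10.0, 0.0, 100.0, 50.0)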
childp = Polygon(polygon) @@ -751,37 +805,185 @@ def make_intersection(poly1, poly2): # post-process if interp.is_empty or interp.area == 0.0: return None - if interp.type == 'GeometryCollection': + if interp.geom_type == 'GeometryCollection': # heterogeneous result: filter zero-area shapes (LineString, Point) interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) - if interp.type == 'MultiPolygon': + if interp.geom_type == 'MultiPolygon': # homogeneous result: construct convex hull to connect - # FIXME: construct concave hull / alpha shape - interp = interp.convex_hull + interp = join_polygons(interp.geoms) if interp.minimum_clearance < 1.0: # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity - interp = asPolygon(np.round(interp.exterior.coords)) + interp = Polygon(np.round(interp.exterior.coords)) interp = make_valid(interp) return interp def make_valid(polygon): - for split in range(1, len(polygon.exterior.coords)-1): + points = list(polygon.exterior.coords) + for split in range(1, len(points)): if polygon.is_valid or polygon.simplify(polygon.area).is_valid: break # simplification may not be possible (at all) due to ordering # in that case, try another starting point - polygon = Polygon(polygon.exterior.coords[-split:]+polygon.exterior.coords[:-split]) - for tolerance in range(1, int(polygon.area)): + polygon = Polygon(points[-split:]+points[:-split]) + for tolerance in range(int(polygon.area)): if polygon.is_valid: break # simplification may require a larger tolerance - polygon = polygon.simplify(tolerance) + polygon = polygon.simplify(tolerance + 1) return polygon +def diff_polygons(poly1, poly2): + poly = poly1.difference(poly2) + if poly.geom_type == 'MultiPolygon': + poly = poly.convex_hull + if poly.minimum_clearance < 1.0: + poly = Polygon(np.round(poly.exterior.coords)) + poly = make_valid(poly) + return poly + +def join_polygons(polygons, loc='', scale=20): + """construct concave hull (alpha shape) from input polygons""" + # compoundp = unary_union(polygons) + # jointp = compoundp.convex_hull + polygons = list(itertools.chain.from_iterable([ + poly.geoms if poly.geom_type in ['MultiPolygon', 'GeometryCollection'] + else [poly] + for poly in polygons])) + npoly = len(polygons) + if npoly == 1: + return polygons[0] + # find min-dist path through all polygons (travelling salesman) + pairs = itertools.combinations(range(npoly), 2) + dists = np.eye(npoly, dtype=float) + for i, j in pairs: + dist = polygons[i].distance(polygons[j]) + if dist < 1e-5: + dist = 1e-5 # if pair merely touches, we still need to get an edge + dists[i, j] = dist + dists[j, i] = dist + dists = minimum_spanning_tree(dists, overwrite=True) + # add bridge polygons (where necessary) + max_dist = max(1.0, scale / 5) + for prevp, nextp in zip(*dists.nonzero()): + prevp = polygons[prevp] + nextp = polygons[nextp] + nearest = nearest_points(prevp, nextp) + bridgep = LineString(nearest).buffer(max_dist, resolution=1) + polygons.append(bridgep) + jointp = unary_union(polygons) + assert jointp.geom_type == 'Polygon', jointp.wkt + if jointp.minimum_clearance < 1.0: + # follow-up calculations will necessarily be integer; + # so anticipate rounding here and then ensure validity + jointp = Polygon(np.round(jointp.exterior.coords)) + jointp = make_valid(jointp) + return jointp + +def join_baselines(logger: Logger, baselines, loc=''): + lines = [] + for baseline in baselines: + if (baseline.is_empty or + baseline.geom_type in ['Point', 
'MultiPoint']): + continue + elif baseline.geom_type == 'MultiLineString': + lines.extend(baseline.geoms) + elif baseline.geom_type == 'LineString': + lines.append(baseline) + elif baseline.geom_type == 'GeometryCollection': + for geom in baseline.geoms: + if geom.geom_type == 'LineString': + lines.append(geom) + elif geom.geom_type == 'MultiLineString': + lines.extend(geom.geoms) + else: + logger.warning(f"Ignoring baseline subtype {geom.geom_type} in {loc}") + else: + logger.warning(f"Ignoring baseline type {baseline.geom_type} in {loc}") + nlines = len(lines) + if nlines == 0: + return None + elif nlines == 1: + return lines[0] + # Shapely cannot reorder: + #result = line_merge(MultiLineString([line.normalize() for line in lines])) + # find min-dist path through all lines (travelling salesman) + pairs = itertools.combinations(range(nlines), 2) + dists = np.eye(nlines, dtype=float) + for i, j in pairs: + dist = lines[i].distance(lines[j]) + if dist < 1e-5: + dist = 1e-5 # if pair merely touches, we still need to get an edge + dists[i, j] = dist + dists[j, i] = dist + dists = minimum_spanning_tree(dists, overwrite=True) + assert dists.nonzero()[0].size, dists + # get path + chains = [] + for prevl, nextl in zip(*dists.nonzero()): + foundchains = [] + for chain in chains: + if chain[0] == prevl: + found = chain, 0, nextl + elif chain[0] == nextl: + found = chain, 0, prevl + elif chain[-1] == prevl: + found = chain, -1, nextl + elif chain[-1] == nextl: + found = chain, -1, prevl + else: + continue + foundchains.append(found) + if len(foundchains): + assert len(foundchains) <= 2, foundchains + chain, pos, node = foundchains.pop() + if len(foundchains): + otherchain, otherpos, othernode = foundchains.pop() + assert node != othernode + assert chain[pos] == othernode + assert otherchain[otherpos] == node + if pos < 0 and otherpos < 0: + chain.extend(reversed(otherchain)) + chains.remove(otherchain) + elif pos < 0 and otherpos == 0: + chain.extend(otherchain) + chains.remove(otherchain) + elif pos == 0 and otherpos == 0: + otherchain.extend(reversed(chain)) + chains.remove(chain) + elif pos == 0 and otherpos < 0: + otherchain.extend(chain) + chains.remove(chain) + elif pos < 0: + chain.append(node) + else: + chain.insert(0, node) + else: + chains.append([prevl, nextl]) + if len(chains) > 1: + logger.warning(f"Baseline merge impossible (no spanning tree) in {loc}") + return None + assert len(chains) == 1, chains + assert len(chains[0]) == nlines, chains[0] + path = chains[0] + # get points + coords = [] + for node in path: + line = lines[node] + coords.extend(line.normalize().coords) + result = LineString(coords) + if result.is_empty: + logger.warning(f"Baseline merge is empty in {loc}") + return None + assert result.geom_type == 'LineString', result.wkt + result = set_precision(result, 1.0) + if result.geom_type != 'LineString' or not result.is_valid: + result = LineString(np.round(coords)) + return result + def page_get_reading_order(ro, rogroup): """Add all elements from the given reading order group to the given dictionary. - + Given a dict ``ro`` from layout element IDs to ReadingOrder element objects, and an object ``rogroup`` with additional ReadingOrder element objects, add all references to the dict, traversing the group recursively. @@ -801,10 +1003,10 @@ def page_get_reading_order(ro, rogroup): def page_add_to_reading_order(rogroup, region_id, index=None): """Add a region reference to an un/ordered RO group. 
- + Given a ReadingOrder group ``rogroup`` (of any type), append a reference to region ``region_id`` to it. - + If ``index`` is given, use that as position and return incremented by one. (This must be an integer if ``rogroup`` is an OrderedGroup(Indexed). @@ -812,65 +1014,56 @@ def page_add_to_reading_order(rogroup, region_id, index=None): """ if rogroup: if index is None: - rogroup.add_RegionRef(RegionRefType( - regionRef=region_id)) + rogroup.add_RegionRef(RegionRefType(regionRef=region_id)) else: - rogroup.add_RegionRefIndexed(RegionRefIndexedType( - regionRef=region_id, index=index)) + rogroup.add_RegionRefIndexed(RegionRefIndexedType(regionRef=region_id, index=index)) index += 1 return index -def page_subgroup_in_reading_order(roelem): +def page_subgroup_in_reading_order(logger: Logger, roelem): """Replace given RO element by an equivalent OrderedGroup. - + Given a ReadingOrder element ``roelem`` (of any type), first look up its parent group. Remove it from the respective member list (of its region refs or un/ordered groups), even if it already was an OrderedGroup(Indexed). - + Then instantiate an empty OrderedGroup(Indexed), referencing the same region as ``roelem`` (and using the same index, if any). Add that group to the parent instead. - + Return the new group object. """ - LOG = getLogger('processor.OcropySegment') if not roelem: - LOG.error('Cannot subgroup from empty ReadingOrder element') + logger.error('Cannot subgroup from empty ReadingOrder element') return roelem if not roelem.parent_object_: - LOG.error('Cannot subgroup from orphan ReadingOrder element') + logger.error('Cannot subgroup from orphan ReadingOrder element') return roelem - if isinstance(roelem, (OrderedGroupType,OrderedGroupIndexedType)) and not ( + if isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)) and not ( roelem.get_OrderedGroupIndexed() or roelem.get_UnorderedGroupIndexed() or roelem.get_RegionRefIndexed()): # is already a group and still empty return roelem - if isinstance(roelem, (OrderedGroupType, - UnorderedGroupType, - RegionRefType)): + if isinstance(roelem, (OrderedGroupType, UnorderedGroupType, RegionRefType)): getattr(roelem.parent_object_, { OrderedGroupType: 'get_OrderedGroup', UnorderedGroupType: 'get_UnorderedGroup', RegionRefType: 'get_RegionRef', }.get(roelem.__class__))().remove(roelem) - roelem2 = OrderedGroupType(id=roelem.regionRef + '_group', - regionRef=roelem.regionRef) + roelem2 = OrderedGroupType(id=f"{roelem.regionRef}_group", regionRef=roelem.regionRef) roelem.parent_object_.add_OrderedGroup(roelem2) roelem2.parent_object_ = roelem.parent_object_ return roelem2 - if isinstance(roelem, (OrderedGroupIndexedType, - UnorderedGroupIndexedType, - RegionRefIndexedType)): + if isinstance(roelem, (OrderedGroupIndexedType, UnorderedGroupIndexedType, RegionRefIndexedType)): getattr(roelem.parent_object_, { OrderedGroupIndexedType: 'get_OrderedGroupIndexed', UnorderedGroupIndexedType: 'get_UnorderedGroupIndexed', RegionRefIndexedType: 'get_RegionRefIndexed' }.get(roelem.__class__))().remove(roelem) - roelem2 = OrderedGroupIndexedType(id=roelem.regionRef + '_group', - index=roelem.index, - regionRef=roelem.regionRef) + roelem2 = OrderedGroupIndexedType( + id=f"{roelem.regionRef}_group", index=roelem.index, regionRef=roelem.regionRef) roelem.parent_object_.add_OrderedGroupIndexed(roelem2) roelem2.parent_object_ = roelem.parent_object_ return roelem2 diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 4427d47c..78302f12 100644 --- a/ocrd_cis/ocropy/train.py 
+++ b/ocrd_cis/ocropy/train.py @@ -1,207 +1,134 @@ from __future__ import absolute_import -import sys, os.path, cv2 -from ocrd_modelfactory import page_from_file -from ocrd import Processor -from ocrd_utils import getLogger -from ocrd_cis import get_ocrd_tool - -sys.path.append(os.path.dirname(os.path.abspath(__file__))) +from typing import Optional +from logging import Logger +from sys import exit +from os import makedirs, remove +from os.path import abspath, dirname, exists, join, isfile +from ocrd_models import OcrdPage +from ocrd import Processor, Workspace, OcrdPageResult from .ocropus_rtrain import * - -np.seterr(divide='raise',over='raise',invalid='raise',under='ignore') - - - - -def bounding_box(coord_points): - point_list = [[int(p) for p in pair.split(',')] for pair in coord_points.split(' ')] - x_coordinates, y_coordinates = zip(*point_list) - return (min(x_coordinates), min(y_coordinates), max(x_coordinates), max(y_coordinates)) +from .binarize import binarize def deletefiles(filelist): for file in filelist: - if os.path.exists(file): - os.remove(file) - if os.path.exists(file[:-3]+'gt.txt'): - os.remove(file[:-3]+'gt.txt') + if exists(file): + remove(file) + if exists(file[:-3] + 'gt.txt'): + remove(file[:-3] + 'gt.txt') def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) wsize = int((float(image.size[0] * float(hpercent)))) - image = image.resize((wsize, baseheight), Image.ANTIALIAS) + image = image.resize((wsize, baseheight), Image.LANCZOS) return image -def binarize(pil_image): - # Convert RGB to OpenCV - img = cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2GRAY) - - # global thresholding - #ret1,th1 = cv2.threshold(img,127,255,cv2.THRESH_BINARY) - - # Otsu's thresholding - #ret2,th2 = cv2.threshold(img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - - # Otsu's thresholding after Gaussian filtering - blur = cv2.GaussianBlur(img,(5,5),0) - ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - - bin_img = Image.fromarray(th3) - return bin_img - - - class OcropyTrain(Processor): + modelpath: str + outputpath: str - def __init__(self, *args, **kwargs): - self.log = getLogger('OcropyTrain') - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-ocropy-train'] - kwargs['version'] = ocrd_tool['version'] - super(OcropyTrain, self).__init__(*args, **kwargs) - - - def process(self): - """ - Performs the training - """ - #print(self.parameter) - if self.parameter['textequiv_level'] not in ['line', 'word', 'glyph']: - raise Exception("currently only implemented at the line/glyph level") - - filepath = os.path.dirname(os.path.abspath(__file__)) - - - + @property + def executable(self): + return 'ocrd-cis-ocropy-train' + def setup(self): if 'model' in self.parameter: model = self.parameter['model'] - modelpath = filepath + '/models/' + model + '.gz' - outputpath = filepath + '/output/' + model - if 'outputpath' in self.parameter: - outputpath = self.parameter + '/' + model - if os.path.isfile(modelpath) == False: - raise Exception("configured model " + model + " is not in models folder") + try: + self.modelpath = self.resolve_resource(model) + except SystemExit: + ocropydir = dirname(abspath(__file__)) + self.modelpath = join(ocropydir, 'models', model) + self.logger.error(f"Failed to resolve model '{model}' path, trying '{self.modelpath}'") + if not isfile(self.modelpath): + self.logger.critical(f"Could not find model '{model}'.\n" + f"Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {model}'") + 
exit(1) + self.outputpath = join(self.parameter.get('outputpath', 'output'), model) else: - modelpath = None - outputpath = filepath + '/output/' + 'lstm' - if 'outputpath' in self.parameter: - outputpath = self.parameter + '/' +'lstm' - - if 'ntrain' in self.parameter: - ntrain = self.parameter['ntrain'] - - - - filelist = [] - - #self.log.info("Using model %s in %s for recognition", model) - for (n, input_file) in enumerate(self.input_files): - #self.log.info("INPUT FILE %i / %s", n, input_file) - pcgts = page_from_file(self.workspace.download_file(input_file)) - pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) - - - self.log.info("page %s", pcgts) - for region in pcgts.get_Page().get_TextRegion(): - textlines = region.get_TextLine() - self.log.info("About to extract %i lines in region '%s'", len(textlines), region.id) - for line in textlines: - - if self.parameter['textequiv_level'] == 'line': - self.log.debug("Extracting line '%s'", line.id) - - #get box from points - box = bounding_box(line.get_Coords().points) - - #crop word from page - croped_image = pil_image.crop(box=box) - - #binarize with Otsu's thresholding after Gaussian filtering - bin_image = binarize(croped_image) - - #resize image to 48 pixel height - final_img = resize_keep_ratio(bin_image) - - #save temp image - path = os.path.join(filepath, 'temp', str(input_file.ID) + str(region.id) + str(line.id)) - imgpath = path + '.png' - final_img.save(imgpath) - - filelist.append(imgpath) - - #ground truth - gt = line.get_TextEquiv()[0].Unicode.strip() - gtpath = path + '.gt.txt' - with open(gtpath, "w", encoding='utf-8') as f: - f.write(gt) - - - - if self.parameter['textequiv_level'] == 'word' or 'glyph': - for word in line.get_Word(): - - if self.parameter['textequiv_level'] == 'word': - self.log.debug("Extracting word '%s'", word.id) - - #get box from points - box = bounding_box(word.get_Coords().points) - - #crop word from page - croped_image = pil_image.crop(box=box) - - #binarize with Otsu's thresholding after Gaussian filtering - bin_image = binarize(croped_image) - - #resize image to 48 pixel height - final_img = resize_keep_ratio(bin_image) - - #save temp image - path = os.path.join(filepath, 'temp', str(input_file.ID) + str(region.id) + str(line.id) + str(word.id)) - imgpath = path + '.png' - final_img.save(imgpath) - - filelist.append(imgpath) - - #ground truth - gt = word.get_TextEquiv()[0].Unicode.strip() - gtpath = path + '.gt.txt' - - with open(gtpath, "w", encoding='utf-8') as f: - f.write(gt) - - else: - for glyph in word.get_Glyph(): - self.log.debug("Extracting glyph '%s'", glyph.id) - - #get box from points - box = bounding_box(glyph.get_Coords().points) - - #crop word from page - croped_image = pil_image.crop(box=box) - - #binarize with Otsu's thresholding after Gaussian filtering - bin_image = binarize(croped_image) - - #resize image to 48 pixel height - final_img = resize_keep_ratio(bin_image) - - #save temp image - path = os.path.join(filepath, 'temp', str(input_file.ID) + str(region.id) + str(line.id) + str(word.id) + str(glyph.id)) - imgpath = path + '.png' - final_img.save(imgpath) - - filelist.append(imgpath) - - #ground truth - gt = glyph.get_TextEquiv()[0].Unicode.strip() - with open(gtpath, "w", encoding='utf-8') as f: - f.write(gt) + self.modelpath = None + self.outputpath = join(self.parameter.get('outputpath', 'output'), 'lstm') + makedirs(dirname(self.outputpath)) + self.filelist = None + def process_workspace(self, workspace: Workspace) -> None: + """ + Trains a 
+    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
+        """
+        Extracts pairs of plaintext and cropped image files for each text line
+        in the PAGE file (to be used during training).
+        """
+        pcgts = input_pcgts[0]
+        #self.logger.info("Using model %s in %s for recognition", model)
+        page = pcgts.get_Page()
+        page_image, page_coords, _ = self.workspace.image_from_page(page, page_id)
+
+        self.logger.debug(f"Extracting from page '{page_id}'")
+        for region in page.get_AllRegions(classes=['Text']):
+            textlines = region.get_TextLine()
+            self.logger.debug(f"Extracting {len(textlines)} lines from region '{region.id}'")
+            for line in textlines:
+                if self.parameter['textequiv_level'] == 'line':
+                    path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}")
+                    self.filelist.append(self.extract_segment(path, line, page_image, page_coords))
+                    continue
+                for word in line.get_Word():
+                    if self.parameter['textequiv_level'] == 'word':
+                        path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}_{word.id}")
+                        self.filelist.append(self.extract_segment(path, word, page_image, page_coords))
+                        continue
+                    for glyph in word.get_Glyph():
+                        path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}_{word.id}_{glyph.id}")
+                        self.filelist.append(self.extract_segment(path, glyph, page_image, page_coords))
+        # FIXME: PAGE-XML not really needed, find a way around this (raising special exception?)
+        return OcrdPageResult(pcgts)
+
+    def extract_segment(self, path, segment, page_image, page_coords):
+        gtpath = path + '.gt.txt'
+        imgpath = path + '.png'
+        if exists(gtpath) and exists(imgpath):
+            self.logger.debug(f"Reusing {segment.__class__.__name__} '{segment.id}' file pair")
+            return imgpath
+
+        gt = segment.TextEquiv
+        if not gt:
+            return None
+        gt = gt[0].Unicode
+        if not gt or not gt.strip():
+            return None
+        gt = gt.strip()
+        with open(gtpath, "w", encoding='utf-8') as f:
+            f.write(gt)
+
+        self.logger.debug(f"Extracting {segment.__class__.__name__} '{segment.id}' file pair")
+        image, coords = self.workspace.image_from_segment(segment, page_image, page_coords)
+
+        if 'binarized' not in coords['features'].split(','):
+            # binarize with nlbin
+            image, _ = binarize(self.logger, image, maxskew=0)
+
+        # resize image to 48 pixel height
+        image = resize_keep_ratio(image)
+
+        image.save(imgpath)
+
+        return imgpath
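The `resize_keep_ratio` helper called in `extract_segment` above is not part of this diff (it lives in the Ocropy utilities of ocrd_cis). A minimal PIL-based sketch of what such a helper can look like, assuming the 48 px target height named at the call site; the shipped implementation may differ:

    from PIL import Image

    def resize_keep_ratio(image: Image.Image, baseheight: int = 48) -> Image.Image:
        """Scale to a fixed line height, keeping the aspect ratio (assumed behaviour)."""
        scale = baseheight / image.height
        width = max(1, round(image.width * scale))
        return image.resize((width, baseheight), Image.LANCZOS)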
diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py
index a5125b8d..70918de7 100644
--- a/ocrd_cis/postcorrect/cli.py
+++ b/ocrd_cis/postcorrect/cli.py
@@ -1,46 +1,72 @@
 from __future__ import absolute_import
-import click
-import json
 import os
-from ocrd import Processor
-from ocrd.decorators import ocrd_cli_options
-from ocrd.decorators import ocrd_cli_wrap_processor
-from ocrd_utils import getLogger
-from ocrd_models.ocrd_mets import OcrdMets
+import json
+
+import click
+
+from ocrd import Processor, Workspace
+from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
+from ocrd_utils import getLevelName, pushd_popd
+from ocrd_models import OcrdMets
+
 from ocrd_cis import JavaPostCorrector
-from ocrd_cis import get_ocrd_tool
-LOG_LEVEL = 'INFO'

 @click.command()
 @ocrd_cli_options
 def ocrd_cis_postcorrect(*args, **kwargs):
-    if 'log_level' in kwargs and kwargs['log_level']:
-        global LOG_LEVEL
-        LOG_LEVEL = kwargs['log_level']
     return ocrd_cli_wrap_processor(PostCorrector, *args, **kwargs)

 class PostCorrector(Processor):
-    def __init__(self, *args, **kwargs):
-        ocrd_tool = get_ocrd_tool()
-        kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-postcorrect']
-        kwargs['version'] = ocrd_tool['version']
-        super(PostCorrector, self).__init__(*args, **kwargs)
-        self.log = getLogger('cis.Processor.PostCorrector')
-
-    def process(self):
-        ifgs = self.input_file_grp.split(",") # input file groups
-        ofg = self.output_file_grp
+    @property
+    def executable(self):
+        return 'ocrd-cis-postcorrect'
+
+    def setup(self):
+        # since ocrd v3.0 we cannot overwrite self.parameter anymore
+        # because that gets validated against the schema
+        # (so these additions would fail)
+        self.params = dict(self.parameter)
         profiler = {}
         profiler["path"] = self.parameter["profilerPath"]
         profiler["config"] = self.parameter["profilerConfig"]
         profiler["noCache"] = True
-        self.parameter["profiler"] = profiler
-        self.parameter["runDM"] = True
-        metspath = os.path.join(self.workspace.directory, "mets.xml")
-        print(json.dumps(self.parameter, indent=4))
-        p = JavaPostCorrector(metspath, ",".join(ifgs), ofg, self.parameter, LOG_LEVEL)
-        p.exe()
-        # reload the mets file to prevent it from overriding the
-        # updated version from the java process
-        self.workspace.mets = OcrdMets(filename=metspath)
+        self.params["profiler"] = profiler
+        self.params["runDM"] = True
+        self.logger.debug(json.dumps(self.params, indent=4))
+
+    def process_workspace(self, workspace: Workspace):
+        with pushd_popd(workspace.directory):
+            self.workspace = workspace
+            self.verify()
+            # ensure that input files are referenced in on-disk METS
+            self.workspace.save_mets()
+            # this CLI call mimics the OCR-D processor CLI itself
+            # we have no control over its interior
+            # (we get no page-wise error handling and input downloading)
+            p = JavaPostCorrector(self.workspace.mets_target,
+                                  self.input_file_grp,
+                                  self.output_file_grp,
+                                  self.params,
+                                  getLevelName(self.logger.getEffectiveLevel()))
+            p.exe()
+            # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output):
+            # We cannot do that with this method, because our self.workspace.mets might be
+            # a ClientSideOcrdMets, which does not allow modifying or removing files:
+            # for output_file in self.workspace.find_files(file_grp=self.output_file_grp):
+            #     flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat')
+            #     flocat.attrib['LOCTYPE'] = 'OTHER'
+            #     flocat.attrib['OTHERLOCTYPE'] = 'FILE'
+            #     output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory)
+            # So instead, let's post-process the local METS file result directly:
+            mets = OcrdMets(filename=self.workspace.mets_target)
+            for output_file in mets.find_files(fileGrp=self.output_file_grp):
+                flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat')
+                flocat.attrib['LOCTYPE'] = 'OTHER'
+                flocat.attrib['OTHERLOCTYPE'] = 'FILE'
+                output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory)
+            with open(self.workspace.mets_target, 'w') as f:
+                f.write(mets.to_xml(xmllint=True).decode('utf-8'))
+            # reload the mets file to prevent run_processor's save_mets
+            # from overriding the results from the Java process
+            self.workspace.reload_mets()
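Note the log-level hand-off above: instead of the old module-global `LOG_LEVEL`, the Java subprocess now receives the effective level of the processor's own logger, rendered as a name via `getLevelName`. A stdlib-only sketch of the same mapping (assuming `ocrd_utils.getLevelName` behaves like `logging.getLevelName` for numeric levels; the logger name below is hypothetical):

    import logging

    logger = logging.getLogger('ocrd.processor.PostCorrector')  # hypothetical name
    logger.setLevel(logging.DEBUG)
    # getEffectiveLevel() returns 10 here; getLevelName() turns that back into
    # 'DEBUG', the string handed to the JavaPostCorrector subprocess above
    assert logging.getLevelName(logger.getEffectiveLevel()) == 'DEBUG'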
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..6432dd27
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,107 @@
+[build-system]
+requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"]
+
+[project]
+name = "ocrd_cis"
+authors = [
+    {name = "Florian Fink", email = "finkf@cis.lmu.de"},
+    {name = "Tobias Englmeier", email = "englmeier@cis.lmu.de"},
+    {name = "Christoph Weber", email = "web_chris@msn.com"},
+    {name = "Robert Sachunsky", email = "sachunsky@informatik.uni-leipzig.de"},
+]
+description = "CIS OCR-D post-correction tools and improved Ocropy"
+readme = "README.md"
+license = {text = "MIT"}
+requires-python = ">=3.8"
+keywords = ["ocr", "ocr-d", "ocropus-ocr", "post-correction"]
+
+dynamic = ["version", "dependencies"]
+
+# https://pypi.org/classifiers/
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Environment :: Console",
+    "Intended Audience :: Science/Research",
+    "Intended Audience :: Other Audience",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "Topic :: Text Processing",
+]
+
+[project.scripts]
+ocrd-cis-align = "ocrd_cis.align.cli:ocrd_cis_align"
+ocrd-cis-postcorrect = "ocrd_cis.postcorrect.cli:ocrd_cis_postcorrect"
+ocrd-cis-data = "ocrd_cis.data.__main__:main"
+ocrd-cis-ocropy-train = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_train"
+ocrd-cis-ocropy-recognize = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_recognize"
+ocrd-cis-ocropy-segment = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_segment"
+ocrd-cis-ocropy-resegment = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_resegment"
+ocrd-cis-ocropy-clip = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_clip"
+ocrd-cis-ocropy-dewarp = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_dewarp"
+ocrd-cis-ocropy-deskew = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_deskew"
+ocrd-cis-ocropy-denoise = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_denoise"
+ocrd-cis-ocropy-binarize = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_binarize"
+
+[project.urls]
+Homepage = "https://github.com/cisocrgroup/ocrd_cis"
+Repository = "https://github.com/cisocrgroup/ocrd_cis.git"
+
+[project.optional-dependencies]
+debug = ["matplotlib>3.0.0"]
+
+[tool.setuptools.dynamic]
+dependencies = {file = ["requirements.txt"]}
+
+[tool.setuptools]
+packages = ["ocrd_cis", "ocrd_cis.postcorrect", "ocrd_cis.aio", "ocrd_cis.data", "ocrd_cis.wer", "ocrd_cis.ocropy", "ocrd_cis.ocropy.ocrolib", "ocrd_cis.div", "ocrd_cis.align"]
+package-data = {"*" = ["*.json", "*.jar", "model.zip", "3gs.csv.gz"]}
+
+[tool.pytest.ini_options]
+minversion = 6.0
+addopts = "--strict-markers"
+markers = [
+    "integration: integration tests",
+]
+
+
+[tool.mypy]
+plugins = ["numpy.typing.mypy_plugin"]
+
+ignore_missing_imports = true
+
+
+strict = true
+
+disallow_subclassing_any = false
+# ❗ error: Class cannot subclass "Processor" (has type "Any")
+disallow_any_generics = false
+disallow_untyped_defs = false
+disallow_untyped_calls = false
+
+
+[tool.ruff.lint]
+select = ["E", "F", "I"]
+
+
+[tool.coverage.run]
+branch = true
+source = [
+    "ocrd_cis"
+]
+concurrency = [
+    "thread",
+    "multiprocessing"
+]
+
+[tool.coverage.report]
+exclude_also = [
+    "if self\\.debug",
+    "pragma: no cover",
+    "raise NotImplementedError",
+    "if __name__ == .__main__.:",
+]
+ignore_errors = true
+omit = [
+    "ocrd_cis/*/cli"
+]
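`dynamic = ["version", "dependencies"]` defers both fields to the build backend: the `setuptools-ocrd` plugin derives the version from `ocrd-tool.json`, while `[tool.setuptools.dynamic]` reads the dependencies from `requirements.txt`. This is roughly the same version lookup that the deleted `setup.py` further below did by hand:

    import json

    # single source of truth for the package version (as in the old setup.py)
    with open('ocrd-tool.json', encoding='utf-8') as f:
        version = json.load(f)['version']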
"ocrd_cis.ocropy.cli:ocrd_cis_ocropy_denoise" +ocrd-cis-ocropy-binarize = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_binarize" + +[project.urls] +Homepage = "https://github.com/cisocrgroup/ocrd_cis" +Repository = "https://github.com/cisocrgroup/ocrd_cis.git" + +[project.optional-dependencies] +debug = ["matplotlib>3.0.0"] + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + +[tool.setuptools] +packages = ["ocrd_cis", "ocrd_cis.postcorrect", "ocrd_cis.aio", "ocrd_cis.data", "ocrd_cis.wer", "ocrd_cis.ocropy", "ocrd_cis.ocropy.ocrolib", "ocrd_cis.div", "ocrd_cis.align"] +package-data = {"*" = ["*.json", "*.jar", "model.zip", "3gs.csv.gz"]} + +[tool.pytest.ini_options] +minversion = 6.0 +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] + + +[tool.mypy] +plugins = ["numpy.typing.mypy_plugin"] + +ignore_missing_imports = true + + +strict = true + +disallow_subclassing_any = false +# ❗ error: Class cannot subclass "Processor" (has type "Any") +disallow_any_generics = false +disallow_untyped_defs = false +disallow_untyped_calls = false + + +[tool.ruff.lint] +select = ["E", "F", "I"] + + +[tool.coverage.run] +branch = true +source = [ + "ocrd_cis" +] +concurrency = [ + "thread", + "multiprocessing" +] + +[tool.coverage.report] +exclude_also = [ + "if self\\.debug", + "pragma: no cover", + "raise NotImplementedError", + "if __name__ == .__main__.:", +] +ignore_errors = true +omit = [ + "ocrd_cis/*/cli" +] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..a57112af --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +ocrd>=3.0.2 +click +scipy +numpy>=1.17.0 +pillow>=7.1.2 +shapely>=2.0.0 +scikit-image +networkx +opencv-python-headless +rapidfuzz diff --git a/setup.py b/setup.py deleted file mode 100644 index 72e11280..00000000 --- a/setup.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Installs: - - ocrd-cis-align - - ocrd-cis-postcorrect - - ocrd-cis-data - - ocrd-cis-ocropy-clip - - ocrd-cis-ocropy-denoise - - ocrd-cis-ocropy-deskew - - ocrd-cis-ocropy-binarize - - ocrd-cis-ocropy-resegment - - ocrd-cis-ocropy-segment - - ocrd-cis-ocropy-dewarp - - ocrd-cis-ocropy-recognize - - ocrd-cis-ocropy-train -""" - -import codecs -import json -from setuptools import setup -from setuptools import find_packages - -with codecs.open('README.md', encoding='utf-8') as f: - README = f.read() - -with open('./ocrd-tool.json', 'r') as f: - version = json.load(f)['version'] - -setup( - name='ocrd_cis', - version=version, - description='CIS OCR-D command line tools', - long_description=README, - long_description_content_type='text/markdown', - author='Florian Fink, Tobias Englmeier, Christoph Weber', - author_email='finkf@cis.lmu.de, englmeier@cis.lmu.de, web_chris@msn.com', - url='https://github.com/cisocrgroup/ocrd_cis', - license='MIT', - packages=find_packages(), - include_package_data=True, - install_requires=[ - 'ocrd>=2.30', - 'click', - 'scipy', - 'numpy>=1.17.0', - 'pillow>=7.1.2', - 'shapely>=1.7.1,<1.8', - 'scikit-image', - 'alphashape', - 'opencv-python-headless', - 'python-Levenshtein', - 'calamari_ocr == 0.3.5' - ], - extras_require={ - 'debug': ['matplotlib>3.0.0'], - }, - package_data={ - '': ['*.json', '*.yml', '*.yaml', '*.csv.gz', '*.jar', '*.zip'], - }, - entry_points={ - 'console_scripts': [ - 'ocrd-cis-align=ocrd_cis.align.cli:ocrd_cis_align', - 'ocrd-cis-postcorrect=ocrd_cis.postcorrect.cli:ocrd_cis_postcorrect', - 'ocrd-cis-data=ocrd_cis.data.__main__:main', - 
diff --git a/tests/run_add_zip_test.bash b/tests/run_add_zip_test.bash
index 003c5e86..e2d44983 100644
--- a/tests/run_add_zip_test.bash
+++ b/tests/run_add_zip_test.bash
@@ -6,31 +6,20 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip
 # test if there are 3 gt files
 pushd "$tmpws"
 found_files=0
-for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do
-    if [[ ! -f "$file" ]]; then
-        echo "cannot find ground truth file: $file"
-        exit 1
-    fi
+for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-GT-SEG-LINE); do
+    [[ -f "$file" ]] || fail "cannot find ground truth file: $file"
     found_files=$((found_files + 1))
 done
-if [[ $found_files != 3 ]]; then
-    echo "invalid number of files: $found_files"
-    exit 1
-fi
+(( found_files == 3 )) || fail "invalid number of files: $found_files"
 popd

 # test if there are 3 gt files
 pushd "$tmpws"
 found_files=0
-for file in $(ocrd workspace find -G OCR-D-IMG); do
-    if [[ ! -f "$file" ]]; then
-        echo "cannot find ground truth file: $file"
-        exit 1
-    fi
+for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-IMG); do
+    [[ -f "$file" ]] || fail "cannot find ground truth file: $file"
     found_files=$((found_files + 1))
 done
-if [[ $found_files != 3 ]]; then
-    echo "invalid number of files: $found_files"
-    exit 1
-fi
+(( found_files == 3 )) || fail "invalid number of files: $found_files"
 popd
+
diff --git a/tests/run_alignment_test.bash b/tests/run_alignment_test.bash
index 1e9e3ea0..7a82254b 100644
--- a/tests/run_alignment_test.bash
+++ b/tests/run_alignment_test.bash
@@ -6,32 +6,21 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip
 # test if there are 3 gt files
 pushd "$tmpws"
 found_files=0
-for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do
-    if [[ ! -f "$file" ]]; then
-        echo "cannot find ground truth file: $file"
-        exit 1
-    fi
+for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do
+    [[ -f "$file" ]] || fail "cannot find ground truth file: $file"
     found_files=$((found_files + 1))
 done
-if [[ $found_files != 3 ]]; then
-    echo "invalid number of files: $found_files"
-    exit 1
-fi
+(( found_files == 3 )) || fail "invalid number of files: $found_files"
 popd

 ocrd_cis_align

 pushd $tmpws
 found_files=0
-f "$file" ]]; then - echo "cannot find aligned file group workspace" - exit 1 - fi +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-CIS-ALIGN); do + [[ -f "$file" ]] || fail "cannot find aligned file group workspace" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi +(( found_files == 3 )) || fail "invalid number of files: $found_files" popd + diff --git a/tests/run_image_preprocessing_test.bash b/tests/run_image_preprocessing_test.bash deleted file mode 100644 index 4fd028e4..00000000 --- a/tests/run_image_preprocessing_test.bash +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash -set -e -source $(dirname $0)/test_lib.bash - -ocrd_cis_init_ws "blumenbach_anatomie_1805.ocrd.zip" - -# test if there are 3 gt files -pushd "$tmpws" -found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do - if [[ ! -f "$file" ]]; then - echo "cannot find ground truth file: $file" - exit 1 - fi - found_files=$((found_files + 1)) -done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi -popd - -ocrd-cis-ocropy-binarize --log-level DEBUG \ - --input-file-grp OCR-D-GT-SEG-LINE \ - --output-file-grp OCR-D-CIS-IMG-BIN \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-clip --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-BIN \ - --output-file-grp OCR-D-CIS-IMG-CLIP \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-denoise --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-CLIP \ - --output-file-grp OCR-D-CIS-IMG-DEN \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-deskew --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-DEN \ - --output-file-grp OCR-D-CIS-IMG-DES \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-dewarp --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-DES \ - --output-file-grp OCR-D-CIS-IMG-DEW \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-segment --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-DEW \ - --output-file-grp OCR-D-CIS-IMG-SEG \ - --mets "$tmpws/mets.xml" diff --git a/tests/run_ocr_test.bash b/tests/run_ocr_test.bash index 6de88a7b..f737ae43 100644 --- a/tests/run_ocr_test.bash +++ b/tests/run_ocr_test.bash @@ -6,31 +6,19 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do - if [[ ! 
-f "$file" ]]; then - echo "cannot find ground truth file: $file" - exit 1 - fi +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do + [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi -popd +(( $found_files == 3 )) || fail "invalid number of files: $found_files" # download ocr model -wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur1-00085000.pyrnn.gz" +ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz # run ocr -ocrd-cis-ocropy-recognize --log-level DEBUG \ - --input-file-grp "OCR-D-GT-SEG-LINE" \ - --output-file-grp OCR-D-CIS-OCR \ - --mets "$tmpws/mets.xml" \ - --parameter <(cat < "$tmpdir/bin/profiler.bash" < "bin/profiler.bash" < /dev/null echo '{}' EOF -chmod a+x "$tmpdir/bin/profiler.bash" -ocrd-cis-postcorrect --log-level DEBUG \ - -I OCR-D-CIS-ALIGN \ - -O OCR-D-CIS-POSTCORRECT \ - -m $tmpws/mets.xml \ - --parameter <(cat <e.#Säugethiere.#' $f sed -i -e 's#E#Säugethieren#' $f done @@ -57,7 +54,4 @@ cat $(ocrd-cis-data -config) \ | sed -e "s#/path/to/train.dir#$tmpdir/train#" ) -if [[ ! -f $tmpdir/train/model.zip ]]; then - echo $tmpdir/train/model.zip not found - exit 1 -fi +[[ -f "$tmpdir/train/model.zip" ]] || fail "$tmpdir/train/model.zip not found" diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 5d38f482..76111d25 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -1,51 +1,69 @@ #/bin/bash tmpdir=$(mktemp -d) -trap "rm -rf $tmpdir" EXIT +function stopserver() { + : +} +function failexit() { + stopserver +} +function cleanexit() { + stopserver + rm -rf $tmpdir +} +trap "trap failexit EXIT" ERR +trap cleanexit EXIT + +OCRD_LOG_ARGS=() +if test -v OCRD_OVERRIDE_LOGLEVEL; then + OCRD_LOG_ARGS+=(-l $OCRD_OVERRIDE_LOGLEVEL) +fi +OCRD_WS_ARGS=() # -m mets.xml OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://ocr-d-repo.scc.kit.edu/api/v1/dataresources/75ad9f94-dbaa-43e0-ab06-2ce24c497c61/data" + +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" - mkdir -p "$tmpdir/download" - wget -P "$tmpdir/download" "$url" + mkdir -p "$PWD/download" + wget -nc -P "$PWD/download" "$url" } function ocrd_cis_init_ws() { ocrd_cis_download_bagit "$1" - ocrd zip spill -d "$tmpdir" "$tmpdir/download/$1" + ocrd zip spill -d "$tmpdir" "$PWD/download/$1" tmpws="$tmpdir/${1%.ocrd.zip}" + if ((${OCRD_MAX_PARALLEL_PAGES:-0} > 1)); then + echo starting METS server at $tmpws + ocrd workspace -d "$tmpws" -U "$tmpws/mets.sock" server start & + OCRD_WS_ARGS+=(-U "$tmpws/mets.sock") + sleep 1 + function stopserver() { + echo stopping METS server at $tmpws + ocrd workspace -d "$tmpws" -U "$tmpws/mets.sock" server stop || true + } + fi } + function ocrd_cis_align() { # download ocr models - wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur1-00085000.pyrnn.gz" - wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur2-00062000.pyrnn.gz" + ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz + ocrd resmgr download ocrd-cis-ocropy-recognize fraktur-jze.pyrnn.gz # run ocr - ocrd-cis-ocropy-recognize --log-level DEBUG \ - --input-file-grp "OCR-D-GT-SEG-LINE" \ - --output-file-grp OCR-D-CIS-OCR-1 \ - --mets "$tmpws/mets.xml" \ - --parameter <(cat <&2 "$@" + false }