Add PaddleDetection-based Layout Model (#54)

an1018 · lolipopshock · web-flow · commit 035f66a22b86 · 2021-08-17T12:33:14.000-04:00
* add paddle model

* Better model downloading logic

* Use layout parser PathManager

* simplify the layoutmodel in paddledetection

* remove the empty preprocess.py file

* include paddle models in dev-requirements

Co-authored-by: Shannon Shen <22512825+lolipopshock@users.noreply.github.com>
diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -10,4 +10,5 @@ sphinx_rtd_theme
 google-cloud-vision==1
 pytesseract
 pycocotools
-git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2
+git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2
+paddlepaddle
diff --git a/src/layoutparser/__init__.py b/src/layoutparser/__init__.py
@@ -15,7 +15,8 @@
 )
 
 from .models import (
-    Detectron2LayoutModel
+    Detectron2LayoutModel,
+    PaddleDetectionLayoutModel
 )
 
 from .io import (
diff --git a/src/layoutparser/models/__init__.py b/src/layoutparser/models/__init__.py
@@ -1 +1,2 @@
-from .detectron2.layoutmodel import Detectron2LayoutModel
+from .detectron2.layoutmodel import Detectron2LayoutModel
+from .paddledetection.layoutmodel import PaddleDetectionLayoutModel
diff --git a/src/layoutparser/models/paddledetection/__init__.py b/src/layoutparser/models/paddledetection/__init__.py
@@ -0,0 +1,2 @@
+from . import catalog as _UNUSED
+from .layoutmodel import PaddleDetectionLayoutModel
diff --git a/src/layoutparser/models/paddledetection/catalog.py b/src/layoutparser/models/paddledetection/catalog.py
@@ -0,0 +1,198 @@
+import os
+import logging
+from typing import Any, Optional
+from urllib.parse import urlparse
+import tarfile
+import uuid
+
+from iopath.common.file_io import PathHandler
+from iopath.common.file_io import HTTPURLHandler
+from iopath.common.file_io import get_cache_dir, file_lock
+from iopath.common.download import download
+
+from ..base_catalog import PathManager
+
+# Maps dataset name -> model name -> URL of the tar archive holding that model's files.
+# LayoutParserPaddleModelHandler looks entries up here when it resolves a path whose
+# last component is "config".
+CONFIG_CATALOG = {
+    "PubLayNet": {
+        "ppyolov2_r50vd_dcn_365e_publaynet": "https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar",
+    },
+    "TableBank": {
+        "ppyolov2_r50vd_dcn_365e_tableBank_word": "https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar",
+        "ppyolov2_r50vd_dcn_365e_tableBank_latex": "https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar",
+    },
+}
+
+# fmt: off
+# Maps dataset name -> {integer key: label name}.
+# NOTE(review): the integer keys are presumably the class indices predicted by the
+# models trained on each dataset -- confirm against the layout model code.
+LABEL_MAP_CATALOG = {
+    "PubLayNet": {
+        0: "Text",
+        1: "Title",
+        2: "List",
+        3: "Table",
+        4: "Figure"},
+    "TableBank": {
+        0: "Table"
+    },
+}
+# fmt: on
+
+
+# Paddle models are packaged as tar files. Each model's tar file is expected to
+# contain every file named in this list; these are the only files extracted from
+# the archive, and all of them must be present for a cached model folder to be
+# considered valid.
+# NOTE: the first name is a prefix of the second, so plain substring matching
+# against these names is ambiguous.
+_TAR_FILE_NAME_LIST = [
+    "inference.pdiparams",
+    "inference.pdiparams.info",
+    "inference.pdmodel",
+]
+
+
+def _get_untar_directory(tar_file: str) -> str:
+    """Return the folder that a model tar file is extracted into.
+
+    The folder sits next to the tar file and carries the tar file's name with
+    its last extension removed, e.g. ``cache/model.tar`` -> ``cache/model``.
+    """
+    parent_dir, tar_name = os.path.split(tar_file)
+    stem, _extension = os.path.splitext(tar_name)
+    return os.path.join(parent_dir, stem)
+
+
+def _untar_model_weights(model_tar):
+    """Extract the files named in _TAR_FILE_NAME_LIST from a model tar archive.
+
+    The files are written flat into the folder returned by
+    ``_get_untar_directory(model_tar)``, whatever directory they live in inside
+    the archive. Extraction is skipped when every expected file is already
+    present in that folder.
+
+    Args:
+        model_tar: Path to the downloaded model tar file.
+
+    Returns:
+        The path of the folder that holds the extracted model files.
+    """
+    import shutil  # only needed for streaming the archive members to disk
+
+    model_dir = _get_untar_directory(model_tar)
+
+    # Require *all* expected files, the same criterion used by
+    # is_cached_folder_exists_and_valid; checking only some of them would leave a
+    # partially extracted folder permanently incomplete.
+    if all(
+        os.path.exists(os.path.join(model_dir, name)) for name in _TAR_FILE_NAME_LIST
+    ):
+        return model_dir
+
+    # the path to save the decompressed file
+    os.makedirs(model_dir, exist_ok=True)
+    with tarfile.open(model_tar, "r") as tarobj:
+        for member in tarobj.getmembers():
+            # Match on the end of the member path: a substring test cannot tell
+            # "inference.pdiparams" from "inference.pdiparams.info".
+            filename = next(
+                (
+                    name
+                    for name in _TAR_FILE_NAME_LIST
+                    if member.name.endswith(name)
+                ),
+                None,
+            )
+            if filename is None:
+                continue
+            source = tarobj.extractfile(member)
+            if source is None:
+                # Not a regular file or link (e.g. a directory entry): nothing to copy.
+                continue
+            with source, open(os.path.join(model_dir, filename), "wb") as model_file:
+                # Stream the content instead of loading the whole weight file in memory.
+                shutil.copyfileobj(source, model_file)
+    return model_dir
+
+
+def is_cached_folder_exists_and_valid(cached):
+    """Tell whether the tar file at `cached` has already been fully extracted.
+
+    Returns True only when the extraction folder derived from `cached` exists
+    and contains every file named in _TAR_FILE_NAME_LIST.
+    """
+    extracted_dir = _get_untar_directory(cached)
+    return os.path.exists(extracted_dir) and all(
+        os.path.exists(os.path.join(extracted_dir, required_name))
+        for required_name in _TAR_FILE_NAME_LIST
+    )
+
+
+class PaddleModelURLHandler(HTTPURLHandler):
+    """
+    Supports download and file check for Baidu Cloud links
+    """
+
+    # File names longer than this are shortened before being used in the cache.
+    MAX_FILENAME_LEN = 250
+
+    def _get_supported_prefixes(self):
+        """Return the URL prefixes handled by this class."""
+        return ["https://paddle-model-ecology.bj.bcebos.com"]
+
+    def _isfile(self, path):
+        """Return True only if `path` has already been resolved into the cache map."""
+        return path in self.cache_map
+
+    def _get_local_path(
+        self,
+        path: str,
+        force: bool = False,
+        cache_dir: Optional[str] = None,
+        **kwargs: Any,
+    ) -> str:
+        """
+        As paddle model stores all files in tar files, we need to extract them
+        and get the newly extracted folder path. This function rewrites the base
+        function to support the following situations:
+
+        1. If the tar file is not downloaded, it will download the tar file,
+            extract it to the target folder, delete the downloaded tar file,
+            and return the folder path.
+        2. If the extracted target folder is present, and all the necessary model
+            files are present (specified in _TAR_FILE_NAME_LIST), it will
+            return the folder path.
+        3. If the tar file is downloaded, but the extracted target folder is not
+            present (or it doesn't contain the necessary files in _TAR_FILE_NAME_LIST),
+            it will extract the tar file to the target folder, delete the tar file,
+            and return the folder path.
+        4. If the URL does not end with ".tar", the file is downloaded (when not
+            already present) and the path of the downloaded file is returned.
+
+        Args:
+            path: The URL to resolve.
+            force: Re-resolve the URL even if it is already in the cache map.
+                NOTE(review): this does not bypass the check for an already
+                extracted folder, so a valid extracted model is not re-downloaded.
+            cache_dir: Optional cache root, passed to ``get_cache_dir``.
+
+        Returns:
+            The local path recorded in the cache map for `path`.
+        """
+        self._check_kwargs(kwargs)
+        if (
+            force
+            or path not in self.cache_map
+            or not os.path.exists(self.cache_map[path])
+        ):
+            logger = logging.getLogger(__name__)
+            parsed_url = urlparse(path)
+            dirname = os.path.join(
+                get_cache_dir(cache_dir), os.path.dirname(parsed_url.path.lstrip("/"))
+            )
+            filename = path.split("/")[-1]
+            if len(filename) > self.MAX_FILENAME_LEN:
+                filename = filename[:100] + "_" + uuid.uuid4().hex
+
+            cached = os.path.join(dirname, filename)
+
+            if is_cached_folder_exists_and_valid(cached):
+                # When the cached folder exists and valid, we don't need to redownload
+                # the tar file.
+                self.cache_map[path] = _get_untar_directory(cached)
+
+            else:
+                with file_lock(cached):
+                    if not os.path.isfile(cached):
+                        logger.info("Downloading {} ...".format(path))
+                        cached = download(path, dirname, filename=filename)
+
+                    # Default to the downloaded file itself so that URLs which are
+                    # not tar archives still resolve to a valid local path.
+                    local_path = cached
+
+                    if path.endswith(".tar"):
+                        local_path = _untar_model_weights(cached)
+                        try:
+                            os.remove(cached)  # remove the redundant tar file
+                            # TODO: remove the .lock file .
+                        except OSError:
+                            # Best effort only: a leftover tar file is harmless.
+                            logger.warning(
+                                f"Not able to remove the cached tar file {cached}"
+                            )
+
+                logger.info("URL {} cached in {}".format(path, local_path))
+                self.cache_map[path] = local_path
+
+        return self.cache_map[path]
+
+
+class LayoutParserPaddleModelHandler(PathHandler):
+    """
+    Resolve "lp://paddledetection/..." paths to local paths of the
+    LayoutParser model zoo entries listed in CONFIG_CATALOG.
+    """
+
+    PREFIX = "lp://paddledetection/"
+
+    def _get_supported_prefixes(self):
+        """Return the single path prefix handled by this class."""
+        return [self.PREFIX]
+
+    def _get_local_path(self, path, **kwargs):
+        """Translate ``<PREFIX><dataset>/<model name>/<data type>`` to a local path.
+
+        Only the "config" data type is supported; any other value raises
+        ValueError. The catalog URL is resolved through PathManager.
+        """
+        relative_path = path[len(self.PREFIX) :]
+        dataset_name, *name_parts, data_type = relative_path.split("/")
+
+        if data_type != "config":
+            raise ValueError(f"Unknown data_type {data_type}")
+
+        model_url = CONFIG_CATALOG[dataset_name]["/".join(name_parts)]
+        return PathManager.get_local_path(model_url, **kwargs)
+
+    def _open(self, path, mode="r", **kwargs):
+        """Open the local path that `path` resolves to, via PathManager."""
+        local_path = self._get_local_path(path)
+        return PathManager.open(local_path, mode, **kwargs)
+
+
+# Import-time side effect: register both handlers with the shared PathManager so
+# that "lp://paddledetection/..." paths and the download URLs they map to can be
+# resolved to local paths.
+PathManager.register_handler(PaddleModelURLHandler())
+PathManager.register_handler(LayoutParserPaddleModelHandler())
diff --git a/src/layoutparser/models/paddledetection/layoutmodel.py b/src/layoutparser/models/paddledetection/layoutmodel.py
diff --git a/tests/test_model.py b/tests/test_model.py

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,8 @@`
`15`	`15`	`)`
`16`	`16`
`17`	`17`	`from .models import (`
`18`		`- Detectron2LayoutModel`
	`18`	`+ Detectron2LayoutModel,`
	`19`	`+ PaddleDetectionLayoutModel`
`19`	`20`	`)`
`20`	`21`
`21`	`22`	`from .io import (`
Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-from .detectron2.layoutmodel import Detectron2LayoutModel`
	`1`	`+from .detectron2.layoutmodel import Detectron2LayoutModel`
	`2`	`+from .paddledetection.layoutmodel import PaddleDetectionLayoutModel`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from . import catalog as _UNUSED`
	`2`	`+from .layoutmodel import PaddleDetectionLayoutModel`