|
"""
This module contains variables that are permitted to be tweaked by the system environment. For
example, model parameters that change the output of an inference call. Constants do NOT belong in
this module. Constants are values that are usually names for common options (e.g., color names) or
settings that should not be altered without making a code change (e.g., definition of 1Gb of memory
in bytes). Constants should go into `./constants.py`.
"""
| 8 | +import os |
| 9 | +from dataclasses import dataclass |
| 10 | + |
| 11 | + |
@dataclass
class InferenceConfig:
    """Inference parameters that can be overridden through environment variables.

    Each setting is a read-only property that looks up an environment variable of the same name
    on every access. When the variable is unset or empty, the default written in the property
    is returned instead.
    """

    def _get_string(self, var: str, default_value: str = "") -> str:
        """attempt to get the value of var from the os environment; if not present return the
        default_value"""
        return os.environ.get(var, default_value)

    def _get_parsed(self, var: str, caster: type, default_value):
        """Read `var` from the environment and convert it with `caster`.

        Returns `default_value` when the variable is unset or set to an empty string.

        Raises:
            ValueError: when the variable holds text that `caster` rejects. The message names the
                variable and shows the raw value so a misconfiguration can be traced.
        """
        raw = self._get_string(var)
        if not raw:
            return default_value
        try:
            return caster(raw)
        except ValueError as err:
            # the bare int()/float() message does not say which variable was malformed
            raise ValueError(
                f"environment variable {var}={raw!r} is not a valid {caster.__name__}"
            ) from err

    def _get_int(self, var: str, default_value: int) -> int:
        """Return environment variable `var` as an int, or `default_value` if unset or empty.

        Raises:
            ValueError: when the variable is set to something that is not an integer.
        """
        return self._get_parsed(var, int, default_value)

    def _get_float(self, var: str, default_value: float) -> float:
        """Return environment variable `var` as a float, or `default_value` if unset or empty.

        Raises:
            ValueError: when the variable is set to something that is not a number.
        """
        return self._get_parsed(var, float, default_value)

    @property
    def TABLE_IMAGE_CROP_PAD(self) -> int:
        """extra image content to add around an identified table region; measured in pixels

        The padding adds image data around an identified table bounding box for downstream table
        structure detection model use as input
        """
        return self._get_int("TABLE_IMAGE_CROP_PAD", 12)

    @property
    def TABLE_IMAGE_BACKGROUND_PAD(self) -> int:
        """number of pixels to pad around a table image with a white background color

        The padding adds NO image data around an identified table bounding box; it simply adds white
        background around the image
        """
        return self._get_int("TABLE_IMAGE_BACKGROUND_PAD", 0)

    @property
    def LAYOUT_SAME_REGION_THRESHOLD(self) -> float:
        """threshold for two layouts' bounding boxes to be considered as the same region

        When the intersection area over union area of the two is larger than this threshold the two
        boxes are considered the same region
        """
        return self._get_float("LAYOUT_SAME_REGION_THRESHOLD", 0.75)

    @property
    def LAYOUT_SUBREGION_THRESHOLD(self) -> float:
        """threshold for one bounding box to be considered as a sub-region of another bounding box

        When the intersection region area divided by self area is larger than this threshold self is
        considered a subregion of the other
        """
        return self._get_float("LAYOUT_SUBREGION_THRESHOLD", 0.75)

    @property
    def ELEMENTS_H_PADDING_COEF(self) -> float:
        """When extending the boundaries of a PDF object for the purpose of determining which other
        elements should be considered in the same text region, we use a relative distance based on
        some fraction of the block height (typically character height). This is the fraction used
        for the horizontal extension applied to the left and right sides.
        """
        return self._get_float("ELEMENTS_H_PADDING_COEF", 0.4)

    @property
    def ELEMENTS_V_PADDING_COEF(self) -> float:
        """Same as ELEMENTS_H_PADDING_COEF but the vertical extension."""
        return self._get_float("ELEMENTS_V_PADDING_COEF", 0.3)
| 80 | + |
| 81 | + |
# Module-level InferenceConfig instance, created once when this module is imported.
inference_config = InferenceConfig()
0 commit comments