Skip to content

Commit ee6e638

Browse files
authored
Merge branch 'main' into pprados/fix_password
2 parents 71bb5f7 + 85bcdc1 commit ee6e638

File tree

9 files changed

+56
-27
lines changed

9 files changed

+56
-27
lines changed

.github/workflows/ci.yml

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ on:
66
pull_request:
77
branches: [ main ]
88

9-
env:
10-
PYTHON_VERSION: 3.9
11-
129
jobs:
1310
setup:
11+
strategy:
12+
matrix:
13+
python-version: ["3.9","3.10","3.11", "3.12"]
1414
runs-on: ubuntu-latest
1515
steps:
1616
- uses: actions/checkout@v4
@@ -19,24 +19,27 @@ jobs:
1919
with:
2020
path: |
2121
.venv
22-
key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
22+
key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }}
2323
lookup-only: true
24-
- name: Set up Python ${{ env.PYTHON_VERSION }}
24+
- name: Set up Python ${{ matrix.python-version }}
2525
uses: actions/setup-python@v5
2626
with:
27-
python-version: ${{ env.PYTHON_VERSION }}
27+
python-version: ${{ matrix.python-version }}
2828
- name: Install Poppler
2929
run: |
3030
sudo apt-get update
3131
sudo apt-get -y install poppler-utils
3232
- name: Setup virtual environment (no cache hit)
3333
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
3434
run: |
35-
python${{ env.PYTHON_VERSION }} -m venv .venv
35+
python${{ matrix.python-version }} -m venv .venv
3636
source .venv/bin/activate
3737
make install-ci
3838
3939
lint:
40+
strategy:
41+
matrix:
42+
python-version: ["3.9","3.10","3.11", "3.12"]
4043
runs-on: ubuntu-latest
4144
needs: setup
4245
steps:
@@ -45,13 +48,13 @@ jobs:
4548
id: virtualenv-cache
4649
with:
4750
path: .venv
48-
key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
51+
key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }}
4952
# NOTE(robinson) - This is a fallback in case the lint job does not find the cache.
5053
# We can take this out when we implement the fix in CORE-99
5154
- name: Setup virtual environment (no cache hit)
5255
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
5356
run: |
54-
python${{ env.PYTHON_VERSION }} -m venv .venv
57+
python${{ matrix.python-version }} -m venv .venv
5558
- name: Lint
5659
run: |
5760
source .venv/bin/activate
@@ -66,6 +69,9 @@ jobs:
6669
uses: ludeeus/action-shellcheck@master
6770

6871
test:
72+
strategy:
73+
matrix:
74+
python-version: ["3.9","3.10","3.11", "3.12"]
6975
runs-on: ubuntu-latest
7076
needs: [setup, lint]
7177
steps:
@@ -75,13 +81,13 @@ jobs:
7581
with:
7682
path: |
7783
.venv
78-
key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
84+
key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }}
7985
# NOTE(robinson) - This is a fallback in case the lint job does not find the cache.
8086
# We can take this out when we implement the fix in CORE-99
8187
- name: Setup virtual environment (no cache hit)
8288
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
8389
run: |
84-
python${{ env.PYTHON_VERSION }} -m venv .venv
90+
python${{ matrix.python-version }} -m venv .venv
8591
- name: Install Poppler
8692
run: |
8793
sudo apt-get update

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
## 0.8.6
2+
3+
* feat: add back `source` to `TextRegions` and `LayoutElements` for backward compatibility
4+
5+
## 0.8.5
6+
7+
* fix: remove `pdfplumber` but include `pdfminer-six==20240706` to update `pdfminer`
8+
19
## 0.8.4
210

311
* feat: add `text_as_html` and `table_as_cells` to `LayoutElements` class as new attributes

requirements/base.in

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,5 @@ transformers>=4.25.1
1313
rapidfuzz
1414
pandas
1515
scipy
16-
pdfplumber
16+
pypdfium2
17+
pdfminer-six==20240706

requirements/base.txt

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ filelock==3.16.1
2727
# transformers
2828
flatbuffers==24.12.23
2929
# via onnxruntime
30-
fonttools==4.55.3
30+
fonttools==4.55.4
3131
# via matplotlib
3232
fsspec==2024.12.0
3333
# via
@@ -83,14 +83,11 @@ packaging==24.2
8383
# transformers
8484
pandas==2.2.3
8585
# via -r requirements/base.in
86-
pdfminer-six==20231228
87-
# via pdfplumber
88-
pdfplumber==0.11.5
86+
pdfminer-six==20240706
8987
# via -r requirements/base.in
9088
pillow==11.1.0
9189
# via
9290
# matplotlib
93-
# pdfplumber
9491
# torchvision
9592
protobuf==5.29.3
9693
# via
@@ -101,7 +98,7 @@ pycparser==2.22
10198
pyparsing==3.2.1
10299
# via matplotlib
103100
pypdfium2==4.30.1
104-
# via pdfplumber
101+
# via -r requirements/base.in
105102
python-dateutil==2.9.0.post0
106103
# via
107104
# matplotlib
@@ -135,7 +132,7 @@ sympy==1.13.1
135132
# via
136133
# onnxruntime
137134
# torch
138-
timm==1.0.13
135+
timm==1.0.14
139136
# via -r requirements/base.in
140137
tokenizers==0.21.0
141138
# via transformers
@@ -150,7 +147,7 @@ tqdm==4.67.1
150147
# via
151148
# huggingface-hub
152149
# transformers
153-
transformers==4.48.0
150+
transformers==4.48.1
154151
# via -r requirements/base.in
155152
typing-extensions==4.12.2
156153
# via

requirements/dev.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ executing==2.1.0
8080
# via stack-data
8181
fastjsonschema==2.21.1
8282
# via nbformat
83-
fonttools==4.55.3
83+
fonttools==4.55.4
8484
# via
8585
# -c requirements/base.txt
8686
# matplotlib
@@ -106,7 +106,7 @@ idna==3.10
106106
# httpx
107107
# jsonschema
108108
# requests
109-
importlib-metadata==8.5.0
109+
importlib-metadata==8.6.1
110110
# via
111111
# build
112112
# jupyter-client
@@ -273,7 +273,7 @@ platformdirs==4.3.6
273273
# jupyter-core
274274
prometheus-client==0.21.1
275275
# via jupyter-server
276-
prompt-toolkit==3.0.48
276+
prompt-toolkit==3.0.50
277277
# via
278278
# ipython
279279
# jupyter-console

test_unstructured_inference/test_elements.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def test_layoutelements():
6161
element_coords=coords,
6262
element_class_ids=element_class_ids,
6363
element_class_id_map=class_map,
64-
sources=np.array(["yolox"] * len(element_class_ids)),
64+
source="yolox",
6565
)
6666

6767

@@ -441,20 +441,22 @@ def test_layoutelements_to_list_and_back(test_layoutelements):
441441
def test_layoutelements_from_list_no_elements():
442442
back = LayoutElements.from_list(elements=[])
443443
assert back.sources.size == 0
444+
assert back.source is None
444445
assert back.element_coords.size == 0
445446

446447

447448
def test_textregions_from_list_no_elements():
448449
back = TextRegions.from_list(regions=[])
449450
assert back.sources.size == 0
451+
assert back.source is None
450452
assert back.element_coords.size == 0
451453

452454

453455
def test_layoutelements_concatenate():
454456
layout1 = LayoutElements(
455457
element_coords=np.array([[0, 0, 1, 1], [1, 1, 2, 2]]),
456458
texts=np.array(["a", "two"]),
457-
sources=np.array(["yolox", "yolox"]),
459+
source="yolox",
458460
element_class_ids=np.array([0, 1]),
459461
element_class_id_map={0: "type0", 1: "type1"},
460462
)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.8.4" # pragma: no cover
1+
__version__ = "0.8.6" # pragma: no cover

unstructured_inference/inference/elements.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,11 +211,21 @@ class TextRegions:
211211
element_coords: np.ndarray
212212
texts: np.ndarray = field(default_factory=lambda: np.array([]))
213213
sources: np.ndarray = field(default_factory=lambda: np.array([]))
214+
source: Source | None = None
214215

215216
def __post_init__(self):
216217
if self.texts.size == 0 and self.element_coords.size > 0:
217218
self.texts = np.array([None] * self.element_coords.shape[0])
218219

220+
# for backward compatibility; also allows a single value to set the sources for all regions
221+
if self.sources.size == 0 and self.element_coords.size > 0:
222+
self.sources = np.array([self.source] * self.element_coords.shape[0])
223+
elif self.source is None and self.sources.size:
224+
self.source = self.sources[0]
225+
226+
# we convert to float so data type is more consistent (e.g., None will be np.nan)
227+
self.element_coords = self.element_coords.astype(float)
228+
219229
def slice(self, indices) -> TextRegions:
220230
"""slice text regions based on indices"""
221231
return TextRegions(

unstructured_inference/inference/layoutelement.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,18 @@ def __post_init__(self):
4141
"element_probs",
4242
"element_class_ids",
4343
"texts",
44-
"sources",
4544
"text_as_html",
4645
"table_as_cells",
4746
):
4847
if getattr(self, attr).size == 0 and element_size:
4948
setattr(self, attr, np.array([None] * element_size))
5049

50+
# for backward compatibility; also allows a single value to set the sources for all regions
51+
if self.sources.size == 0 and self.element_coords.size > 0:
52+
self.sources = np.array([self.source] * self.element_coords.shape[0])
53+
elif self.source is None and self.sources.size:
54+
self.source = self.sources[0]
55+
5156
self.element_probs = self.element_probs.astype(float)
5257

5358
def __eq__(self, other: object) -> bool:

0 commit comments

Comments
 (0)