Skip to content

Commit ef76d4d

Browse files
author
Jamey O'Neill
committed
Resolve merge conflict: add model HuggingFace link and remove HF_TOKEN
2 parents 9e022d1 + 805a92b commit ef76d4d

19 files changed

+3020
-1587
lines changed

.github/workflows/ci.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,6 @@ jobs:
3030
- name: Install dependencies
3131
run: make install
3232

33-
- name: Revert the changes to bin folder
34-
run: chmod 644 ./bin/*
35-
3633
# - name: Check code
3734
# run: make check
3835

.github/workflows/deploy-schemas.yml

Lines changed: 0 additions & 13 deletions
This file was deleted.

.github/workflows/docs.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
name: build
1+
name: Build and deploy documentation
22
on:
33
push:
44
branches:
55
- main
66
jobs:
7-
deploy:
7+
build-and-deploy:
88
runs-on: ubuntu-latest
99
steps:
1010
- uses: actions/checkout@v2

.github/workflows/publish-to-pypi-beta.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
runs-on: ubuntu-latest
1919
strategy:
2020
matrix:
21-
python-version: [3.11]
21+
python-version: [3.12]
2222

2323
steps:
2424
- name: Checkout the repository

.github/workflows/publish-to-pypi.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
runs-on: ubuntu-latest
1919
strategy:
2020
matrix:
21-
python-version: [3.11]
21+
python-version: [3.12]
2222

2323
steps:
2424
- name: Checkout the repository

.github/workflows/publish-to-test-pypi.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ jobs:
1717
runs-on: ubuntu-latest
1818
strategy:
1919
matrix:
20-
python-version: [3.11]
20+
python-version: [3.12]
2121

2222
steps:
2323
- name: Checkout the repository

CHANGELOG.md

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,20 @@
11
# Changelog
22

3-
## [0.1.0] - Unreleased
3+
All notable changes to this project will be documented in this file.
44

5-
- Initial poster2json package (example structure).
5+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7+
8+
## [0.1.1] - 2026-02-04
9+
10+
### Added
11+
12+
- Add documentation to the package.
13+
- Add tests to the package.
14+
- Update the logo for the package.
15+
16+
## [0.1.0] - 2026-02-04
17+
18+
### Added
19+
20+
- Initial poster2json package.

Makefile

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,15 +48,14 @@ endif
4848

4949
# TEST ########################################################################
5050

51-
RANDOM_SEED ?= $(shell date +%s)
5251
FAILURES := .cache/pytest/v/cache/lastfailed
5352

54-
PYTEST_OPTIONS := --random --random-seed=$(RANDOM_SEED)
5553
ifndef DISABLE_COVERAGE
5654
PYTEST_OPTIONS += --cov=$(PACKAGE)
5755
endif
5856
ifdef CI
5957
PYTEST_OPTIONS += --cov-report=xml
58+
PYTEST_OPTIONS += -m "not gpu"
6059
endif
6160
PYTEST_RERUN_OPTIONS := --last-failed --exitfirst
6261

@@ -68,27 +67,18 @@ test-unit: install
6867
@ ( mv $(FAILURES) $(FAILURES).bak || true ) > /dev/null 2>&1
6968
poetry run pytest $(PACKAGE) $(PYTEST_OPTIONS)
7069
@ ( mv $(FAILURES).bak $(FAILURES) || true ) > /dev/null 2>&1
71-
ifndef DISABLE_COVERAGE
72-
poetry run coveragespace update unit
73-
endif
7470

7571
.PHONY: test-int
7672
test-int: install
7773
@ if test -e $(FAILURES); then poetry run pytest tests $(PYTEST_RERUN_OPTIONS); fi
7874
@ rm -rf $(FAILURES)
7975
poetry run pytest tests $(PYTEST_OPTIONS)
80-
ifndef DISABLE_COVERAGE
81-
poetry run coveragespace update integration
82-
endif
8376

8477
.PHONY: test-all
8578
test-all: install
8679
@ if test -e $(FAILURES); then poetry run pytest $(PACKAGE) tests $(PYTEST_RERUN_OPTIONS); fi
8780
@ rm -rf $(FAILURES)
8881
poetry run pytest $(PACKAGE) tests $(PYTEST_OPTIONS)
89-
ifndef DISABLE_COVERAGE
90-
poetry run coveragespace update overall
91-
endif
9282

9383
.PHONY: read-coverage
9484
read-coverage:

README.md

Lines changed: 52 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<div align="center">
22

3-
<img src="https://raw.githubusercontent.com/fairdataihub/poster2json/main/logo.svg" alt="logo" width="200" height="auto" />
3+
<img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" />
44

55
<br />
66

@@ -58,6 +58,7 @@ Convert scientific posters (PDF/images) to structured JSON metadata using Large
5858
**poster2json** extracts structured metadata from scientific conference posters (PDF or image format) into machine-actionable JSON conforming to the [poster-json-schema](https://github.com/fairdataihub/poster-json-schema).
5959

6060
The pipeline uses:
61+
6162
- [**Llama-3.1-8B-Poster-Extraction**](https://huggingface.co/jimnoneill/Llama-3.1-8B-Poster-Extraction) for JSON structuring
6263
- **Qwen2-VL-7B** for vision-based OCR of image posters
6364
- **pdfalto** for layout-aware PDF text extraction
@@ -104,49 +105,56 @@ Output conforms to the [poster-json-schema](https://github.com/fairdataihub/post
104105
{
105106
"$schema": "https://posters.science/schema/v0.1/poster_schema.json",
106107
"creators": [
107-
{"name": "Garcia, Sofia", "givenName": "Sofia", "familyName": "Garcia", "affiliation": ["University"]}
108+
{
109+
"name": "Garcia, Sofia",
110+
"givenName": "Sofia",
111+
"familyName": "Garcia",
112+
"affiliation": ["University"]
113+
}
114+
],
115+
"titles": [
116+
{ "title": "Machine Learning Approaches to Diabetic Retinopathy Detection" }
108117
],
109-
"titles": [{"title": "Machine Learning Approaches to Diabetic Retinopathy Detection"}],
110118
"posterContent": {
111119
"sections": [
112-
{"sectionTitle": "Abstract", "sectionContent": "..."},
113-
{"sectionTitle": "Methods", "sectionContent": "..."},
114-
{"sectionTitle": "Results", "sectionContent": "..."}
120+
{ "sectionTitle": "Abstract", "sectionContent": "..." },
121+
{ "sectionTitle": "Methods", "sectionContent": "..." },
122+
{ "sectionTitle": "Results", "sectionContent": "..." }
115123
]
116124
},
117-
"imageCaptions": [{"captions": ["Figure 1.", "ROC curves showing..."]}],
118-
"tableCaptions": [{"captions": ["Table 1.", "Performance metrics"]}]
125+
"imageCaptions": [{ "captions": ["Figure 1.", "ROC curves showing..."] }],
126+
"tableCaptions": [{ "captions": ["Table 1.", "Performance metrics"] }]
119127
}
120128
```
121129

122130
## System Requirements
123131

124-
| Requirement | Specification |
125-
|-------------|---------------|
126-
| GPU | NVIDIA CUDA-capable, ≥16GB VRAM |
127-
| RAM | ≥32GB recommended |
128-
| Python | 3.10+ |
129-
| OS | Linux, macOS, Windows (via WSL2) |
132+
| Requirement | Specification |
133+
| ----------- | -------------------------------- |
134+
| GPU | NVIDIA CUDA-capable, ≥16GB VRAM |
135+
| RAM | ≥32GB recommended |
136+
| Python | 3.10+ |
137+
| OS | Linux, macOS, Windows (via WSL2) |
130138

131139
## Performance
132140

133141
Validated on 10 manually annotated scientific posters:
134142

135-
| Metric | Score | Threshold |
136-
|--------|-------|-----------|
137-
| Word Capture | 0.96 | ≥0.75 |
138-
| ROUGE-L | 0.89 | ≥0.75 |
139-
| Number Capture | 0.93 | ≥0.75 |
140-
| Field Proportion | 0.99 | 0.30–2.50 |
143+
| Metric | Score | Threshold |
144+
| ---------------- | ----- | --------- |
145+
| Word Capture | 0.96 | ≥0.75 |
146+
| ROUGE-L | 0.89 | ≥0.75 |
147+
| Number Capture | 0.93 | ≥0.75 |
148+
| Field Proportion | 0.99 | 0.30–2.50 |
141149

142150
**Pass Rate**: 10/10 (100%)
143151

144152
## Documentation
145153

146-
| Document | Description |
147-
|----------|-------------|
154+
| Document | Description |
155+
| ------------------------------------ | ------------------------------- |
148156
| [Architecture](docs/architecture.md) | Technical details & methodology |
149-
| [Evaluation](docs/evaluation.md) | Validation metrics & results |
157+
| [Evaluation](docs/evaluation.md) | Validation metrics & results |
150158

151159
## Development Setup
152160

@@ -155,15 +163,32 @@ Validated on 10 manually annotated scientific posters:
155163
git clone https://github.com/fairdataihub/poster2json.git
156164
cd poster2json
157165

158-
# Install with Poetry
166+
# Create a virtual environment
167+
python -m venv .venv
168+
169+
# Activate the virtual environment
170+
source .venv/bin/activate # On Linux/macOS
171+
.venv\Scripts\activate # On Windows
172+
173+
# Install poetry
159174
pip install poetry
175+
176+
# Install dependencies
160177
poetry install
161178

162179
# Run tests
163-
poetry run pytest
180+
poe test
164181

165182
# Format code
166-
poetry run poe format
183+
poe format
184+
```
185+
186+
If you are on Windows and have multiple Python versions installed, you can use the following commands:
187+
188+
```bash
189+
py -0p # list all python versions
190+
191+
py -3.12 -m venv .venv
167192
```
168193

169194
## License
@@ -175,7 +200,7 @@ MIT License - see [LICENSE](LICENSE.md) for details.
175200
```bibtex
176201
@software{poster2json2026,
177202
title = {poster2json: Scientific Poster to JSON Metadata Extraction},
178-
author = {O'Neill, James and Soundarajan, Sanjay and Patel, Bhavesh},
203+
author = {O'Neill, James and Soundarajan, Sanjay and Portillo, Dorian and Patel, Bhavesh},
179204
year = {2026},
180205
url = {https://github.com/fairdataihub/poster2json},
181206
doi = {10.5281/zenodo.18320010}

0 commit comments

Comments
 (0)