Skip to content

Commit 2dcacd9

Browse files
authored
optionally allow consolidation via env variables (#288)
1 parent d9b6184 commit 2dcacd9

File tree

4 files changed

+139
-22
lines changed

4 files changed

+139
-22
lines changed

.env

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
COMPOSE_PROJECT_NAME=sciencebeam
22
IMAGE_TAG=develop
33
VERSION=
4-
GROBID_TAG=0.5.3
4+
GROBID_TAG=0.6.0
55
SCIENCEBEAM_DOC_CONVERT_PROCESS_TIMEOUT=60
6+
SCIENCEBEAM_CONSOLIDATE_HEADER=0
7+
SCIENCEBEAM_CONSOLIDATE_CITATIONS=0
8+
SCIENCEBEAM_INCLUDE_RAW_AFFILIATIONS=1
9+
SCIENCEBEAM_INCLUDE_RAW_CITATIONS=1

docker-compose.yml

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,12 @@ services:
1414
--grobid-url http://grobid:8070/api
1515
environment:
1616
- SCIENCEBEAM_DOC_CONVERT_PROCESS_TIMEOUT=${SCIENCEBEAM_DOC_CONVERT_PROCESS_TIMEOUT}
17+
- SCIENCEBEAM_CONSOLIDATE_HEADER=${SCIENCEBEAM_CONSOLIDATE_HEADER}
18+
- SCIENCEBEAM_CONSOLIDATE_CITATIONS=${SCIENCEBEAM_CONSOLIDATE_CITATIONS}
19+
- SCIENCEBEAM_INCLUDE_RAW_AFFILIATIONS=${SCIENCEBEAM_INCLUDE_RAW_AFFILIATIONS}
20+
- SCIENCEBEAM_INCLUDE_RAW_CITATIONS=${SCIENCEBEAM_INCLUDE_RAW_CITATIONS}
1721
ports:
1822
- "8075:8075"
19-
networks:
20-
- default
21-
- grobid
2223

2324
sciencebeam-base-dev:
2425
build:
@@ -46,11 +47,3 @@ services:
4647
- JAVA_OPTS=-Xmx1g
4748
ports:
4849
- "8070:8070"
49-
networks:
50-
- grobid
51-
52-
networks:
53-
default:
54-
internal: false
55-
grobid:
56-
internal: true

sciencebeam/transformers/grobid_service.py

Lines changed: 76 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import os
12
from io import BytesIO
23
import logging
34
from functools import partial
5+
from typing import Dict
46

57
import requests
68

@@ -18,18 +20,80 @@ class GrobidApiPaths:
1820
PROCESS_FULL_TEXT_DOCUMENT = '/processFulltextDocument'
1921

2022

23+
class GrobidServiceConfigEnvVariables:
    """Names of the environment variables that can override the GROBID request flags."""

    # Each constant is the *name* of an environment variable, not its value.
    CONSOLIDATE_HEADER = 'SCIENCEBEAM_CONSOLIDATE_HEADER'
    CONSOLIDATE_CITATIONS = 'SCIENCEBEAM_CONSOLIDATE_CITATIONS'
    INCLUDE_RAW_AFFILIATIONS = 'SCIENCEBEAM_INCLUDE_RAW_AFFILIATIONS'
    INCLUDE_RAW_CITATIONS = 'SCIENCEBEAM_INCLUDE_RAW_CITATIONS'
28+
29+
2130
service_wrapper = GrobidServiceWrapper()
2231

2332

2433
def get_logger():
    """Return the logger registered under this module's name."""
    module_logger = logging.getLogger(__name__)
    return module_logger
2635

2736

37+
class GrobidServiceConfig:
    """Boolean flags that are sent along with a request to the GROBID service.

    Args:
        consolidate_header: value for the header consolidation flag
            (defaults to False).
        consolidate_citations: value for the citation consolidation flag
            (defaults to False).
        include_raw_affiliations: value for the raw affiliations flag
            (defaults to True).
        include_raw_citations: value for the raw citations flag
            (defaults to True).
    """

    def __init__(
            self,
            consolidate_header: bool = False,
            consolidate_citations: bool = False,
            include_raw_affiliations: bool = True,
            include_raw_citations: bool = True):
        self.consolidate_header = consolidate_header
        self.consolidate_citations = consolidate_citations
        self.include_raw_affiliations = include_raw_affiliations
        self.include_raw_citations = include_raw_citations

    def __repr__(self) -> str:
        # A readable representation makes log output and failed test
        # assertions show the actual flag values instead of an object id.
        return (
            '{}(consolidate_header={!r}, consolidate_citations={!r}, '
            'include_raw_affiliations={!r}, include_raw_citations={!r})'
        ).format(
            type(self).__name__,
            self.consolidate_header,
            self.consolidate_citations,
            self.include_raw_affiliations,
            self.include_raw_citations
        )
48+
49+
50+
def get_grobid_service_config() -> GrobidServiceConfig:
    """Build a GrobidServiceConfig from the process environment.

    Starts from the GrobidServiceConfig defaults and overrides every flag
    whose environment variable (see GrobidServiceConfigEnvVariables) holds a
    non-blank value. Values are compared case-insensitively after stripping
    surrounding whitespace: '1', 'true', 'yes' and 'on' enable the flag, any
    other non-blank value (e.g. '0') disables it.

    Returns:
        The resulting GrobidServiceConfig.
    """
    truthy_values = {'1', 'true', 'yes', 'on'}
    env_to_prop_map = {
        GrobidServiceConfigEnvVariables.CONSOLIDATE_HEADER: 'consolidate_header',
        GrobidServiceConfigEnvVariables.CONSOLIDATE_CITATIONS: 'consolidate_citations',
        GrobidServiceConfigEnvVariables.INCLUDE_RAW_AFFILIATIONS: 'include_raw_affiliations',
        GrobidServiceConfigEnvVariables.INCLUDE_RAW_CITATIONS: 'include_raw_citations'
    }
    config = GrobidServiceConfig()
    for env_name, prop_name in env_to_prop_map.items():
        # Normalise before comparing so that stray whitespace or a spelling
        # such as 'True' does not silently switch the flag off.
        env_value = (os.environ.get(env_name) or '').strip().lower()
        # Unset, empty and whitespace-only variables keep the default.
        if env_value:
            setattr(config, prop_name, env_value in truthy_values)
    return config
63+
64+
65+
def _get_bool_int_param(bool_value: bool) -> str:
    """Encode a flag as the string '1' when truthy and '0' otherwise."""
    if bool_value:
        return '1'
    return '0'
67+
68+
69+
def get_request_data_for_config(grobid_service_config: GrobidServiceConfig) -> Dict[str, str]:
    """Translate a GrobidServiceConfig into the form fields posted to GROBID.

    Args:
        grobid_service_config: the flags to encode. ``None`` is accepted and
            resolved via get_grobid_service_config(), because
            run_grobid_service defaults its config argument to ``None`` and
            forwards it here unchanged.

    Returns:
        A dict with the keys 'consolidateHeader', 'consolidateCitations',
        'includeRawAffiliations' and 'includeRawCitations', each mapped to
        '1' or '0'.
    """
    if grobid_service_config is None:
        # Without this fallback the attribute access below raises
        # AttributeError for callers that did not pass a config.
        grobid_service_config = get_grobid_service_config()
    field_to_attribute = (
        ('consolidateHeader', 'consolidate_header'),
        ('consolidateCitations', 'consolidate_citations'),
        ('includeRawAffiliations', 'include_raw_affiliations'),
        ('includeRawCitations', 'include_raw_citations')
    )
    return {
        field_name: _get_bool_int_param(getattr(grobid_service_config, attribute_name))
        for field_name, attribute_name in field_to_attribute
    }
84+
85+
2886
def start_service_if_not_running():
    """Delegate to the module-level service wrapper's start_service_if_not_running()."""
    # The wrapper's return value is intentionally not propagated.
    service_wrapper.start_service_if_not_running()
3088

3189

32-
def run_grobid_service(item, base_url, path, start_service=True, field_name=None):
90+
def run_grobid_service(
91+
item,
92+
base_url: str,
93+
path: str,
94+
grobid_service_config: GrobidServiceConfig = None,
95+
start_service: bool = True,
96+
field_name: str = None):
3397
"""
3498
Translates PDF content via the GROBID service.
3599
@@ -69,12 +133,7 @@ def run_grobid_service(item, base_url, path, start_service=True, field_name=None
69133
response = requests.post(
70134
url,
71135
files={'input': (filename, BytesIO(content))},
72-
data={
73-
'consolidateHeader': '0',
74-
'consolidateCitations': '0',
75-
'includeRawAffiliations': '1',
76-
'includeRawCitations': '1'
77-
}
136+
data=get_request_data_for_config(grobid_service_config)
78137
)
79138
response.raise_for_status()
80139
result_content = response.content
@@ -83,11 +142,19 @@ def run_grobid_service(item, base_url, path, start_service=True, field_name=None
83142
return result_content
84143

85144

86-
def grobid_service(base_url, path, start_service=True, field_name=None):
145+
def grobid_service(
        base_url: str,
        path: str,
        start_service: bool = True,
        field_name: str = None,
        grobid_service_config: GrobidServiceConfig = None):
    """Create a callable that runs run_grobid_service with the given settings bound.

    Args:
        base_url: forwarded to run_grobid_service as ``base_url``.
        path: forwarded to run_grobid_service as ``path``.
        start_service: forwarded to run_grobid_service as ``start_service``.
        field_name: forwarded to run_grobid_service as ``field_name``.
        grobid_service_config: request flags to use; when ``None`` the config
            is read once, at creation time, via get_grobid_service_config().

    Returns:
        A functools.partial of run_grobid_service that still expects the item
        to process.
    """
    # Compare against None explicitly: only a missing config should trigger
    # the environment lookup, independent of the object's truthiness.
    if grobid_service_config is None:
        grobid_service_config = get_grobid_service_config()
    return partial(
        run_grobid_service,
        base_url=base_url,
        path=path,
        start_service=start_service,
        field_name=field_name,
        grobid_service_config=grobid_service_config
    )

tests/transformers/grobid_service_test.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,14 @@
22

33
import pytest
44

5-
from sciencebeam.transformers.grobid_service import grobid_service as create_grobid_service
5+
from sciencebeam.transformers.grobid_service import (
6+
GrobidServiceConfigEnvVariables,
7+
GrobidServiceConfig,
8+
get_grobid_service_config,
9+
get_request_data_for_config,
10+
grobid_service as create_grobid_service
11+
)
12+
613

714
BASE_URL = 'http://grobid/api'
815
PATH_1 = '/path1'
@@ -21,6 +28,52 @@ def _mock_requests_post():
2128
yield requests_post
2229

2330

31+
@pytest.fixture(name='environ', autouse=True)
def _mock_environ():
    """Swap os.environ for an empty dict in every test and expose it as ``environ``."""
    with patch('os.environ', {}) as fake_environ:
        yield fake_environ
35+
36+
37+
@pytest.fixture(name='grobid_service_config')
def _grobid_service_config() -> GrobidServiceConfig:
    """Provide a GrobidServiceConfig constructed with its default arguments."""
    default_config = GrobidServiceConfig()
    return default_config
40+
41+
42+
class TestGetGrobidServiceConfig:
    """Tests for get_grobid_service_config against the patched environment."""

    def test_should_return_default_config(self):
        # Nothing is set in the (auto-patched, empty) environment.
        result = get_grobid_service_config()
        assert not result.consolidate_header
        assert not result.consolidate_citations
        assert result.include_raw_affiliations
        assert result.include_raw_citations

    def test_should_be_able_toggle_config(self, environ: dict):
        # Flip every flag away from its default value.
        environ.update({
            GrobidServiceConfigEnvVariables.CONSOLIDATE_HEADER: '1',
            GrobidServiceConfigEnvVariables.CONSOLIDATE_CITATIONS: '1',
            GrobidServiceConfigEnvVariables.INCLUDE_RAW_AFFILIATIONS: '0',
            GrobidServiceConfigEnvVariables.INCLUDE_RAW_CITATIONS: '0'
        })
        result = get_grobid_service_config()
        assert result.consolidate_header
        assert result.consolidate_citations
        assert not result.include_raw_affiliations
        assert not result.include_raw_citations
60+
61+
62+
class TestGetRequestDataForConfig:
    """Tests for get_request_data_for_config."""

    def test_should_generate_dict_for_default_config(
            self, grobid_service_config: GrobidServiceConfig):
        # Set the flags explicitly so the expectation does not depend on
        # the constructor defaults.
        grobid_service_config.consolidate_header = False
        grobid_service_config.consolidate_citations = False
        grobid_service_config.include_raw_affiliations = True
        grobid_service_config.include_raw_citations = True
        expected_request_data = {
            'consolidateHeader': '0',
            'consolidateCitations': '0',
            'includeRawAffiliations': '1',
            'includeRawCitations': '1'
        }
        actual_request_data = get_request_data_for_config(grobid_service_config)
        assert actual_request_data == expected_request_data
75+
76+
2477
class TestCreateGrobidService:
2578
def test_should_pass_url_and_data_as_file(self, requests_post):
2679
create_grobid_service(BASE_URL, PATH_1, start_service=False)(

0 commit comments

Comments
 (0)