1+ import os
12from io import BytesIO
23import logging
34from functools import partial
5+ from typing import Dict
46
57import requests
68
@@ -18,18 +20,80 @@ class GrobidApiPaths:
1820 PROCESS_FULL_TEXT_DOCUMENT = '/processFulltextDocument'
1921
2022
class GrobidServiceConfigEnvVariables:
    """
    Names of the environment variables read by get_grobid_service_config.

    Each constant is the *name* of an environment variable, not its value.
    """
    # consolidation switches
    CONSOLIDATE_HEADER = 'SCIENCEBEAM_CONSOLIDATE_HEADER'
    CONSOLIDATE_CITATIONS = 'SCIENCEBEAM_CONSOLIDATE_CITATIONS'
    # raw text switches
    INCLUDE_RAW_AFFILIATIONS = 'SCIENCEBEAM_INCLUDE_RAW_AFFILIATIONS'
    INCLUDE_RAW_CITATIONS = 'SCIENCEBEAM_INCLUDE_RAW_CITATIONS'
28+
29+
# Module-level GrobidServiceWrapper instance; start_service_if_not_running()
# below delegates to it.
2130service_wrapper = GrobidServiceWrapper ()
2231
2332
def get_logger():
    """Return the logger registered under this module's name."""
    module_logger = logging.getLogger(__name__)
    return module_logger
2635
2736
class GrobidServiceConfig:
    """
    Boolean options that are translated into GROBID request parameters.

    Attributes (all plain, mutable booleans):
        consolidate_header: defaults to False.
        consolidate_citations: defaults to False.
        include_raw_affiliations: defaults to True.
        include_raw_citations: defaults to True.
    """

    def __init__(
            self,
            consolidate_header: bool = False,
            consolidate_citations: bool = False,
            include_raw_affiliations: bool = True,
            include_raw_citations: bool = True):
        self.consolidate_header = consolidate_header
        self.consolidate_citations = consolidate_citations
        self.include_raw_affiliations = include_raw_affiliations
        self.include_raw_citations = include_raw_citations

    def __repr__(self) -> str:
        # The effective values may come from the environment, so make them
        # visible when the config is logged or shown in a debugger.
        return (
            '%s(consolidate_header=%r, consolidate_citations=%r,'
            ' include_raw_affiliations=%r, include_raw_citations=%r)'
        ) % (
            type(self).__name__,
            self.consolidate_header,
            self.consolidate_citations,
            self.include_raw_affiliations,
            self.include_raw_citations
        )
48+
49+
def get_grobid_service_config() -> GrobidServiceConfig:
    """
    Build a GrobidServiceConfig, applying overrides from environment variables.

    For each variable named in GrobidServiceConfigEnvVariables:
    - unset or empty: the constructor default of the matching option is kept;
    - otherwise: the option becomes True only when the value is exactly '1',
      and False for any other non-empty value.
    """
    env_variables = GrobidServiceConfigEnvVariables
    overrides = (
        (env_variables.CONSOLIDATE_HEADER, 'consolidate_header'),
        (env_variables.CONSOLIDATE_CITATIONS, 'consolidate_citations'),
        (env_variables.INCLUDE_RAW_AFFILIATIONS, 'include_raw_affiliations'),
        (env_variables.INCLUDE_RAW_CITATIONS, 'include_raw_citations'),
    )
    service_config = GrobidServiceConfig()
    for variable_name, attribute_name in overrides:
        raw_value = os.environ.get(variable_name)
        if not raw_value:
            # not configured: leave the default in place
            continue
        setattr(service_config, attribute_name, raw_value == '1')
    return service_config
63+
64+
def _get_bool_int_param(bool_value: bool) -> str:
    """Return '1' for a truthy value and '0' otherwise."""
    return '1' if bool_value else '0'


def get_request_data_for_config(
        grobid_service_config: GrobidServiceConfig = None) -> Dict[str, str]:
    """
    Translate a service config into the form fields posted to GROBID.

    Args:
        grobid_service_config: object exposing the boolean attributes
            consolidate_header, consolidate_citations,
            include_raw_affiliations and include_raw_citations.
            When None, the config is resolved via get_grobid_service_config().

    Returns:
        A dict with the keys 'consolidateHeader', 'consolidateCitations',
        'includeRawAffiliations' and 'includeRawCitations', each mapped to
        '1' or '0'.
    """
    if grobid_service_config is None:
        # run_grobid_service defaults its config argument to None and hands it
        # straight to this function; resolve it the same way grobid_service
        # does instead of failing with an AttributeError on None.
        grobid_service_config = get_grobid_service_config()
    return {
        'consolidateHeader': _get_bool_int_param(
            grobid_service_config.consolidate_header
        ),
        'consolidateCitations': _get_bool_int_param(
            grobid_service_config.consolidate_citations
        ),
        'includeRawAffiliations': _get_bool_int_param(
            grobid_service_config.include_raw_affiliations
        ),
        'includeRawCitations': _get_bool_int_param(
            grobid_service_config.include_raw_citations
        )
    }
84+
85+
def start_service_if_not_running():
    """Delegate to start_service_if_not_running() of the module-level service_wrapper."""
    ensure_started = service_wrapper.start_service_if_not_running
    ensure_started()
3088
3189
32- def run_grobid_service (item , base_url , path , start_service = True , field_name = None ):
90+ def run_grobid_service (
91+ item ,
92+ base_url : str ,
93+ path : str ,
94+ grobid_service_config : GrobidServiceConfig = None ,
95+ start_service : bool = True ,
96+ field_name : str = None ):
3397 """
3498 Translates PDF content via the GROBID service.
3599
@@ -69,12 +133,7 @@ def run_grobid_service(item, base_url, path, start_service=True, field_name=None
69133 response = requests .post (
70134 url ,
71135 files = {'input' : (filename , BytesIO (content ))},
72- data = {
73- 'consolidateHeader' : '0' ,
74- 'consolidateCitations' : '0' ,
75- 'includeRawAffiliations' : '1' ,
76- 'includeRawCitations' : '1'
77- }
136+ data = get_request_data_for_config (grobid_service_config )
78137 )
79138 response .raise_for_status ()
80139 result_content = response .content
@@ -83,11 +142,19 @@ def run_grobid_service(item, base_url, path, start_service=True, field_name=None
83142 return result_content
84143
85144
def grobid_service(
        base_url: str,
        path: str,
        start_service: bool = True,
        field_name: str = None,
        grobid_service_config: GrobidServiceConfig = None):
    """
    Return run_grobid_service with every argument except the item pre-bound.

    When no (truthy) grobid_service_config is passed, the config is obtained
    from get_grobid_service_config() once, at the time this function is
    called, and reused by the returned callable.
    """
    effective_config = grobid_service_config or get_grobid_service_config()
    bound_arguments = dict(
        base_url=base_url,
        path=path,
        start_service=start_service,
        field_name=field_name,
        grobid_service_config=effective_config
    )
    return partial(run_grobid_service, **bound_arguments)
0 commit comments