Skip to content

Commit ec359cf

Browse files
authored
fixed libreoffice memory issue (#352)
* added support for accept parameter to shortcut pipeline
* yaml linting
* added script to "load" test docx to pdf conversion
* added minimal docx document
* added end to end tests
* waiting for sciencebeam and grobid
* added separate build-and-start
* added script to load test in parallel
* close document
* linting
* switched to using soffice.bin directly
* added max_uptime config
* added get_uptime to BackgroundProcess
* implemented max_uptime
* added SCIENCEBEAM_DOC_CONVERT_ENABLE_DEBUG
1 parent 966f719 commit ec359cf

File tree

20 files changed

+568
-51
lines changed

20 files changed

+568
-51
lines changed

Dockerfile

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -49,10 +49,13 @@ RUN useradd -ms /bin/bash sciencebeam
4949
USER sciencebeam
5050
ENV HOME=/home/sciencebeam
5151

52-
# set and check UNO_PYTHON_PATH
52+
# set and check UNO_PYTHON_PATH and UNO_OFFICE_BINARY_PATH
5353
ENV UNO_PYTHON_PATH=python3.7
54+
ENV UNO_OFFICE_BINARY_PATH=/usr/lib/libreoffice/program/soffice.bin
5455
RUN ${UNO_PYTHON_PATH} -c 'import uno, unohelper' \
55-
&& echo "UNO_PYTHON_PATH: ${UNO_PYTHON_PATH}"
56+
&& echo "UNO_PYTHON_PATH: ${UNO_PYTHON_PATH}" \
57+
&& ls -l ${UNO_OFFICE_BINARY_PATH} \
58+
&& echo "UNO_OFFICE_BINARY_PATH: ${UNO_OFFICE_BINARY_PATH}"
5659

5760
# labels
5861
LABEL org.opencontainers.image.source="https://github.com/elifesciences/sciencebeam"

Jenkinsfile

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,14 @@ elifePipeline {
2828
}
2929
}
3030

31+
stage 'End-to-end tests', {
32+
try {
33+
sh "make IMAGE_TAG=${commit} NO_BUILD=y ci-end-to-end-test"
34+
} finally {
35+
sh "make ci-clean"
36+
}
37+
}
38+
3139
elifeMainlineOnly {
3240
stage 'Merge to master', {
3341
elifeGitMoveToBranch commit, 'master'

Makefile

Lines changed: 43 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,10 @@ PYTHON = $(VENV)/bin/python
88

99
RUN_DEV = $(DOCKER_COMPOSE) run --rm sciencebeam-dev
1010

11+
SCIENCEBEAM_PORT = 8075
12+
CONVERT_API_URL = http://localhost:$(SCIENCEBEAM_PORT)/api/convert
13+
EXAMPLE_DOCUMENT = test-data/minimal-office-open.docx
14+
1115
NO_BUILD =
1216
ARGS =
1317

@@ -91,16 +95,50 @@ shell-dev: build-dev
9195
$(RUN_DEV) bash
9296

9397

94-
start:
98+
build-and-start:
9599
$(DOCKER_COMPOSE) up -d --build grobid sciencebeam
96100

97101

102+
start:
103+
$(DOCKER_COMPOSE) up -d grobid sciencebeam
104+
105+
98106
start-doc-to-pdf:
99107
$(DOCKER_COMPOSE) build sciencebeam
100108
$(DOCKER_COMPOSE) run --rm --no-deps -p 8075:8075 sciencebeam \
101109
python -m sciencebeam.server --host=0.0.0.0 --port=8075 --pipeline=doc_to_pdf $(ARGS)
102110

103111

112+
convert-example-document:
113+
curl --fail --show-error \
114+
--form "file=@$(EXAMPLE_DOCUMENT);filename=$(EXAMPLE_DOCUMENT)" \
115+
--silent "$(CONVERT_API_URL)" \
116+
> /dev/null
117+
118+
119+
wait-for-sciencebeam:
120+
$(DOCKER_COMPOSE) run --rm wait-for-it \
121+
"sciencebeam:$(SCIENCEBEAM_PORT)" \
122+
--timeout=10 \
123+
--strict \
124+
-- echo "ScienceBeam is up"
125+
126+
127+
wait-for-grobid:
128+
$(DOCKER_COMPOSE) run --rm wait-for-it \
129+
"grobid:8070" \
130+
--timeout=10 \
131+
--strict \
132+
-- echo "GROBID is up"
133+
134+
135+
end-to-end-test:
136+
$(MAKE) start
137+
$(MAKE) wait-for-sciencebeam wait-for-grobid
138+
$(MAKE) convert-example-document
139+
$(MAKE) stop
140+
141+
104142
stop:
105143
$(DOCKER_COMPOSE) down
106144

@@ -117,5 +155,9 @@ ci-test:
117155
$(MAKE) DOCKER_COMPOSE="$(DOCKER_COMPOSE_CI)" test
118156

119157

158+
ci-end-to-end-test:
159+
$(MAKE) DOCKER_COMPOSE="$(DOCKER_COMPOSE_CI)" end-to-end-test
160+
161+
120162
ci-clean:
121163
$(DOCKER_COMPOSE_CI) down -v

app-defaults.cfg

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,5 +15,11 @@ max_concurrent_threads = 10
1515

1616
[doc_convert]
1717
stop_listener_on_error = true
18+
19+
# 5 minutes
1820
process_timeout = 300
21+
22+
# 24 hours
23+
max_uptime = 86400
24+
1925
enable_debug = false

docker-compose.yml

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ services:
2828
args:
2929
commit: ${IMAGE_TAG}
3030
version: ${VERSION}
31-
install_dev: y
31+
install_dev: "y"
3232
image: elifesciences/sciencebeam-base-dev:${IMAGE_TAG}
3333

3434
sciencebeam-dev:
@@ -47,3 +47,7 @@ services:
4747
- JAVA_OPTS=-Xmx1g
4848
ports:
4949
- "8070:8070"
50+
51+
52+
wait-for-it:
53+
image: willwill/wait-for-it

sciencebeam/pipeline_runners/simple_pipeline_runner.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import logging
2-
from typing import List # pylint: disable=unused-import
2+
from typing import List, Set
33

4-
from sciencebeam.pipelines import PipelineStep # pylint: disable=unused-import
4+
from sciencebeam.pipelines import PipelineStep
55
from sciencebeam.pipelines import (
66
get_pipeline_for_configuration_and_args,
77
add_pipeline_args,
@@ -20,8 +20,7 @@ def __init__(self, data_type):
2020

2121

2222
class SimplePipelineRunner:
23-
def __init__(self, steps):
24-
# type: (List[PipelineStep])
23+
def __init__(self, steps: List[PipelineStep]):
2524
LOGGER.debug('creating pipeline with steps: %s', steps)
2625
self._steps = steps
2726

@@ -34,6 +33,7 @@ def get_supported_types(self):
3433

3534
def convert(
3635
self, content: str, filename: str, data_type: str,
36+
accept_types: Set[str] = None,
3737
includes=None,
3838
context: dict = None) -> dict:
3939
current_item = {
@@ -47,9 +47,15 @@ def convert(
4747
num_processed = 0
4848
for step in self._steps:
4949
data_type = current_item['type']
50+
if accept_types and data_type in accept_types:
51+
LOGGER.debug(
52+
'skipping step (type %r already in accept types: %r): %s',
53+
data_type, accept_types, step
54+
)
55+
continue
5056
if data_type not in step.get_supported_types():
5157
LOGGER.debug(
52-
'skipping step (type "%s" not supported): %s', data_type, step
58+
'skipping step (type %r not supported): %s', data_type, step
5359
)
5460
continue
5561
LOGGER.debug(

sciencebeam/server/blueprints/api.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,8 @@ def _check_max_concurrent_requests(self):
6969
def _do_convert(self):
7070
data_type = None
7171
includes = parse_includes(request.args.get('includes'))
72+
accept_types = set(request.accept_mimetypes.values())
73+
LOGGER.info('accept_types: %s', accept_types)
7274
if not request.files:
7375
data_type = request.mimetype
7476
filename = request.args.get('filename')
@@ -114,12 +116,13 @@ def _do_convert(self):
114116
conversion_result = self.pipeline_runner.convert(
115117
content=content, filename=filename, data_type=data_type,
116118
includes=includes,
117-
context=context
119+
context=context,
120+
accept_types=accept_types
118121
)
119122
response_content = conversion_result['content']
120123
response_type = conversion_result['type']
121124
LOGGER.debug(
122-
'response_content: %s (%s)',
125+
'response_content: %s bytes (%s)',
123126
len(response_content), response_type
124127
)
125128
if response_type in {MimeTypes.TEI_XML, MimeTypes.JATS_XML}:

sciencebeam/transformers/convert_doc.py

Lines changed: 22 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import logging
22
import os
3+
from distutils.util import strtobool
34
from backports.tempfile import TemporaryDirectory
45

56
from sciencebeam.config.app_config import get_app_config
@@ -23,14 +24,18 @@
2324
class AppConfigOptions:
2425
STOP_LISTENER_ON_ERROR = 'stop_listener_on_error'
2526
PROCESS_TIMEOUT = 'process_timeout'
27+
MAX_UPTIME = 'max_uptime'
2628
ENABLE_DEBUG = 'enable_debug'
2729

2830

2931
class EnvironmentVariables:
3032
DOC_CONVERT_PROCESS_TIMEOUT = 'SCIENCEBEAM_DOC_CONVERT_PROCESS_TIMEOUT'
33+
DOC_CONVERT_MAX_UPTIME = 'SCIENCEBEAM_DOC_CONVERT_MAX_UPTIME'
34+
DOC_CONVERT_ENABLE_DEBUG = 'SCIENCEBEAM_DOC_CONVERT_ENABLE_DEBUG'
3135

3236

33-
DEFAULT_DOC_CONVERT_PROCESS_TIMEOUT = 5 * 60
37+
DEFAULT_DOC_CONVERT_PROCESS_TIMEOUT = 5 * 60 # 5 minutes
38+
DEFAULT_DOC_CONVERT_MAX_UPTIME = 24 * 60 * 60 # 24 hours
3439

3540

3641
DEFAULT_CONFIGURATION = dict(
@@ -56,15 +61,28 @@ def _get_default_config():
5661
DOC_CONVERT_SECTION_NAME, AppConfigOptions.PROCESS_TIMEOUT,
5762
fallback=DEFAULT_DOC_CONVERT_PROCESS_TIMEOUT
5863
)
64+
max_uptime = os.environ.get(EnvironmentVariables.DOC_CONVERT_MAX_UPTIME)
65+
if not max_uptime:
66+
max_uptime = app_config.getint(
67+
DOC_CONVERT_SECTION_NAME, AppConfigOptions.MAX_UPTIME,
68+
fallback=DEFAULT_DOC_CONVERT_MAX_UPTIME
69+
)
70+
enable_debug = os.environ.get(EnvironmentVariables.DOC_CONVERT_ENABLE_DEBUG)
71+
if enable_debug:
72+
enable_debug = bool(strtobool(enable_debug))
73+
else:
74+
enable_debug = app_config.getboolean(
75+
DOC_CONVERT_SECTION_NAME, AppConfigOptions.ENABLE_DEBUG,
76+
fallback=DEFAULT_CONFIGURATION['enable_debug']
77+
)
5978
config = {
6079
**config,
6180
'process_timeout': int(process_timeout),
81+
'max_uptime': int(max_uptime),
6282
'stop_listener_on_error': app_config.getboolean(
6383
DOC_CONVERT_SECTION_NAME, AppConfigOptions.STOP_LISTENER_ON_ERROR
6484
),
65-
'enable_debug': app_config.getboolean(
66-
DOC_CONVERT_SECTION_NAME, AppConfigOptions.ENABLE_DEBUG
67-
)
85+
'enable_debug': enable_debug
6886
}
6987
return config
7088

sciencebeam/transformers/doc_converter_wrapper.py

Lines changed: 60 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import logging
22
import os
33
import socket
4+
import time
45
from contextlib import closing
56
from threading import Lock, current_thread
67

@@ -85,6 +86,21 @@ def __init__(self, port: int, host: str = '127.0.0.1', connect_timeout: int = 10
8586
self.host = host
8687
self.connect_timeout = connect_timeout
8788

89+
def __repr__(self):
90+
return (
91+
'{type_name}('
92+
'port={self.port}'
93+
', host={self.host}'
94+
', connect_timeout={self.connect_timeout}'
95+
', command={command}'
96+
', process={self.process}'
97+
')'
98+
).format(
99+
type_name=type(self).__name__,
100+
self=self,
101+
command=repr(self.command)
102+
)
103+
88104
def is_alive(self):
89105
if not self.is_running():
90106
return False
@@ -94,15 +110,36 @@ def is_alive(self):
94110
return True
95111
return False
96112

97-
def start_and_check_alive(self, **kwargs):
113+
def wait_for_is_alive(self, timeout: float) -> bool:
114+
start_time = time.monotonic()
115+
while not self.is_alive():
116+
if not self.is_running():
117+
return False
118+
if time.monotonic() - start_time >= timeout:
119+
return False
120+
time.sleep(0.5)
121+
return True
122+
123+
def start_and_check_alive(self, timeout=10, **kwargs):
98124
super().start(**kwargs)
99-
if not self.is_alive():
100-
self.stop()
101-
raise ConnectionError('failed to start listener (unable to connect)')
125+
if self.wait_for_is_alive(timeout=timeout):
126+
return
127+
self.stop_if_running()
128+
if self.process.returncode == 81:
129+
# see https://bugs.documentfoundation.org/show_bug.cgi?id=107912
130+
# "headless firstrun crashes (exit code 81)"
131+
LOGGER.info('detected first-run error code 81, re-trying..')
132+
self.start_and_check_alive(timeout=timeout, **kwargs)
133+
return
134+
raise ConnectionError('failed to start listener (unable to connect)')
102135

103-
def start_listener_if_not_running(self, **kwargs):
136+
def start_listener_if_not_running(self, max_uptime: float = None, **kwargs):
104137
if self.is_alive():
105-
return
138+
uptime = self.get_uptime()
139+
if not max_uptime or uptime <= max_uptime:
140+
return
141+
LOGGER.info('stopping listener, exceeded max uptime: %.3f > %.3f', uptime, max_uptime)
142+
self.stop()
106143
self.start_and_check_alive(**kwargs)
107144

108145

@@ -114,19 +151,33 @@ def __init__(
114151
no_launch: bool = True,
115152
keep_listener_running: bool = True,
116153
process_timeout: int = None,
154+
max_uptime: float = 10,
117155
stop_listener_on_error: bool = True):
118156
self.port = port
119157
self.enable_debug = enable_debug
120158
self.no_launch = no_launch
121159
self.keep_listener_running = keep_listener_running
122160
self.process_timeout = process_timeout
161+
self.max_uptime = max_uptime
123162
self.stop_listener_on_error = stop_listener_on_error
124163
self._listener_process = ListenerProcess(port=port)
125164
self._lock = Lock()
126165
self._concurrent_count = 0
127166

167+
def __repr__(self):
168+
return (
169+
'{type_name}('
170+
'port={self.port}'
171+
', keep_listener_running={self.keep_listener_running}'
172+
', _listener_process={self._listener_process}'
173+
')'
174+
).format(
175+
type_name=type(self).__name__,
176+
self=self
177+
)
178+
128179
def start_listener_if_not_running(self):
129-
self._listener_process.start_if_not_running()
180+
self._listener_process.start_listener_if_not_running(max_uptime=self.max_uptime)
130181

131182
def stop_listener_if_running(self):
132183
self._listener_process.stop_if_running()
@@ -136,7 +187,8 @@ def _do_convert(
136187
remove_line_no: bool = True,
137188
remove_header_footer: bool = True,
138189
remove_redline: bool = True):
139-
self.start_listener_if_not_running()
190+
if self.no_launch:
191+
self.start_listener_if_not_running()
140192

141193
temp_target_filename = change_ext(
142194
temp_source_filename, None, '-output.%s' % output_type

0 commit comments

Comments
 (0)