Skip to content

Commit 4c7c3f8

Browse files
committed
Allow using named docker volumes
1 parent 9fc128a commit 4c7c3f8

File tree

2 files changed

+39
-15
lines changed

2 files changed

+39
-15
lines changed

latex2html.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
#!/usr/bin/env bash
22
OUTNAME="$1"
33
echo $OUTNAME
4-
RO_SOURCE_DIR="/files/ro-source"
4+
RO_SOURCE_DIR="${2:-/files/ro-source}"
55
SOURCE_DIR="/files/source"
6-
OUTPUT_DIR="/files/htmls"
6+
OUTPUT_DIR="${3:-/files/htmls}"
77

8+
mkdir -p /files
89
cp -r "$RO_SOURCE_DIR" "$SOURCE_DIR"
910

1011
# turn tikzpicture instances into comments

sota_extractor2/helpers/latex_converter.py

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,34 +18,57 @@ def __init__(self, base_path):
1818
self.client = docker.from_env()
1919
self.base_path = Path(base_path)
2020

21-
def latex2html(self, source_dir, output_dir):
21+
def latex2html(self, source_dir, output_dir, use_named_volumes=False):
2222
base = self.base_path
2323
source_dir = Path(source_dir)
2424
output_dir = Path(output_dir)
25+
scriptname = "/files/latex2html.sh"
26+
filename = "index.html"
27+
2528
volumes = {
2629
base / "latex2html.sh": ro_bind("/files/latex2html.sh"),
2730
base / "guess_main.py": ro_bind("/files/guess_main.py"), # todo: run guess_main outside of docker
28-
base / "patches": ro_bind("/files/patches"), # todo: see which patches can be dropped
29-
source_dir.resolve(): ro_bind("/files/ro-source"),
30-
output_dir.resolve(): rw_bind("/files/htmls")
31+
base / "patches": ro_bind("/files/patches") # todo: see which patches can be dropped
3132
}
3233

34+
# In case of a fully dockerized pipeline we use named volumes to share files between the steps.
35+
# This, however, requires us to mount specific volumes with all papers, not only the currently processed one.
36+
# (see https://github.com/moby/moby/issues/32582)
37+
if use_named_volumes:
38+
volumes.update({
39+
"pwc_unpacked_sources": ro_bind("/data/arxiv/unpacked_sources"),
40+
"pwc_htmls": rw_bind("/data/arxiv/htmls")
41+
})
42+
command = [scriptname, filename, str(source_dir), str(output_dir)]
43+
else:
44+
volumes.update({
45+
source_dir.resolve(): ro_bind("/files/ro-source"),
46+
output_dir.resolve(): rw_bind("/files/htmls")
47+
})
48+
command = [scriptname, filename]
49+
3350
output_dir.mkdir(parents=True, exist_ok=True)
34-
filename = "index.html"
35-
command = ["/files/latex2html.sh", filename]
3651
self.client.containers.run("arxivvanity/engrafo:b3db888fefa118eacf4f13566204b68ce100b3a6", command, remove=True, volumes=volumes)
3752

3853
# todo: check for errors
3954

40-
def clean_html(self, path):
55+
def clean_html(self, path, use_named_volumes=False):
4156
path = Path(path)
42-
volumes = {
43-
path.resolve(): ro_bind("/files/index.html"),
44-
}
4557

46-
command = "timeout -s KILL 20 chromium-browser --headless" \
47-
" --disable-gpu --disable-software-rasterizer --no-sandbox" \
48-
" --timeout=30000 --dump-dom /files/index.html"
58+
if use_named_volumes:
59+
index_path = path
60+
volumes = {
61+
"pwc_htmls": ro_bind("/data/arxiv/htmls")
62+
}
63+
else:
64+
index_path = "/files/index.html"
65+
volumes = {
66+
path.resolve(): ro_bind(index_path)
67+
}
68+
69+
command = ["timeout", "-s", "KILL", "20", "chromium-browser", "--headless",
70+
"--disable-gpu", "--disable-software-rasterizer", "--no-sandbox",
71+
"--timeout=30000", "--dump-dom", str(index_path)]
4972
data = self.client.containers.run("zenika/alpine-chrome:73", command, remove=True, entrypoint="",
5073
volumes=volumes)
5174
return data.decode('utf-8')

0 commit comments

Comments
 (0)