@@ -18,34 +18,57 @@ def __init__(self, base_path):
18
18
self .client = docker .from_env ()
19
19
self .base_path = Path (base_path )
20
20
21
- def latex2html (self , source_dir , output_dir ):
21
+ def latex2html (self , source_dir , output_dir , use_named_volumes = False ):
22
22
base = self .base_path
23
23
source_dir = Path (source_dir )
24
24
output_dir = Path (output_dir )
25
+ scriptname = "/files/latex2html.sh"
26
+ filename = "index.html"
27
+
25
28
volumes = {
26
29
base / "latex2html.sh" : ro_bind ("/files/latex2html.sh" ),
27
30
base / "guess_main.py" : ro_bind ("/files/guess_main.py" ), # todo: run guess_main outside of docker
28
- base / "patches" : ro_bind ("/files/patches" ), # todo: see which patches can be dropped
29
- source_dir .resolve (): ro_bind ("/files/ro-source" ),
30
- output_dir .resolve (): rw_bind ("/files/htmls" )
31
+ base / "patches" : ro_bind ("/files/patches" ) # todo: see which patches can be dropped
31
32
}
32
33
34
+ # In the case of a fully dockerized pipeline, we use named volumes to share files between the steps.
35
+ # This, however, requires us to mount specific volumes with all papers, not only the currently processed one.
36
+ # (see https://github.com/moby/moby/issues/32582)
37
+ if use_named_volumes :
38
+ volumes .update ({
39
+ "pwc_unpacked_sources" : ro_bind ("/data/arxiv/unpacked_sources" ),
40
+ "pwc_htmls" : rw_bind ("/data/arxiv/htmls" )
41
+ })
42
+ command = [scriptname , filename , str (source_dir ), str (output_dir )]
43
+ else :
44
+ volumes .update ({
45
+ source_dir .resolve (): ro_bind ("/files/ro-source" ),
46
+ output_dir .resolve (): rw_bind ("/files/htmls" )
47
+ })
48
+ command = [scriptname , filename ]
49
+
33
50
output_dir .mkdir (parents = True , exist_ok = True )
34
- filename = "index.html"
35
- command = ["/files/latex2html.sh" , filename ]
36
51
self .client .containers .run ("arxivvanity/engrafo:b3db888fefa118eacf4f13566204b68ce100b3a6" , command , remove = True , volumes = volumes )
37
52
38
53
# todo: check for errors
39
54
40
- def clean_html (self , path ):
55
+ def clean_html (self , path , use_named_volumes = False ):
41
56
path = Path (path )
42
- volumes = {
43
- path .resolve (): ro_bind ("/files/index.html" ),
44
- }
45
57
46
- command = "timeout -s KILL 20 chromium-browser --headless" \
47
- " --disable-gpu --disable-software-rasterizer --no-sandbox" \
48
- " --timeout=30000 --dump-dom /files/index.html"
58
+ if use_named_volumes :
59
+ index_path = path
60
+ volumes = {
61
+ "pwc_htmls" : ro_bind ("/data/arxiv/htmls" )
62
+ }
63
+ else :
64
+ index_path = "/files/index.html"
65
+ volumes = {
66
+ path .resolve (): ro_bind (index_path )
67
+ }
68
+
69
+ command = ["timeout" , "-s" , "KILL" , "20" , "chromium-browser" , "--headless" ,
70
+ "--disable-gpu" , "--disable-software-rasterizer" , "--no-sandbox" ,
71
+ "--timeout=30000" , "--dump-dom" , str (index_path )]
49
72
data = self .client .containers .run ("zenika/alpine-chrome:73" , command , remove = True , entrypoint = "" ,
50
73
volumes = volumes )
51
74
return data .decode ('utf-8' )
0 commit comments