Skip to content

Commit 064b9ee

Browse files
author
Marcin Kardas
committed
Use html5lib for HTML cleaning
Chrome DOM dumping was introduced as a cleaning method when htlatex was used for LaTeX-to-HTML conversion. LaTeXML gives much better results, and html5lib is 3 times faster than running the Chrome image.
1 parent 0b308c2 commit 064b9ee

File tree

1 file changed

+5
-20
lines changed

1 file changed

+5
-20
lines changed

sota_extractor2/helpers/latex_converter.py

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from docker.errors import ContainerError, ImageNotFound
33
from pathlib import Path
44
from tempfile import TemporaryDirectory
5+
from bs4 import BeautifulSoup
56

67
from sota_extractor2.errors import LatexConversionError
78

@@ -62,27 +63,11 @@ def latex2html(self, source_dir, output_dir, use_named_volumes=False):
6263
raise
6364

6465
# todo: check for errors
65-
66-
def clean_html(self, path, use_named_volumes=False):
66+
def clean_html(self, path):
6767
path = Path(path)
68-
69-
if use_named_volumes:
70-
index_path = path
71-
volumes = {
72-
"pwc_htmls": ro_bind("/data/arxiv/htmls")
73-
}
74-
else:
75-
index_path = "/files/index.html"
76-
volumes = {
77-
path.resolve(): ro_bind(index_path)
78-
}
79-
80-
command = ["timeout", "-s", "KILL", "20", "chromium-browser", "--headless",
81-
"--disable-gpu", "--disable-software-rasterizer", "--no-sandbox",
82-
"--timeout=30000", "--dump-dom", str(index_path)]
83-
data = self.client.containers.run("zenika/alpine-chrome:73", command, remove=True, entrypoint="",
84-
volumes=volumes)
85-
return data.decode('utf-8')
68+
with path.open("rb") as file:
69+
soup = BeautifulSoup(file, "html5lib")
70+
return str(soup)
8671

8772
def to_html(self, source_dir):
8873
with TemporaryDirectory() as output_dir:

0 commit comments

Comments
 (0)