|
2 | 2 | from docker.errors import ContainerError, ImageNotFound
|
3 | 3 | from pathlib import Path
|
4 | 4 | from tempfile import TemporaryDirectory
|
| 5 | +from bs4 import BeautifulSoup |
5 | 6 |
|
6 | 7 | from sota_extractor2.errors import LatexConversionError
|
7 | 8 |
|
@@ -62,27 +63,11 @@ def latex2html(self, source_dir, output_dir, use_named_volumes=False):
|
62 | 63 | raise
|
63 | 64 |
|
64 | 65 | # todo: check for errors
|
65 |
| - |
66 |
| - def clean_html(self, path, use_named_volumes=False): |
| 66 | + def clean_html(self, path): |
67 | 67 | path = Path(path)
|
68 |
| - |
69 |
| - if use_named_volumes: |
70 |
| - index_path = path |
71 |
| - volumes = { |
72 |
| - "pwc_htmls": ro_bind("/data/arxiv/htmls") |
73 |
| - } |
74 |
| - else: |
75 |
| - index_path = "/files/index.html" |
76 |
| - volumes = { |
77 |
| - path.resolve(): ro_bind(index_path) |
78 |
| - } |
79 |
| - |
80 |
| - command = ["timeout", "-s", "KILL", "20", "chromium-browser", "--headless", |
81 |
| - "--disable-gpu", "--disable-software-rasterizer", "--no-sandbox", |
82 |
| - "--timeout=30000", "--dump-dom", str(index_path)] |
83 |
| - data = self.client.containers.run("zenika/alpine-chrome:73", command, remove=True, entrypoint="", |
84 |
| - volumes=volumes) |
85 |
| - return data.decode('utf-8') |
| 68 | + with path.open("rb") as file: |
| 69 | + soup = BeautifulSoup(file, "html5lib") |
| 70 | + return str(soup) |
86 | 71 |
|
87 | 72 | def to_html(self, source_dir):
|
88 | 73 | with TemporaryDirectory() as output_dir:
|
|
0 commit comments