Commit 9b98cc7

Better detect a main document file

* Use some heuristics to better detect which .tex file to convert.
* Apply patches to LaTeXML (to be removed once they are merged to master and available via the engrafo Docker image).

1 parent 2881a62 · commit 9b98cc7

File tree

6 files changed (+359, −13 lines)

docker-latex2html.sh

Lines changed: 1 addition & 1 deletion
@@ -6,4 +6,4 @@ OUTPUT=$(realpath "$2") #~/arxiv/htmls/1701/1701.xyz.html
 OUTPUT_DIR=$(dirname "$OUTPUT") #~/arxiv/htmls/1701
 FILENAME=$(basename "$OUTPUT") #1701.xyz.html

-docker run --rm -v $PWD/latex2html.sh:/files/latex2html.sh:ro -v "$SOURCE_DIR":/files/ro-source:ro -v "$OUTPUT_DIR":/files/htmls arxivvanity/engrafo /files/latex2html.sh "$FILENAME"
+docker run --rm -v $PWD/latex2html.sh:/files/latex2html.sh:ro -v $PWD/guess_main.py:/files/guess_main.py:ro -v $PWD/patches:/files/patches:ro -v "$SOURCE_DIR":/files/ro-source:ro -v "$OUTPUT_DIR":/files/htmls arxivvanity/engrafo /files/latex2html.sh "$FILENAME"

guess_main.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
from pathlib import Path
import re
import sys
import codecs

doccls = re.compile(r"\s*\\documentclass")
docbeg = re.compile(r"\s*\\begin\s*\{\s*document\s*\}")
title = re.compile(r"\s*\\(icml)?title\s*\{(?P<title>[^%}]*)")

aux = re.compile(r"(rebuttal\s+|instructions\s+(for\s+\\confname|.*proceedings)|(supplementary|supplemental)\s+materials?|appendix|author\s+guidelines|ieeetran\.cls|formatting\s+instructions)")

def aux_title(t):
    t = t.strip().lower()
    return bool(aux.search(t))


def calc_priority(path):
    priority = 0
    if path.name.lower() == "ms.tex":
        return 30
    with codecs.open(path, 'r', encoding='utf8', errors='ignore') as f:
        for line in f:
            if doccls.match(line):
                priority += 10
                break
        for line in f:
            m = title.match(line)
            if m:
                priority += 5
                t = m["title"]
                if aux_title(t):
                    priority = 5
                break
    return priority


def guess_main(path):
    path = Path(path)
    files = sorted(path.glob("*.tex"), key=lambda p: p.stem.lower())
    if len(files) > 1:
        with_priority = [(f, calc_priority(f)) for f in files]
        with_priority = sorted(with_priority, key=lambda fp: fp[1], reverse=True)
        files = [fp[0] for fp in with_priority]

    return files[0] if len(files) else None

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print(f"Usage:\n\t{sys.argv[0]} DIR", file=sys.stderr)
        exit(1)
    main = guess_main(sys.argv[1])
    if not main:
        print("Unable to find any suitable tex file", file=sys.stderr)
        exit(1)
    else:
        print(main)
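
A minimal usage sketch of the heuristic (the directory layout and file names below are hypothetical, not part of the commit): ms.tex always wins with priority 30, a \documentclass adds 10, a \title adds 5, and a title matching the auxiliary patterns (rebuttal, supplementary material, formatting instructions, ...) caps the file at 5.

# Hypothetical arXiv source directory (illustration only):
#   sources/1701.00001/supplementary.tex -> \documentclass + \title{Supplementary Material} => priority 5
#   sources/1701.00001/paper.tex         -> \documentclass + a regular \title{...}          => priority 15
#   sources/1701.00001/intro.tex         -> no \documentclass, no \title                    => priority 0
from guess_main import guess_main

main_tex = guess_main("sources/1701.00001")
print(main_tex)  # expected: sources/1701.00001/paper.tex

The same selection is available from the command line as python3 guess_main.py DIR, which is how latex2html.sh invokes it below.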

latex2html.sh

Lines changed: 12 additions & 12 deletions
@@ -6,23 +6,23 @@ SOURCE_DIR="/files/source"
 OUTPUT_DIR="/files/htmls"

 cp -r "$RO_SOURCE_DIR" "$SOURCE_DIR"
+
+# turn tikzpicture instances into comments
 find "$SOURCE_DIR" -iname '*.tex' -print0 | xargs -0 sed -i \
     -e 's/\\begin{document}/\\usepackage{verbatim}\0/g' \
     -e 's/\\begin\(\[[^]]*\]\)\?{tikzpicture}/\\begin{comment}/g' \
     -e 's/\\end{tikzpicture}/\\end{comment}/g'

-if [ -f "$SOURCE_DIR/ms.tex" ]
-then
-    MAINTEX="$SOURCE_DIR/ms.tex"
-elif [ -f "$SOURCE_DIR/main.tex" ]
-then
-    MAINTEX="$SOURCE_DIR/main.tex"
-elif [ -f "$SOURCE_DIR/00_main.tex" ]
-then
-    MAINTEX="$SOURCE_DIR/00_main.tex"
-else
-    MAINTEX=$(find "$SOURCE_DIR" -maxdepth 1 -type f -iname "*.tex" -print0 | xargs -0 grep -l documentclass | head -1)
-fi
+# temporary fixes
+# https://github.com/brucemiller/LaTeXML/pull/1171
+# https://github.com/brucemiller/LaTeXML/pull/1173
+# https://github.com/brucemiller/LaTeXML/pull/1177
+for patch in /files/patches/*
+do
+    patch -i $patch -p 3 -d /usr/local/share/perl/5.28.1/LaTeXML
+done
+
+MAINTEX=$(python3 /files/guess_main.py "$SOURCE_DIR")
 timeout -s KILL 300 engrafo "$MAINTEX" /files/output

 cp /files/output/index.html "$OUTPUT_DIR/$OUTNAME"
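
For reference, the sed block near the top of the script wraps every tikzpicture environment in the verbatim package's comment environment so LaTeXML never has to typeset TikZ. A rough Python sketch of the same three substitutions, applied to a hypothetical snippet (illustration only, not part of the commit):

import re

# Hypothetical LaTeX fragment (not from the commit).
tex = r"""\begin{document}
\begin{tikzpicture}
  \draw (0,0) -- (1,1);
\end{tikzpicture}
\end{document}"""

# Mirror the three sed expressions from latex2html.sh.
tex = re.sub(r"\\begin\{document\}", r"\\usepackage{verbatim}\g<0>", tex)
tex = re.sub(r"\\begin(\[[^\]]*\])?\{tikzpicture\}", r"\\begin{comment}", tex)
tex = re.sub(r"\\end\{tikzpicture\}", r"\\end{comment}", tex)
print(tex)  # the tikzpicture body now sits inside \begin{comment}...\end{comment}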

patches/1171.patch

Lines changed: 188 additions & 0 deletions
@@ -0,0 +1,188 @@
From d715def1f4ddd18336e5e49b54baf0efd9acfb94 Mon Sep 17 00:00:00 2001
From: Deyan Ginev <[email protected]>
Date: Sun, 21 Jul 2019 16:14:17 -0400
Subject: [PATCH] neurips binding and reliably preload main article
 dependencies for bibliography post-processing

---
 MANIFEST | 1 +
 lib/LaTeXML/Package/neurips.sty.ltxml | 34 +++++++++++++++++++++++++++
 lib/LaTeXML/Post.pm | 22 +++++++++--------
 lib/LaTeXML/Post/MakeBibliography.pm | 11 +++++----
 4 files changed, 53 insertions(+), 15 deletions(-)
 create mode 100644 lib/LaTeXML/Package/neurips.sty.ltxml

diff --git a/MANIFEST b/MANIFEST
index f944d07aa..69b5bdd51 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -548,6 +548,7 @@ lib/LaTeXML/Package/multido.sty.ltxml
 lib/LaTeXML/Package/multirow.sty.ltxml
 lib/LaTeXML/Package/nameref.sty.ltxml
 lib/LaTeXML/Package/natbib.sty.ltxml
+lib/LaTeXML/Package/neurips.sty.ltxml
 lib/LaTeXML/Package/newcent.sty.ltxml
 lib/LaTeXML/Package/newfloat.sty.ltxml
 lib/LaTeXML/Package/newlfont.sty.ltxml
diff --git a/lib/LaTeXML/Package/neurips.sty.ltxml b/lib/LaTeXML/Package/neurips.sty.ltxml
new file mode 100644
index 000000000..b642e4dd3
--- /dev/null
+++ b/lib/LaTeXML/Package/neurips.sty.ltxml
@@ -0,0 +1,34 @@
+# -*- mode: Perl -*-
+# /=====================================================================\ #
+# | neurips_2019.sty | #
+# | Implementation for LaTeXML | #
+# |=====================================================================| #
+# | Part of LaTeXML: | #
+# | Public domain software, produced as part of work done by the | #
+# | United States Government & not subject to copyright in the US. | #
+# |---------------------------------------------------------------------| #
+# | Bruce Miller <[email protected]> #_# | #
+# | http://dlmf.nist.gov/LaTeXML/ (o o) | #
+# \=========================================================ooo==U==ooo=/ #
+package LaTeXML::Package::Pool;
+use strict;
+use warnings;
+use LaTeXML::Package;
+
+#======================================================================
+RequirePackage('natbib');
+RequirePackage('geometry');
+RequirePackage('lineno');
+# /--------------------------------------------------------------------\
+# | Drafted by texscan --stub neurips_2019.sty |
+# \--------------------------------------------------------------------/
+DefMacro('\AND', Tokens());
+DefMacro('\And', Tokens());
+DefMacro('\bottomfraction', Tokens());
+DefMacro('\patchAmsMathEnvironmentForLineno', Tokens());
+DefMacro('\patchBothAmsMathEnvironmentsForLineno', Tokens());
+DefMacroI('\subsubsubsection', undef, '\@startsection{subsubsubsection}{4}{}{}{}{}', locked => 1);
+DefMacro('\textfraction', Tokens());
+DefMacro('\topfraction', Tokens());
+#======================================================================
+1;
diff --git a/lib/LaTeXML/Post.pm b/lib/LaTeXML/Post.pm
index a1dc74c1c..ec12bf2a8 100644
--- a/lib/LaTeXML/Post.pm
+++ b/lib/LaTeXML/Post.pm
@@ -56,7 +56,7 @@ sub ProcessChain_internal {
 foreach my $doc (@docs) {
 local $LaTeXML::Post::DOCUMENT = $doc;
 if (my @nodes = grep { $_ } $processor->toProcess($doc)) { # If there are nodes to process
- my $n = scalar(@nodes);
+ my $n = scalar(@nodes);
 my $msg = join(' ', $processor->getName || '',
 $doc->siteRelativeDestination || '',
 ($n > 1 ? "$n to process" : 'processing'));
@@ -198,7 +198,7 @@ sub generateResourcePathname {
 my $subdir = $$self{resource_directory} || '';
 my $prefix = $$self{resource_prefix} || "x";
 my $counter = join('_', "_max", $subdir, $prefix, "counter_");
- my $n = $doc->cacheLookup($counter) || 0;
+ my $n = $doc->cacheLookup($counter) || 0;
 my $name = $prefix . ++$n;
 $doc->cacheStore($counter, $n);
 return pathname_make(dir => $subdir, name => $name, type => $type); }
@@ -218,11 +218,12 @@ sub find_documentclass_and_packages {
 $classoptions = $$entry{options} || 'onecolumn';
 $oldstyle = $$entry{oldstyle}; }
 elsif ($$entry{package}) {
- push(@packages, [$$entry{package}, $$entry{options} || '']); }
- }
+ push(@packages, [$$entry{package} . ".sty", $$entry{options} || '']); } }
 if (!$class) {
 Warn('expected', 'class', undef, "No document class found; using article");
 $class = 'article'; }
+ if ($class !~ /\.cls$/) {
+ $class = $class . ".cls"; }
 return ([$class, $classoptions, $oldstyle], @packages); }

 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -336,7 +337,7 @@ sub processNode {
 # XMath will be removed (LATER!), but mark its ids as reusable.
 $doc->preremoveNodes($xmath);
 if ($$self{parallel}) {
- my $primary = $self->convertNode($doc, $xmath);
+ my $primary = $self->convertNode($doc, $xmath);
 my @secondaries = ();
 foreach my $proc (@{ $$self{secondary_processors} }) {
 local $LaTeXML::Post::MATHPROCESSOR = $proc;
@@ -425,7 +426,7 @@ sub convertXMTextContent {
 my $tag = $doc->getQName($node);
 if ($tag eq 'ltx:XMath') {
 my $conversion = $self->convertNode($doc, $node);
- my $xml = $$conversion{xml};
+ my $xml = $$conversion{xml};
 # And if no xml ????
 push(@result, $self->outerWrapper($doc, $node, $xml)); }
 else {
@@ -516,7 +517,7 @@ sub associateNode {
 $document->generateNodeID($sourcenode, '', 1); } # but the ID is reusable
 if (my $sourceid = $sourcenode->getAttribute('fragid')) { # If source has ID
 my $nodeid = $currentnode->getAttribute('fragid') || $sourceid;
- my $id = $document->uniquifyID($nodeid, $self->IDSuffix);
+ my $id = $document->uniquifyID($nodeid, $self->IDSuffix);
 if ($isarray) {
 $$node[1]{'xml:id'} = $id; }
 else {
@@ -775,7 +776,7 @@ sub setDocument_internal {
 my ($tag, $attributes, @children) = @$root;
 my ($prefix, $localname) = $tag =~ /^(.*):(.*)$/;
 my $nsuri = $$self{namespaces}{$prefix};
- my $node = $$self{document}->createElementNS($nsuri, $localname);
+ my $node = $$self{document}->createElementNS($nsuri, $localname);
 $$self{document}->setDocumentElement($node);
 map { $$attributes{$_} && $node->setAttribute($_ => $$attributes{$_}) } keys %$attributes
 if $attributes;
@@ -927,7 +928,7 @@ sub idcheck {
 my %missing = ();
 foreach my $node ($self->findnodes("//*[\@xml:id]")) {
 my $id = $node->getAttribute('xml:id');
- $dups{$id} = 1 if $idcache{$id};
+ $dups{$id} = 1 if $idcache{$id};
 $idcache{$id} = 1; }
 foreach my $id (keys %{ $$self{idcache} }) {
 $missing{$id} = 1 unless $idcache{$id}; }
@@ -1181,13 +1182,14 @@ sub prependNodes {
 sub cloneNode {
 my ($self, $node, $idsuffix, %options) = @_;
 return $node unless ref $node;
+ return $node if ref $node eq 'ARRAY'; # Should we deep clone if we get an array? Just return for now
 my $copy = $node->cloneNode(1);
 my $nocache = $options{nocache};
 #### $idsuffix = '' unless defined $idsuffix;
 # Find all id's defined in the copy and change the id.
 my %idmap = ();
 foreach my $n ($self->findnodes('descendant-or-self::*[@xml:id]', $copy)) {
- my $id = $n->getAttribute('xml:id');
+ my $id = $n->getAttribute('xml:id');
 my $newid = $self->uniquifyID($id, $idsuffix);
 $idmap{$id} = $newid;
 $self->recordID($newid => $n) unless $nocache;
diff --git a/lib/LaTeXML/Post/MakeBibliography.pm b/lib/LaTeXML/Post/MakeBibliography.pm
index 37c70b92e..6bf6d96fc 100644
--- a/lib/LaTeXML/Post/MakeBibliography.pm
+++ b/lib/LaTeXML/Post/MakeBibliography.pm
@@ -162,13 +162,14 @@ sub convertBibliography {
 my ($self, $doc, $bib) = @_;
 require LaTeXML;
 require LaTeXML::Common::Config;
- my @packages =
- my @preload = ();
- # Might want/need to preload more (all?) packages, but at least do inputenc!
+ my @preload = (); # custom macros often used in e.g. howpublished field
+ # need to preload all packages used by the main article
 foreach my $po ($self->find_documentclass_and_packages($doc)) {
 my ($pkg, $options) = @$po;
- if ($pkg eq 'inputenc') {
- push(@preload, "[$options]$pkg"); } }
+ if ($options) {
+ push(@preload, "[$options]$pkg"); }
+ else {
+ push(@preload, "$pkg"); } }
 NoteProgress(" [Converting bibliography $bib ...");
 my $bib_config = LaTeXML::Common::Config->new(
 cache_key => 'BibTeX',

patches/1173.patch

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
From 6eeebce933599340b44a0d61d69ad409f6944d44 Mon Sep 17 00:00:00 2001
From: Deyan Ginev <[email protected]>
Date: Wed, 24 Jul 2019 12:49:40 -0400
Subject: [PATCH] avoid Mouth time-travel bug when preparing url from an XUntil
 context

---
 lib/LaTeXML/Package/hyperref.sty.ltxml | 9 ++++-----
 lib/LaTeXML/Package/url.sty.ltxml | 5 ++---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/lib/LaTeXML/Package/hyperref.sty.ltxml b/lib/LaTeXML/Package/hyperref.sty.ltxml
index 48a9af302..d07afc015 100644
--- a/lib/LaTeXML/Package/hyperref.sty.ltxml
+++ b/lib/LaTeXML/Package/hyperref.sty.ltxml
@@ -112,7 +112,7 @@ DefConstructor('\@add@PDF@RDFa@triples', sub {
 if (my $entry = ($pdfkey_property{$key})) {
 my ($property, $object, $datatype) = @$entry;
 my $value = LookupMapping('Hyperref_options', $key);
- my $node = $document->openElementAt($root, 'ltx:rdf',
+ my $node = $document->openElementAt($root, 'ltx:rdf',
 property => $property, $object => $value,
 ($datatype ? (datatype => $datatype) : ()));
 # Must do directly; $document->setAttribute omits empty attributes
@@ -136,17 +136,16 @@ DefMacro('\href Verbatim {}', '\@@Url\href{}{}{#1}{#2}');
 # Redefine \@url to sanitize the argument less
 DefMacro('\@Url Token', sub {
 my ($gullet, $cmd) = @_;
- my $mouth = $gullet->getMouth;
 my ($open, $close, $url);
 $open = $gullet->readToken;
 StartSemiverbatim('%');
 Let('~', T_OTHER('~')); # Needs special protection?
 if ($open->equals(T_BEGIN)) {
 $open = T_OTHER('{'); $close = T_OTHER('}');
- $url = $gullet->readBalanced(1); } # Expand as we go!
+ $url = $gullet->readBalanced(1); } # Expand as we go!
 else {
 $close = $open = T_OTHER($open->getString);
- $url = $mouth->readTokens($close); }
+ $url = $gullet->readUntil($close); }
 EndSemiverbatim();
 my @toks = grep { $_->getCatcode != CC_SPACE; } $url->unlist;
 # Identical with url's \@Url except, let CS's through!
@@ -212,7 +211,7 @@ DefConstructor('\autoref Semiverbatim',

 DefMacro('\lx@autorefnum@@{}', sub {
 my ($gullet, $type) = @_;
- my $type_s = ToString($type);
+ my $type_s = ToString($type);
 my $counter = LookupMapping('counter_for_type', $type_s) || $type_s;
 return Tokens(
 (LookupDefinition(T_CS('\\' . $type_s . 'autorefname'))
diff --git a/lib/LaTeXML/Package/url.sty.ltxml b/lib/LaTeXML/Package/url.sty.ltxml
index da6223359..c208a8dc5 100644
--- a/lib/LaTeXML/Package/url.sty.ltxml
+++ b/lib/LaTeXML/Package/url.sty.ltxml
@@ -46,16 +46,15 @@ DefMacro('\DeclareUrlCommand{}{}', '\def#1{\begingroup #2\@Url#1}');
 # In any case, we read the verbatim arg, and build a Whatsit for @@Url
 DefMacro('\@Url Token', sub {
 my ($gullet, $cmd) = @_;
- my $mouth = $gullet->getMouth;
 my ($open, $close, $url);
 StartSemiverbatim('%');
 $open = $gullet->readToken;
 if ($open->equals(T_BEGIN)) {
 $open = T_OTHER('{'); $close = T_OTHER('}');
- $url = $gullet->readBalanced; }
+ $url = $gullet->readBalanced; }
 else {
 $close = $open = T_OTHER($open->getString);
- $url = $mouth->readTokens($close); }
+ $url = $gullet->readUntil($close); }
 EndSemiverbatim();

 my @toks = grep { $_->getCatcode != CC_SPACE; } $url->unlist;

patches/1177.patch

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
From 4b3a98e90e790eccb18eb16636c783ec7dfceb3b Mon Sep 17 00:00:00 2001
From: Marcin Kardas <[email protected]>
Date: Sun, 28 Jul 2019 16:51:49 +0200
Subject: [PATCH] Use algorithmic instead of algorithmicx in icml*.sty
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All icml style files I've tested (2013-2019) require `algorithmic` package, but `algorithmicx` doesn’t define any `algorithmic` commands. [Here's](https://arxiv.org/pdf/1402.5766v1.pdf) an example of paper (using icml2014.sty) on which LaTeXML hangs during processing when `algorithmicx` is used.
---
 lib/LaTeXML/Package/icml_support.sty.ltxml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/LaTeXML/Package/icml_support.sty.ltxml b/lib/LaTeXML/Package/icml_support.sty.ltxml
index 176498a05..3696de4b7 100644
--- a/lib/LaTeXML/Package/icml_support.sty.ltxml
+++ b/lib/LaTeXML/Package/icml_support.sty.ltxml
@@ -20,7 +20,7 @@ RequirePackage('times');
 RequirePackage('fancyhdr');
 RequirePackage('color');
 RequirePackage('algorithm');
-RequirePackage('algorithmicx');
+RequirePackage('algorithmic');
 RequirePackage('natbib');
 # RequirePackage('eso-pic');
 # RequirePackage('forloop');
