Skip to content

Commit d04709a

Browse files
committed
Improve template argument processing
1 parent 352d9e8 commit d04709a

File tree

4 files changed: 85 additions (+85) and 24 deletions (-24)

4 files changed: 85 additions (+85) and 24 deletions (-24)

mathosphere-core/src/main/java/com/formulasearchengine/mathosphere/mlp/contracts/TextAnnotatorMapper.java

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import com.formulasearchengine.mathosphere.mlp.text.MathConverter;
1313
import com.formulasearchengine.mathosphere.mlp.text.PosTagger;
1414
import com.formulasearchengine.mathosphere.mlp.text.WikiTextUtils;
15-
import com.formulasearchengine.mathosphere.mlp.text.WikidataLinkMap;
1615

1716
import org.apache.flink.api.common.functions.RichMapFunction;
1817
import org.apache.flink.configuration.Configuration;
@@ -27,21 +26,12 @@ public class TextAnnotatorMapper extends RichMapFunction<RawWikiDocument, Parsed
2726
private static final Logger LOGGER = LoggerFactory.getLogger(TextAnnotatorMapper.class);
2827

2928
private final BaseConfig config;
30-
private final String language;
31-
private final String model;
32-
private final WikidataLinkMap wl;
29+
3330

3431
private PosTagger posTagger;
3532

3633
public TextAnnotatorMapper(BaseConfig config) {
3734
this.config = config;
38-
this.language = config.getLanguage();
39-
this.model = config.getModel();
40-
if (config.getWikiDataFile() != null) {
41-
wl = new WikidataLinkMap(config.getWikiDataFile());
42-
} else {
43-
wl = null;
44-
}
4535
}
4636

4737
@Override
@@ -66,7 +56,7 @@ public ParsedWikiDocument parse(String wikitext, String title) {
6656
try {
6757
String cleanText;
6858
if (config.getUseTeXIdentifiers()) {
69-
MathConverter c = new MathConverter(wikitext, title, wl);
59+
MathConverter c = new MathConverter(wikitext, title, config);
7060
cleanText = c.getStrippedOutput();
7161
mathTags = c.getMathTags();
7262
links = c.getLinks();

mathosphere-core/src/main/java/com/formulasearchengine/mathosphere/mlp/text/MathConverter.java

Lines changed: 70 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
* limitations under the License.
1010
*/
1111

12+
import com.google.common.collect.Multiset;
13+
14+
import com.formulasearchengine.mathosphere.mlp.cli.BaseConfig;
1215
import com.formulasearchengine.mathosphere.mlp.contracts.TextExtractorMapper;
1316
import com.formulasearchengine.mathosphere.mlp.pojos.MathTag;
1417
import com.formulasearchengine.mathosphere.mlp.pojos.WikidataLink;
@@ -25,15 +28,53 @@
2528
import org.sweble.wikitext.engine.nodes.EngPage;
2629
import org.sweble.wikitext.engine.nodes.EngProcessedPage;
2730
import org.sweble.wikitext.engine.utils.DefaultConfigEnWp;
28-
import org.sweble.wikitext.parser.nodes.*;
31+
import org.sweble.wikitext.parser.nodes.WtBold;
2932
import org.sweble.wikitext.parser.nodes.WtContentNode.WtContentNodeImpl;
33+
import org.sweble.wikitext.parser.nodes.WtExternalLink;
34+
import org.sweble.wikitext.parser.nodes.WtHorizontalRule;
35+
import org.sweble.wikitext.parser.nodes.WtIllegalCodePoint;
36+
import org.sweble.wikitext.parser.nodes.WtImageLink;
37+
import org.sweble.wikitext.parser.nodes.WtInternalLink;
38+
import org.sweble.wikitext.parser.nodes.WtItalics;
39+
import org.sweble.wikitext.parser.nodes.WtListItem;
40+
import org.sweble.wikitext.parser.nodes.WtNewline;
41+
import org.sweble.wikitext.parser.nodes.WtNode;
42+
import org.sweble.wikitext.parser.nodes.WtNodeList;
43+
import org.sweble.wikitext.parser.nodes.WtOrderedList;
44+
import org.sweble.wikitext.parser.nodes.WtPageSwitch;
45+
import org.sweble.wikitext.parser.nodes.WtParagraph;
46+
import org.sweble.wikitext.parser.nodes.WtSection;
47+
import org.sweble.wikitext.parser.nodes.WtTable;
48+
import org.sweble.wikitext.parser.nodes.WtTableCaption;
49+
import org.sweble.wikitext.parser.nodes.WtTableCell;
50+
import org.sweble.wikitext.parser.nodes.WtTableHeader;
51+
import org.sweble.wikitext.parser.nodes.WtTableImplicitTableBody;
52+
import org.sweble.wikitext.parser.nodes.WtTableRow;
53+
import org.sweble.wikitext.parser.nodes.WtTagExtension;
54+
import org.sweble.wikitext.parser.nodes.WtTemplate;
55+
import org.sweble.wikitext.parser.nodes.WtTemplateArgument;
56+
import org.sweble.wikitext.parser.nodes.WtTemplateParameter;
57+
import org.sweble.wikitext.parser.nodes.WtText;
58+
import org.sweble.wikitext.parser.nodes.WtUnorderedList;
59+
import org.sweble.wikitext.parser.nodes.WtUrl;
60+
import org.sweble.wikitext.parser.nodes.WtWhitespace;
61+
import org.sweble.wikitext.parser.nodes.WtXmlCharRef;
62+
import org.sweble.wikitext.parser.nodes.WtXmlComment;
63+
import org.sweble.wikitext.parser.nodes.WtXmlElement;
64+
import org.sweble.wikitext.parser.nodes.WtXmlEntityRef;
3065
import org.sweble.wikitext.parser.parser.LinkTargetException;
66+
import org.xml.sax.SAXException;
3167

68+
import java.io.IOException;
3269
import java.util.ArrayList;
3370
import java.util.LinkedList;
3471
import java.util.List;
3572
import java.util.regex.Pattern;
3673

74+
import javax.xml.parsers.ParserConfigurationException;
75+
import javax.xml.transform.TransformerException;
76+
import javax.xml.xpath.XPathExpressionException;
77+
3778
/**
3879
* A visitor to convert an article AST into a pure text representation. To better understand the
3980
* visitor pattern as implemented by the Visitor class, please take a look at the following
@@ -74,20 +115,27 @@ public class MathConverter
74115
private boolean noWrap;
75116
private LinkedList<Integer> sections;
76117
private PageTitle pageTitle;
118+
private String texInfoUrl;
77119

78120
public MathConverter(String wikiText, String name) throws LinkTargetException, EngineException {
79121
pageTitle = PageTitle.make(config, name);
80122
PageId pageId = new PageId(pageTitle, -1);
81123
page = engine.postprocess(pageId, wikiText, null);
124+
texInfoUrl = (new BaseConfig()).getTexvcinfoUrl();
82125
}
83126

84127
public MathConverter(String wikiText) throws LinkTargetException, EngineException {
85128
this(wikiText, "noname");
86129
}
87130

88-
public MathConverter(String wikitext, String title, WikidataLinkMap wl) throws LinkTargetException, EngineException {
131+
public MathConverter(String wikitext, String title, BaseConfig config) throws LinkTargetException, EngineException {
89132
this(wikitext, title);
90-
this.wl = wl;
133+
if (config.getWikiDataFile() != null) {
134+
wl = new WikidataLinkMap(config.getWikiDataFile());
135+
} else {
136+
wl = null;
137+
}
138+
texInfoUrl = config.getTexvcinfoUrl();
91139
}
92140

93141
public List<MathTag> getMathTags() {
@@ -174,7 +222,7 @@ public void visit(WtItalics i) {
174222
write("\"");
175223
}
176224

177-
public boolean detectHiddenMath(WtContentNodeImpl i) {
225+
public boolean detectHiddenMath(WtNode i) {
178226
if (i.size() == 1 && i.get(0) instanceof WtText) {
179227
final String tex = getTex(i, false);
180228
if (tex != null) {
@@ -450,7 +498,9 @@ public String wiki2Tex(String content) {
450498
}
451499

452500
public void visit(WtTemplateArgument n) {
453-
iterate(n.getValue());
501+
if (!detectHiddenMath(n.getValue())) {
502+
iterate(n.getValue());
503+
}
454504
}
455505

456506
public void visit(WtTemplateParameter n) {
@@ -584,13 +634,23 @@ public List<WikidataLink> getLinks() {
584634
return links;
585635
}
586636

587-
private String getTex(WtContentNodeImpl i, boolean force) {
637+
private String getTex(WtNode i, boolean force) {
588638
if (i.get(0) instanceof WtText) {
589-
String content = ((WtText) i.get(0)).getContent();
590-
//content = TextExtractorMapper.unescape(content);
639+
String content = ((WtText) i.get(0)).getContent().trim();
640+
content = TextExtractorMapper.unescape(content);
591641
String tex = wiki2Tex(content);
592-
if (tex.length() > 0 && (content.length() == 1 ||
593-
(content.length() < 10 && tex.length() > content.length()))) {
642+
if (tex.length() > 0 && (content.length() == 1
643+
|| (content.length() < 100 && !content.equals(tex)))) {
644+
Multiset<String> idents;
645+
try {
646+
idents = TexInfo.getIdentifiers(tex, texInfoUrl);
647+
} catch (XPathExpressionException | ParserConfigurationException | IOException
648+
| SAXException | TransformerException ignored) {
649+
return null;
650+
}
651+
if (idents.size() == 0 && !force) {
652+
return null;
653+
}
594654
if (i instanceof WtBold) {
595655
tex = "\\mathbf{" + tex + "}";
596656
}

mathosphere-core/src/test/java/com/formulasearchengine/mathosphere/mlp/text/MathConverterTest.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,14 @@ public void testGo10() throws Exception {
138138
final String real = mathConverter.getOutput();
139139
assertThat(real, containsString("<math>a</math> x"));
140140
}
141+
@Test
142+
public void testGo11() throws Exception {
143+
String wikiText = "Let the coin tosses be represented by a sequence {{nowrap|1=''X''&lt;sub&gt;0&lt;/sub&gt;, ''X''&lt;sub&gt;1&lt;/sub&gt;, &amp;hellip;}}";
144+
wikiText = TextExtractorMapper.unescape(wikiText);
145+
final MathConverter mathConverter = new MathConverter(wikiText);
146+
final String real = mathConverter.getOutput();
147+
assertThat(real, containsString("<math>X_{0}"));
148+
}
141149

142150
@Test
143151
public void findFormulaFromWikiText() throws Exception {

mathosphere-core/src/test/java/com/formulasearchengine/mathosphere/mlp/text/PlaintextDocumentBuilderTest.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package com.formulasearchengine.mathosphere.mlp.text;
22

3+
import com.formulasearchengine.mathosphere.mlp.cli.BaseConfig;
4+
35
import org.junit.Test;
46

57
import static org.junit.Assert.assertEquals;
@@ -35,8 +37,9 @@ public void extractPlainText_new() throws Exception {
3537
+ "[[Erwin Schrödinger]]. "
3638
+ "In the [[Copenhagen interpretation|standard interpretation of quantum mechanics]], "
3739
+ "the wavefunction is the most complete description that can be given to a physical system.";
38-
WikidataLinkMap wl = new WikidataLinkMap(getClass().getResource("test-map-no-dup.csv").getFile());
39-
String actual = (new MathConverter(input, "hamiltonian", wl)).getOutput();
40+
BaseConfig c = new BaseConfig();
41+
c.setWikiDataFile(getClass().getResource("test-map-no-dup.csv").getFile());
42+
String actual = (new MathConverter(input, "hamiltonian", c)).getOutput();
4043
String expected = "In [[quantum mechanics]] , the \"Schrödinger equation\" is a " +
4144
"[[partial differential equation]] that describes how the LINK_Q230883 of a " +
4245
"[[physical system]] changes with [[time]] . It was formulated in late 1925, and published in" +

0 commit comments

Comments
 (0)