|
9 | 9 | * limitations under the License. |
10 | 10 | */ |
11 | 11 |
|
| 12 | +import com.google.common.collect.Multiset; |
| 13 | + |
| 14 | +import com.formulasearchengine.mathosphere.mlp.cli.BaseConfig; |
12 | 15 | import com.formulasearchengine.mathosphere.mlp.contracts.TextExtractorMapper; |
13 | 16 | import com.formulasearchengine.mathosphere.mlp.pojos.MathTag; |
14 | 17 | import com.formulasearchengine.mathosphere.mlp.pojos.WikidataLink; |
|
25 | 28 | import org.sweble.wikitext.engine.nodes.EngPage; |
26 | 29 | import org.sweble.wikitext.engine.nodes.EngProcessedPage; |
27 | 30 | import org.sweble.wikitext.engine.utils.DefaultConfigEnWp; |
28 | | -import org.sweble.wikitext.parser.nodes.*; |
| 31 | +import org.sweble.wikitext.parser.nodes.WtBold; |
29 | 32 | import org.sweble.wikitext.parser.nodes.WtContentNode.WtContentNodeImpl; |
| 33 | +import org.sweble.wikitext.parser.nodes.WtExternalLink; |
| 34 | +import org.sweble.wikitext.parser.nodes.WtHorizontalRule; |
| 35 | +import org.sweble.wikitext.parser.nodes.WtIllegalCodePoint; |
| 36 | +import org.sweble.wikitext.parser.nodes.WtImageLink; |
| 37 | +import org.sweble.wikitext.parser.nodes.WtInternalLink; |
| 38 | +import org.sweble.wikitext.parser.nodes.WtItalics; |
| 39 | +import org.sweble.wikitext.parser.nodes.WtListItem; |
| 40 | +import org.sweble.wikitext.parser.nodes.WtNewline; |
| 41 | +import org.sweble.wikitext.parser.nodes.WtNode; |
| 42 | +import org.sweble.wikitext.parser.nodes.WtNodeList; |
| 43 | +import org.sweble.wikitext.parser.nodes.WtOrderedList; |
| 44 | +import org.sweble.wikitext.parser.nodes.WtPageSwitch; |
| 45 | +import org.sweble.wikitext.parser.nodes.WtParagraph; |
| 46 | +import org.sweble.wikitext.parser.nodes.WtSection; |
| 47 | +import org.sweble.wikitext.parser.nodes.WtTable; |
| 48 | +import org.sweble.wikitext.parser.nodes.WtTableCaption; |
| 49 | +import org.sweble.wikitext.parser.nodes.WtTableCell; |
| 50 | +import org.sweble.wikitext.parser.nodes.WtTableHeader; |
| 51 | +import org.sweble.wikitext.parser.nodes.WtTableImplicitTableBody; |
| 52 | +import org.sweble.wikitext.parser.nodes.WtTableRow; |
| 53 | +import org.sweble.wikitext.parser.nodes.WtTagExtension; |
| 54 | +import org.sweble.wikitext.parser.nodes.WtTemplate; |
| 55 | +import org.sweble.wikitext.parser.nodes.WtTemplateArgument; |
| 56 | +import org.sweble.wikitext.parser.nodes.WtTemplateParameter; |
| 57 | +import org.sweble.wikitext.parser.nodes.WtText; |
| 58 | +import org.sweble.wikitext.parser.nodes.WtUnorderedList; |
| 59 | +import org.sweble.wikitext.parser.nodes.WtUrl; |
| 60 | +import org.sweble.wikitext.parser.nodes.WtWhitespace; |
| 61 | +import org.sweble.wikitext.parser.nodes.WtXmlCharRef; |
| 62 | +import org.sweble.wikitext.parser.nodes.WtXmlComment; |
| 63 | +import org.sweble.wikitext.parser.nodes.WtXmlElement; |
| 64 | +import org.sweble.wikitext.parser.nodes.WtXmlEntityRef; |
30 | 65 | import org.sweble.wikitext.parser.parser.LinkTargetException; |
| 66 | +import org.xml.sax.SAXException; |
31 | 67 |
|
| 68 | +import java.io.IOException; |
32 | 69 | import java.util.ArrayList; |
33 | 70 | import java.util.LinkedList; |
34 | 71 | import java.util.List; |
35 | 72 | import java.util.regex.Pattern; |
36 | 73 |
|
| 74 | +import javax.xml.parsers.ParserConfigurationException; |
| 75 | +import javax.xml.transform.TransformerException; |
| 76 | +import javax.xml.xpath.XPathExpressionException; |
| 77 | + |
37 | 78 | /** |
38 | 79 | * A visitor to convert an article AST into a pure text representation. To better understand the |
39 | 80 | * visitor pattern as implemented by the Visitor class, please take a look at the following |
@@ -74,20 +115,27 @@ public class MathConverter |
74 | 115 | private boolean noWrap; |
75 | 116 | private LinkedList<Integer> sections; |
76 | 117 | private PageTitle pageTitle; |
| 118 | + private String texInfoUrl; |
77 | 119 |
|
78 | 120 | public MathConverter(String wikiText, String name) throws LinkTargetException, EngineException { |
79 | 121 | pageTitle = PageTitle.make(config, name); |
80 | 122 | PageId pageId = new PageId(pageTitle, -1); |
81 | 123 | page = engine.postprocess(pageId, wikiText, null); |
| 124 | + texInfoUrl = (new BaseConfig()).getTexvcinfoUrl(); |
82 | 125 | } |
83 | 126 |
|
84 | 127 | public MathConverter(String wikiText) throws LinkTargetException, EngineException { |
85 | 128 | this(wikiText, "noname"); |
86 | 129 | } |
87 | 130 |
|
88 | | - public MathConverter(String wikitext, String title, WikidataLinkMap wl) throws LinkTargetException, EngineException { |
| 131 | + public MathConverter(String wikitext, String title, BaseConfig config) throws LinkTargetException, EngineException { |
89 | 132 | this(wikitext, title); |
90 | | - this.wl = wl; |
| 133 | + if (config.getWikiDataFile() != null) { |
| 134 | + wl = new WikidataLinkMap(config.getWikiDataFile()); |
| 135 | + } else { |
| 136 | + wl = null; |
| 137 | + } |
| 138 | + texInfoUrl = config.getTexvcinfoUrl(); |
91 | 139 | } |
92 | 140 |
|
93 | 141 | public List<MathTag> getMathTags() { |
@@ -174,7 +222,7 @@ public void visit(WtItalics i) { |
174 | 222 | write("\""); |
175 | 223 | } |
176 | 224 |
|
177 | | - public boolean detectHiddenMath(WtContentNodeImpl i) { |
| 225 | + public boolean detectHiddenMath(WtNode i) { |
178 | 226 | if (i.size() == 1 && i.get(0) instanceof WtText) { |
179 | 227 | final String tex = getTex(i, false); |
180 | 228 | if (tex != null) { |
@@ -450,7 +498,9 @@ public String wiki2Tex(String content) { |
450 | 498 | } |
451 | 499 |
|
452 | 500 | public void visit(WtTemplateArgument n) { |
453 | | - iterate(n.getValue()); |
| 501 | + if (!detectHiddenMath(n.getValue())) { |
| 502 | + iterate(n.getValue()); |
| 503 | + } |
454 | 504 | } |
455 | 505 |
|
456 | 506 | public void visit(WtTemplateParameter n) { |
@@ -584,13 +634,23 @@ public List<WikidataLink> getLinks() { |
584 | 634 | return links; |
585 | 635 | } |
586 | 636 |
|
587 | | - private String getTex(WtContentNodeImpl i, boolean force) { |
| 637 | + private String getTex(WtNode i, boolean force) { |
588 | 638 | if (i.get(0) instanceof WtText) { |
589 | | - String content = ((WtText) i.get(0)).getContent(); |
590 | | - //content = TextExtractorMapper.unescape(content); |
| 639 | + String content = ((WtText) i.get(0)).getContent().trim(); |
| 640 | + content = TextExtractorMapper.unescape(content); |
591 | 641 | String tex = wiki2Tex(content); |
592 | | - if (tex.length() > 0 && (content.length() == 1 || |
593 | | - (content.length() < 10 && tex.length() > content.length()))) { |
| 642 | + if (tex.length() > 0 && (content.length() == 1 |
| 643 | + || (content.length() < 100 && !content.equals(tex)))) { |
| 644 | + Multiset<String> idents; |
| 645 | + try { |
| 646 | + idents = TexInfo.getIdentifiers(tex, texInfoUrl); |
| 647 | + } catch (XPathExpressionException | ParserConfigurationException | IOException |
| 648 | + | SAXException | TransformerException ignored) { |
| 649 | + return null; |
| 650 | + } |
| 651 | + if (idents.size() == 0 && !force) { |
| 652 | + return null; |
| 653 | + } |
594 | 654 | if (i instanceof WtBold) { |
595 | 655 | tex = "\\mathbf{" + tex + "}"; |
596 | 656 | } |
|
0 commit comments