Skip to content

Commit 4daeb97

Browse files
authored
Merge pull request #32 from natbaca-wmf/master
Fix Wikipedia Text Loading
2 parents 684ddde + 0a853b7 commit 4daeb97

File tree

1 file changed

+30
-76
lines changed

1 file changed

+30
-76
lines changed

wikipedia.js

Lines changed: 30 additions & 76 deletions
Original file line number | Diff line number | Diff line change
@@ -58,12 +58,11 @@ async function getWikipediaData(language, topic) {
5858
};
5959

6060
const wikipediaHTMLPromise = function() {
61-
6261
const requestConfig = {
63-
baseURL: "https://" + language + ".wikipedia.org/api/rest_v1/",
64-
url: "/page/mobile-sections/" + encodedTopic,
62+
baseURL: "https://" + language + ".wikipedia.org/w/rest.php/v1/page/",
63+
url: encodedTopic + "/html",
6564
method: "get",
66-
responseType: "json",
65+
responseType: "text",
6766
headers: {
6867
"Api-User-Agent": process.env.WIKIDOCUMENTARIES_API_USER_AGENT
6968
},
@@ -72,65 +71,38 @@ async function getWikipediaData(language, topic) {
7271
else return axios.request(requestConfig);
7372
};
7473

75-
const [wikipediaSummaryResponse, wikipediaHTMLResponse]
76-
= await axios.all([wikipediaSummaryPromise(), wikipediaHTMLPromise()]);
74+
const [summaryRes, htmlRes] = await Promise.allSettled([
75+
wikipediaSummaryPromise(),
76+
wikipediaHTMLPromise()
77+
]);
7778

78-
if (wikipediaHTMLResponse.data == undefined ) {
79-
// No wikipedia article
80-
excerptHTML="";
81-
remainingHTML=null;
82-
}
83-
else {
84-
var origHTML = wikipediaHTMLResponse.data.lead.sections[0].text;
85-
var remainingHTML = null;
86-
87-
if (wikipediaHTMLResponse.data.lead.disambiguation != undefined && wikipediaHTMLResponse.data.lead.disambiguation == true) {
88-
wikipediaHTMLResponse.data.remaining.sections.forEach(section => {
89-
origHTML += section.text;
90-
});
79+
const wikipediaSummaryResponse = summaryRes.status === "fulfilled" ? summaryRes.value : null;
80+
const wikipediaHTMLResponse = htmlRes.status === "fulfilled" ? htmlRes.value : null;
81+
82+
let excerptHTML = "";
83+
let remainingHTML = null;
84+
85+
if (wikipediaHTMLResponse && wikipediaHTMLResponse.data != null && typeof wikipediaHTMLResponse.data === 'string') {
86+
let rawHTML = wikipediaHTMLResponse.data;
87+
88+
const bodyMatch = rawHTML.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
89+
if (bodyMatch) {
90+
rawHTML = bodyMatch[1];
9191
}
92-
else {
93-
var remainingOrigHTML = "";
94-
95-
wikipediaHTMLResponse.data.remaining.sections.forEach(section => {
96-
if (section.isReferenceSection == undefined) {
97-
var sectionHeaderStartTag = "";
98-
var sectionHeaderEndTag = "";
99-
switch(section.toclevel) {
100-
case 1:
101-
sectionHeaderStartTag = "<h2 class='h2'>";
102-
sectionHeaderEndTag = "</h2>";
103-
break;
104-
case 2:
105-
sectionHeaderStartTag = "<h3 class='h3'>";
106-
sectionHeaderEndTag = "</h3>";
107-
break;
108-
case 3:
109-
sectionHeaderStartTag = "<h4 class='h4'>";
110-
sectionHeaderEndTag = "</h4>";
111-
break;
112-
case 4:
113-
sectionHeaderStartTag = "<h5 class='h5'>";
114-
sectionHeaderEndTag = "</h5>";
115-
break;
116-
}
117-
remainingOrigHTML += sectionHeaderStartTag + section.line + sectionHeaderEndTag;
118-
remainingOrigHTML += section.text;
119-
}
120-
});
121-
122-
/* if (remainingOrigHTML.length > 3000) { */ // Small count of HTML should be with the leading section
92+
93+
const splitIndex = rawHTML.search(/<h2[\s>]/i);
94+
const origHTML = splitIndex > -1 ? rawHTML.substring(0, splitIndex) : rawHTML;
95+
96+
if (splitIndex > -1) {
97+
const remainingOrigHTML = rawHTML.substring(splitIndex);
12398
remainingHTML = convertToWikidocumentariesHTML(remainingOrigHTML, topic, language);
124-
/* }
125-
else {
126-
origHTML += remainingOrigHTML;
127-
} */
12899
}
129-
var excerptHTML = convertToWikidocumentariesHTML(origHTML, topic, language);
100+
101+
excerptHTML = convertToWikidocumentariesHTML(origHTML, topic, language);
130102
}
131103

132104
return {
133-
wikipedia: wikipediaSummaryResponse.data,
105+
wikipedia: wikipediaSummaryResponse ? wikipediaSummaryResponse.data : null,
134106
excerptHTML,
135107
remainingHTML,
136108
};
@@ -172,25 +144,7 @@ const convertToWikidocumentariesHTML = function(origHTML, topic, language) {
172144
//$(this).replaceWith($(this).html());
173145
}
174146
});
175-
/* $("table").each(function(index) {
176-
$(this).remove();
177-
});
178-
$("figure").each(function(index) {
179-
$(this).remove();
180-
});
181-
$("figure-inline").each(function(index) {
182-
$(this).remove();
183-
});
184-
$("sup").each(function(index) {
185-
$(this).remove();
186-
});
187-
188-
$("div").each(function(index) {
189-
var div_class = $(this).attr('class');
190-
if (div_class == undefined || div_class != 'noprint') {
191-
$(this).remove();
192-
}
193-
}); */
147+
194148
$("table").each(function(index) { //Remove English Wikipedia infobox
195149
var div_class = $(this).attr('class');
196150
if (div_class != undefined && div_class.indexOf('infobox') != -1) {
@@ -217,4 +171,4 @@ const convertToWikidocumentariesHTML = function(origHTML, topic, language) {
217171
});
218172

219173
return $.html();
220-
}
174+
};

0 commit comments

Comments
 (0)