Skip to content

Commit 7c466de

Browse files
committed
Fix Wikipedia parse URL redirects
1 parent 0ce2631 commit 7c466de

File tree

1 file changed

+33
-8
lines changed

1 file changed

+33
-8
lines changed

src/main/java/es/upm/fi/dia/oeg/map4rdf/server/servlet/ParseWikipediaService.java

Lines changed: 33 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import java.io.IOException;
66
import java.io.InputStreamReader;
77
import java.io.OutputStreamWriter;
8+
import java.net.HttpURLConnection;
89
import java.net.MalformedURLException;
910
import java.net.URL;
1011
import java.net.URLConnection;
@@ -14,16 +15,21 @@
1415
import javax.servlet.http.HttpServletRequest;
1516
import javax.servlet.http.HttpServletResponse;
1617

18+
import org.apache.log4j.Level;
19+
1720
import com.google.inject.Singleton;
1821

1922
@Singleton
2023
public class ParseWikipediaService extends HttpServlet{
2124

2225
private static final long serialVersionUID = -8524195705285261839L;
2326
private static final String WIKIPEDIA_PARAM="URL";
27+
private static org.apache.log4j.Logger log = org.apache.log4j.Logger.getLogger("Map4RDF");
2428

29+
@SuppressWarnings("static-access")
2530
@Override
2631
protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
32+
log.log(Level.FATAL, "ESCRIBE ALGO!!!!");
2733
try {
2834
resp.setContentType("text/html; charset=UTF-8");
2935
String URL = getWikipediaURL(req);
@@ -35,12 +41,21 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws Se
3541
return;
3642
}
3743
if(!URL.contains("http://")){
38-
URL="http://"+URL;
44+
URL="https://"+URL;
45+
}
46+
if(URL.contains("http://")){
47+
URL=URL.replace("http://", "https://");
3948
}
4049
try {
4150
final URL wikipediaURL = new URL(URL);
4251
final String host=wikipediaURL.getHost();
43-
final URLConnection wikipediaCon = wikipediaURL.openConnection();
52+
final HttpURLConnection wikipediaCon = (HttpURLConnection)wikipediaURL.openConnection();
53+
wikipediaCon.setFollowRedirects(true);
54+
wikipediaCon.addRequestProperty("Content-Type", "text/plain; charset=utf-8");
55+
wikipediaCon.setRequestProperty("Content-Type", "text/plain; charset=utf-8");
56+
wikipediaCon.setRequestProperty("content-type", "text/plain; charset=utf-8");
57+
log.log(Level.FATAL, "content-type:"+wikipediaCon.getContentType());
58+
wikipediaCon.connect();
4459
BufferedReader buffReader = new BufferedReader(
4560
new InputStreamReader(wikipediaCon.getInputStream(),"UTF-8"));
4661
String toReturn=htmlParseWikipediaInfobox(buffReader,host);
@@ -112,7 +127,7 @@ private String htmlParseWikipediaFirtsDescription(
112127
inputLine = buffReader.readLine();
113128
}
114129
if(!foundFirtsP){
115-
return "";
130+
return "Not found infobox or description";
116131
}
117132
result+="</body></html>";
118133
} catch (IOException e) {
@@ -133,19 +148,29 @@ private String htmlParseWikipediaInfobox(BufferedReader buffReader, String host)
133148
boolean finish=false;
134149
int countTables=-1;
135150
try {
151+
boolean firtsExecution = true;
136152
String inputLine = buffReader.readLine();
153+
if(firtsExecution){
154+
while(!buffReader.ready()){}
155+
inputLine = buffReader.readLine();
156+
firtsExecution = false;
157+
}
158+
String inputLineContains = "";
137159
while (inputLine != null && !finish) {
138-
//System.out.println(inputLine);
139-
if(inputLine.contains("</head")){
160+
inputLineContains = "";
161+
if(inputLine !=null ){
162+
inputLineContains = inputLine.toLowerCase();
163+
}
164+
if(inputLineContains.contains("</head")){
140165
finalHead = true;
141166
result+=inputLine;
142167
result+="<body>";
143168
}
144-
if(finalHead && inputLine.contains("infobox_v2")){
169+
if(finalHead && (inputLineContains.contains("infobox_v2") || inputLineContains.contains("infobox"))){
145170
foundInfobox=true;
146171
}
147172
if(!finalHead || foundInfobox){
148-
if(inputLine.contains("infobox_v2")){
173+
if((inputLineContains.contains("infobox_v2") || inputLineContains.contains("infobox"))){
149174
result+="<table style=\"width:15px; text-align:left;\">";
150175
}else{
151176
if(finalHead){
@@ -161,7 +186,7 @@ private String htmlParseWikipediaInfobox(BufferedReader buffReader, String host)
161186
if(foundInfobox && inputLine.contains("<table")){
162187
countTables++;
163188
}
164-
if(inputLine.contains("</table")){
189+
if(inputLineContains.contains("</table")){
165190
if(countTables==0){
166191
finish=true;
167192
}else{

0 commit comments

Comments
 (0)