55import java .io .IOException ;
66import java .io .InputStreamReader ;
77import java .io .OutputStreamWriter ;
8+ import java .net .HttpURLConnection ;
89import java .net .MalformedURLException ;
910import java .net .URL ;
1011import java .net .URLConnection ;
1415import javax .servlet .http .HttpServletRequest ;
1516import javax .servlet .http .HttpServletResponse ;
1617
18+ import org .apache .log4j .Level ;
19+
1720import com .google .inject .Singleton ;
1821
1922@ Singleton
2023public class ParseWikipediaService extends HttpServlet {
2124
2225 private static final long serialVersionUID = -8524195705285261839L ;
2326 private static final String WIKIPEDIA_PARAM ="URL" ;
27+ private static org .apache .log4j .Logger log = org .apache .log4j .Logger .getLogger ("Map4RDF" );
2428
29+ @ SuppressWarnings ("static-access" )
2530 @ Override
2631 protected void doGet (HttpServletRequest req , HttpServletResponse resp ) throws ServletException , IOException {
32+ log .log (Level .FATAL , "ESCRIBE ALGO!!!!" );
2733 try {
2834 resp .setContentType ("text/html; charset=UTF-8" );
2935 String URL = getWikipediaURL (req );
@@ -35,12 +41,21 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws Se
3541 return ;
3642 }
3743 if (!URL .contains ("http://" )){
38- URL ="http://" +URL ;
44+ URL ="https://" +URL ;
45+ }
46+ if (URL .contains ("http://" )){
47+ URL =URL .replace ("http://" , "https://" );
3948 }
4049 try {
4150 final URL wikipediaURL = new URL (URL );
4251 final String host =wikipediaURL .getHost ();
43- final URLConnection wikipediaCon = wikipediaURL .openConnection ();
52+ final HttpURLConnection wikipediaCon = (HttpURLConnection )wikipediaURL .openConnection ();
53+ wikipediaCon .setFollowRedirects (true );
54+ wikipediaCon .addRequestProperty ("Content-Type" , "text/plain; charset=utf-8" );
55+ wikipediaCon .setRequestProperty ("Content-Type" , "text/plain; charset=utf-8" );
56+ wikipediaCon .setRequestProperty ("content-type" , "text/plain; charset=utf-8" );
57+ log .log (Level .FATAL , "content-type:" +wikipediaCon .getContentType ());
58+ wikipediaCon .connect ();
4459 BufferedReader buffReader = new BufferedReader (
4560 new InputStreamReader (wikipediaCon .getInputStream (),"UTF-8" ));
4661 String toReturn =htmlParseWikipediaInfobox (buffReader ,host );
@@ -112,7 +127,7 @@ private String htmlParseWikipediaFirtsDescription(
112127 inputLine = buffReader .readLine ();
113128 }
114129 if (!foundFirtsP ){
115- return "" ;
130+ return "Not found infobox or description " ;
116131 }
117132 result +="</body></html>" ;
118133 } catch (IOException e ) {
@@ -133,19 +148,29 @@ private String htmlParseWikipediaInfobox(BufferedReader buffReader, String host)
133148 boolean finish =false ;
134149 int countTables =-1 ;
135150 try {
151+ boolean firtsExecution = true ;
136152 String inputLine = buffReader .readLine ();
153+ if (firtsExecution ){
154+ while (!buffReader .ready ()){}
155+ inputLine = buffReader .readLine ();
156+ firtsExecution = false ;
157+ }
158+ String inputLineContains = "" ;
137159 while (inputLine != null && !finish ) {
138- //System.out.println(inputLine);
139- if (inputLine .contains ("</head" )){
160+ inputLineContains = "" ;
161+ if (inputLine !=null ){
162+ inputLineContains = inputLine .toLowerCase ();
163+ }
164+ if (inputLineContains .contains ("</head" )){
140165 finalHead = true ;
141166 result +=inputLine ;
142167 result +="<body>" ;
143168 }
144- if (finalHead && inputLine .contains ("infobox_v2" )){
169+ if (finalHead && ( inputLineContains .contains ("infobox_v2" ) || inputLineContains . contains ( "infobox" ) )){
145170 foundInfobox =true ;
146171 }
147172 if (!finalHead || foundInfobox ){
148- if (inputLine .contains ("infobox_v2" )){
173+ if (( inputLineContains .contains ("infobox_v2" ) || inputLineContains . contains ( "infobox" ) )){
149174 result +="<table style=\" width:15px; text-align:left;\" >" ;
150175 }else {
151176 if (finalHead ){
@@ -161,7 +186,7 @@ private String htmlParseWikipediaInfobox(BufferedReader buffReader, String host)
161186 if (foundInfobox && inputLine .contains ("<table" )){
162187 countTables ++;
163188 }
164- if (inputLine .contains ("</table" )){
189+ if (inputLineContains .contains ("</table" )){
165190 if (countTables ==0 ){
166191 finish =true ;
167192 }else {
0 commit comments