88import org .jsoup .nodes .Element ;
99import org .jsoup .select .Elements ;
1010
11+ import java .io .BufferedReader ;
1112import java .io .IOException ;
13+ import java .io .InputStreamReader ;
14+ import java .net .HttpURLConnection ;
1215import java .net .MalformedURLException ;
1316import java .net .URL ;
1417
@@ -50,6 +53,7 @@ private static WebPageData extractWebPageInfo(String url) throws IOException {
5053 doc = Jsoup .connect (url ).userAgent (USER_AGENT ).get ();
5154 } catch (Exception e ) {
5255 LOGGER .error ("Error extracting web page info " + url , e );
56+ doc = Jsoup .parse (getHtmlContentManually (url ), url );
5357 }
5458
5559 if (doc == null ) {
@@ -131,10 +135,31 @@ private static String extractLogo(Document doc, String url) {
131135 }
132136
133137 // If not found, try to find a prominent image
134- Elements images = doc .select ("img[src~=(?i)\\ .(png|jpe?g)]" );
138+ // Find images with src containing svg, png, jpg, jpeg
139+ // or <object type="image/svg+xml" data="https://fdn.gsmarena.com/vv/assets12/i/logo.svg"><img src="https://fdn.gsmarena.com/vv/assets12/i/logo-fallback.gif" alt="GSMArena.com"></object>
140+
141+ try {
142+ for (Element element : doc .select ("object[type=image/svg+xml]" )) {
143+ if (element .hasAttr ("data" )) {
144+ String data = element .absUrl ("data" );
145+ if (data .contains ("logo" ) && data .startsWith ("http" )) {
146+ return data ;
147+ }
148+ }
149+ }
150+ } catch (Throwable ignore ) {
151+ }
152+
153+ Elements images = doc .select ("img[src~=(?i)\\ .(svg|png|jpe?g)]" );
135154 for (Element image : images ) {
136155 if (image .hasAttr ("alt" ) && image .attr ("alt" ).toLowerCase ().contains ("logo" )) {
137156 return image .absUrl ("src" );
157+ } else if (image .hasAttr ("title" ) && image .attr ("title" ).toLowerCase ().contains ("logo" )) {
158+ return image .absUrl ("src" );
159+ } else if (image .hasAttr ("class" ) && image .attr ("class" ).toLowerCase ().contains ("logo" )) {
160+ return image .absUrl ("src" );
161+ } else if (image .absUrl ("src" ).contains ("logo" )) {
162+ return image .absUrl ("src" );
138163 }
139164 }
140165
@@ -170,4 +195,26 @@ public String getLogo() {
170195 return logo ;
171196 }
172197 }
198+
199+ private static String getHtmlContentManually (String url ) throws IOException {
200+ URL obj = new URL (url );
201+ HttpURLConnection con = (HttpURLConnection ) obj .openConnection ();
202+ con .setRequestMethod ("GET" );
203+ con .setRequestProperty ("User-Agent" , USER_AGENT );
204+
205+ int responseCode = con .getResponseCode ();
206+ if (responseCode == HttpURLConnection .HTTP_OK ) {
207+ BufferedReader in = new BufferedReader (new InputStreamReader (con .getInputStream ()));
208+ String inputLine ;
209+ StringBuilder response = new StringBuilder ();
210+ while ((inputLine = in .readLine ()) != null ) {
211+ response .append (inputLine );
212+ }
213+ in .close ();
214+ return response .toString ();
215+ } else {
216+ LOGGER .error ("Manual HTTP GET request failed." + url + " Response Code: " + responseCode );
217+ }
218+ return null ;
219+ }
173220}
0 commit comments