1414import java .net .HttpURLConnection ;
1515import java .net .MalformedURLException ;
1616import java .net .URL ;
17+ import java .nio .charset .StandardCharsets ;
18+ import java .util .zip .GZIPInputStream ;
1719
1820/*
1921 * @author indianBond
2022 */
2123public class WebPageExtractorUtil {
2224
2325 private static final Logger LOGGER = LoggerFactory .getLogger (WebPageExtractorUtil .class );
24- private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36" ;
26+ private static final String USER_AGENT = "DuckDuckBot/1.1; (+http://duckduckgo.com/duckduckbot.html)" ;
27+ private static final String COMMON_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36" ;
2528
2629 public static WebPageData getPageData (String userId , String orgId , String url ) {
2730 try {
@@ -50,10 +53,14 @@ public static WebPageData getPageData(String userId, String orgId, String url) {
5053 private static WebPageData extractWebPageInfo (String url ) throws IOException {
5154 Document doc = null ;
5255 try {
53- doc = Jsoup .connect (url ).userAgent (USER_AGENT ).get ();
56+ doc = Jsoup .connect (url ).userAgent (COMMON_USER_AGENT ).get ();
5457 } catch (Exception e ) {
5558 LOGGER .error ("Error extracting web page info " + url , e );
56- doc = Jsoup .parse (getHtmlContentManually (url ), url );
59+ String htmlContentManually = getHtmlContentManually (url );
60+ if (htmlContentManually == null ) {
61+ return null ;
62+ }
63+ doc = Jsoup .parse (htmlContentManually , url );
5764 }
5865
5966 if (doc == null ) {
@@ -201,20 +208,38 @@ private static String getHtmlContentManually(String url) throws IOException {
201208 HttpURLConnection con = (HttpURLConnection ) obj .openConnection ();
202209 con .setRequestMethod ("GET" );
203210 con .setRequestProperty ("User-Agent" , USER_AGENT );
211+ con .setRequestProperty ("Accept" , "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" );
212+ con .setRequestProperty ("Accept-Language" , "en-SG,en-GB;q=0.9,en-US;q=0.8,en;q=0.7" );
213+ con .setRequestProperty ("Upgrade-Insecure-Requests" , "1" );
214+ con .setRequestProperty ("dnt" , "1" );
215+ con .setRequestProperty ("Cache-Control" , "no-cache" );
216+
217+
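218+ // Request a compressed response. NOTE(review): only gzip is decoded below; a deflate-encoded body would be read undecoded.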
219+ con .setRequestProperty ("Accept-Encoding" , "gzip, deflate" );
204220
205221 int responseCode = con .getResponseCode ();
206222 if (responseCode == HttpURLConnection .HTTP_OK ) {
207- BufferedReader in = new BufferedReader (new InputStreamReader (con .getInputStream ()));
208- String inputLine ;
223+ String encoding = con .getContentEncoding ();
224+ InputStreamReader reader ;
225+
226+ if ("gzip" .equalsIgnoreCase (encoding )) {
227+ reader = new InputStreamReader (new GZIPInputStream (con .getInputStream ()), StandardCharsets .UTF_8 );
228+ } else {
229+ reader = new InputStreamReader (con .getInputStream (), StandardCharsets .UTF_8 );
230+ }
231+
232+ BufferedReader in = new BufferedReader (reader );
209233 StringBuilder response = new StringBuilder ();
234+ String inputLine ;
210235 while ((inputLine = in .readLine ()) != null ) {
211236 response .append (inputLine );
212237 }
213238 in .close ();
214239 return response .toString ();
215240 } else {
216- LOGGER .error ("Manual HTTP GET request failed." + url + " Response Code: " + responseCode );
241+ LOGGER .error ("Manual HTTP GET request failed. " + url + " Response Code: " + responseCode );
242+ return null ;
217243 }
218- return null ;
219244 }
220245}
0 commit comments