silvia-YQY · silvia-YQY · Apr 5, 2020
diff --git a/pom.xml b/pom.xml
@@ -38,6 +38,13 @@
             <version>5.6.0</version>
             <scope>test</scope>
         </dependency>
+        <!-- jsoup HTML parser, used by Main to parse fetched pages: https://mvnrepository.com/artifact/org.jsoup/jsoup -->
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.12.2</version>
+        </dependency>
+
     </dependencies>
 
     <build>

diff --git a/src/main/java/com/github/crawler/Main.java b/src/main/java/com/github/crawler/Main.java
@@ -6,19 +6,118 @@
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
 import org.apache.http.util.EntityUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
 
+/**
+ * Single-threaded crawler that starts at the sina.cn index page, follows the links it
+ * accepts as interesting and prints the text of the first child of each article element.
+ */
 public class Main {
+    /**
+     * Crawls from {@code https://sina.cn} until the pool of pending links is empty.
+     *
+     * @param args command-line arguments (unused)
+     * @throws IOException if fetching a page fails; the crawl stops at the first failure
+     */
     public static void main(String[] args) throws IOException {
+        // Links waiting to be processed.
+        List<String> linkPool = new ArrayList<>();
+        // Links that have already been processed.
+        Set<String> processedLinks = new HashSet<>();
+        linkPool.add("https://sina.cn");
+
+        while (!linkPool.isEmpty()) {
+            // Removing from the tail of an ArrayList is more efficient.
+            String link = linkPool.remove(linkPool.size() - 1);
+            // Test the link just taken from the pool; testing a fixed literal would
+            // reject every link and nothing would ever be fetched.
+            if (processedLinks.contains(link) || !isInterestingLink(link)) {
+                continue;
+            }
+            Document doc = httpGetAndParseHtml(link);
+            doc.select("a").stream().map(aTag -> aTag.attr("href")).forEach(linkPool::add);
+            // If this is a news detail page, store it in the database; otherwise do nothing.
+            storeIntoDatabaseIfItIsNewsPages(doc);
+            processedLinks.add(link);
+        }
+    }
+
+    /**
+     * Prints the text of the first child of every {@code article} element in the page.
+     * Despite the name, nothing is written to a database here.
+     *
+     * @param doc parsed page to inspect
+     */
+    private static void storeIntoDatabaseIfItIsNewsPages(Document doc) {
+        for (Element articleTag : doc.select("article")) {
+            // child(0) throws on an element without child elements, so skip those.
+            if (articleTag.children().isEmpty()) {
+                continue;
+            }
+            String title = articleTag.child(0).text();
+            System.out.println(title);
+        }
+    }
+
+    /**
+     * Downloads a page and parses it with jsoup.
+     *
+     * @param link URL to fetch; a protocol-relative link ({@code //host/path}) gets the
+     *             {@code https:} scheme, otherwise any backslashes are removed
+     * @return the parsed document
+     * @throws IOException if the request fails or the body cannot be read
+     */
+    private static Document httpGetAndParseHtml(String link) throws IOException {
-        CloseableHttpClient httpclient = HttpClients.createDefault();
-        HttpGet httpGet = new HttpGet("https://sina.cn/");
+        if (link.startsWith("//")) {
+            link = "https:" + link;
+        } else if (link.contains("\\")) {
+            link = link.replace("\\", "");
+        }
+        HttpGet httpGet = new HttpGet(link);
+        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");
-        try (CloseableHttpResponse response1 = httpclient.execute(httpGet)) {
-            System.out.println(response1.getStatusLine());
+        // Both the client and the response are closed when the try block exits.
+        try (CloseableHttpClient httpclient = HttpClients.createDefault();
+             CloseableHttpResponse response1 = httpclient.execute(httpGet)) {
             HttpEntity entity1 = response1.getEntity();
-            // do something useful with the response body
-            // and ensure it is fully consumed
-            System.out.println(EntityUtils.toString(entity1));
+            // UTF-8 is only the fallback for responses that declare no charset.
+            String html = EntityUtils.toString(entity1, "UTF-8");
+            return Jsoup.parse(html);
         }
     }
+
+    /**
+     * Decides whether a link should be crawled: it must be a news page or the index page,
+     * and it must not be a login page.
+     *
+     * @param link link to test
+     * @return {@code true} if the link should be fetched
+     */
+    public static boolean isInterestingLink(String link) {
+        // Parenthesised on purpose: && binds tighter than ||, and the login check
+        // must apply to news links too.
+        return (isNewsPage(link) || isIndexPage(link)) && isNotLoginPage(link);
+    }
+
+    /** Returns whether {@code link} is exactly {@code https://sina.cn}. */
+    public static boolean isIndexPage(String link) {
+        return "https://sina.cn".equals(link);
+    }
+
+    /** Returns whether {@code link} contains {@code news.sina.cn}. */
+    public static boolean isNewsPage(String link) {
+        return link.contains("news.sina.cn");
+    }
+
+    /** Returns whether {@code link} does not contain {@code passport.sina.cn}. */
+    public static boolean isNotLoginPage(String link) {
+        return !link.contains("passport.sina.cn");
+    }
 }