diff --git a/pom.xml b/pom.xml
index 3fd97d1..a80ae8d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -38,6 +38,13 @@
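<version>5.6.0</version>
<scope>test</scope>
+
+ <dependency>
+ <groupId>org.jsoup</groupId>
+ <artifactId>jsoup</artifactId>
+ <version>1.12.2</version>
+ </dependency>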
+
diff --git a/src/main/java/com/github/crawler/Main.java b/src/main/java/com/github/crawler/Main.java
index 494f859..7227835 100644
--- a/src/main/java/com/github/crawler/Main.java
+++ b/src/main/java/com/github/crawler/Main.java
@@ -6,19 +6,85 @@
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+/**
+ * Single-threaded crawler that starts at the sina.cn index page, follows the links it
+ * considers interesting, and prints the text of the first child element of every
+ * {@code <article>} element it finds.
+ */
public class Main {
+    /**
+     * Crawls from {@code https://sina.cn} until the pool of pending links is empty.
+     *
+     * @param args not used
+     * @throws IOException if fetching a page fails
+     */
     public static void main(String[] args) throws IOException {
+        // Links waiting to be processed.
+        List<String> linkPool = new ArrayList<>();
+        // Links that have already been processed.
+        Set<String> processedLinks = new HashSet<>();
+        linkPool.add("https://sina.cn");
+
+        while (!linkPool.isEmpty()) {
+            // Removing from the tail of an ArrayList is the cheapest removal.
+            String link = linkPool.remove(linkPool.size() - 1);
+            // Skip links already crawled and links outside the crawl scope. The scope check
+            // must look at the link just taken from the pool, not at a fixed string.
+            if (processedLinks.contains(link) || !isInterestingLink(link)) {
+                continue;
+            }
+            Document doc = httpGetAndParseHtml(link);
+            doc.select("a").stream().map(aTag -> aTag.attr("href")).forEach(linkPool::add);
+            // If this is a news detail page, store it; otherwise do nothing.
+            storeIntoDatabaseIfItIsNewsPages(doc);
+            processedLinks.add(link);
+        }
+    }
+
+    /**
+     * Prints the text of the first child element of every {@code <article>} element in the
+     * page. Pages without {@code <article>} elements produce no output. Despite the method
+     * name, nothing is written to a database here.
+     *
+     * @param doc parsed page to inspect
+     */
+    private static void storeIntoDatabaseIfItIsNewsPages(Document doc) {
+        for (Element articleTag : doc.select("article")) {
+            // An <article> without child elements has nothing to read; child(0) would throw.
+            if (articleTag.children().isEmpty()) {
+                continue;
+            }
+            // Read from the article being visited, not always from the first one.
+            String title = articleTag.child(0).text();
+            System.out.println(title);
+        }
+    }
+
+    /**
+     * Fetches a page over HTTP and parses the response body as HTML.
+     *
+     * <p>Protocol-relative links (starting with {@code //}) are given the {@code https:}
+     * scheme, and backslashes are removed from the link before the request is made.
+     *
+     * @param link address of the page to fetch
+     * @return the parsed document
+     * @throws IOException if the request fails or the body cannot be read
+     */
+    private static Document httpGetAndParseHtml(String link) throws IOException {
-        CloseableHttpClient httpclient = HttpClients.createDefault();
-        HttpGet httpGet = new HttpGet("https://sina.cn/");
+        // Protocol-relative links ("//host/path") need only the scheme prepended.
+        if (link.startsWith("//")) {
+            link = "https:" + link;
+        }
+        // Strip backslashes from escaped links; this applies whether or not the scheme was
+        // just added, so it is no longer an else-branch.
+        link = link.replace("\\", "");
+        HttpGet httpGet = new HttpGet(link);
+        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");
-        try (CloseableHttpResponse response1 = httpclient.execute(httpGet)) {
-            System.out.println(response1.getStatusLine());
+        // Close the client together with the response so its connections are not leaked.
+        try (CloseableHttpClient httpclient = HttpClients.createDefault();
+             CloseableHttpResponse response1 = httpclient.execute(httpGet)) {
             HttpEntity entity1 = response1.getEntity();
-            // do something useful with the response body
-            // and ensure it is fully consumed
-            System.out.println(EntityUtils.toString(entity1));
+            // Fall back to UTF-8 when the response does not declare a charset.
+            String html = EntityUtils.toString(entity1, "UTF-8");
+            return Jsoup.parse(html);
         }
     }
+
+    /**
+     * Decides whether a link should be crawled: it must be a news page or the index page,
+     * and it must not be a login page.
+     *
+     * @param link link to classify
+     * @return {@code true} if the link is in scope for crawling
+     */
+    public static boolean isInterestingLink(String link) {
+        // Grouped explicitly: && binds tighter than ||, so without the parentheses the
+        // login-page exclusion would apply to the index page only.
+        return (isNewsPage(link) || isIndexPage(link)) && isNotLoginPage(link);
+    }
+
+    /**
+     * @param link link to classify
+     * @return {@code true} if the link is exactly {@code https://sina.cn}
+     */
+    public static boolean isIndexPage(String link) {
+        return "https://sina.cn".equals(link);
+    }
+
+    /**
+     * @param link link to classify
+     * @return {@code true} if the link contains {@code news.sina.cn}
+     */
+    public static boolean isNewsPage(String link) {
+        return link.contains("news.sina.cn");
+    }
+
+    /**
+     * @param link link to classify
+     * @return {@code true} if the link does not contain {@code passport.sina.cn}
+     */
+    public static boolean isNotLoginPage(String link) {
+        return !link.contains("passport.sina.cn");
+    }
}