Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@
<version>5.6.0</version>
<scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.2</version>
</dependency>

</dependencies>

<build>
Expand Down
76 changes: 71 additions & 5 deletions src/main/java/com/github/crawler/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,85 @@
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * A minimal single-threaded crawler that starts at the sina.cn index page,
 * follows the links it considers interesting, and prints the title of every
 * {@code <article>} element it finds on the fetched pages.
 */
public class Main {
    /**
     * Runs the crawl loop until the pool of pending links is exhausted.
     *
     * @param args unused
     * @throws IOException if fetching any page fails
     */
    public static void main(String[] args) throws IOException {
        // Links that are still waiting to be processed.
        List<String> linkPool = new ArrayList<>();
        // Links that have already been fetched and handled.
        Set<String> processedLinks = new HashSet<>();
        linkPool.add("https://sina.cn");

        while (!linkPool.isEmpty()) {
            // Removing from the tail of an ArrayList avoids shifting the remaining elements.
            String link = linkPool.remove(linkPool.size() - 1);
            if (processedLinks.contains(link)) {
                continue;
            }
            // Decide based on the link that was just taken from the pool,
            // not on a constant; otherwise nothing is ever crawled.
            if (!isInterestingLink(link)) {
                continue;
            }

            Document doc = httpGetAndParseHtml(link);
            // Queue every href found on the page; filtering happens when it is dequeued.
            for (Element aTag : doc.select("a")) {
                linkPool.add(aTag.attr("href"));
            }
            // If this is a news detail page, handle its articles; otherwise this is a no-op.
            storeIntoDatabaseIfItIsNewsPages(doc);
            processedLinks.add(link);
        }
    }

    /**
     * Handles every {@code <article>} element of the document. At the moment
     * this only prints the text of each article's first child element.
     *
     * @param doc the parsed page
     */
    private static void storeIntoDatabaseIfItIsNewsPages(Document doc) {
        for (Element articleTag : doc.select("article")) {
            // child(0) throws when the element has no child elements, so skip such articles.
            if (articleTag.children().isEmpty()) {
                continue;
            }
            // Read the title from the article being iterated, not always from the first one.
            String title = articleTag.child(0).text();
            System.out.println(title);
        }
    }

    /**
     * Fetches the given link with an HTTP GET and parses the response body as HTML.
     *
     * @param link the link to fetch; backslashes are stripped and a
     *             protocol-relative link ({@code //host/path}) is given the
     *             {@code https:} scheme before the request is made
     * @return the parsed document
     * @throws IOException if the request fails or the body cannot be read
     */
    private static Document httpGetAndParseHtml(String link) throws IOException {
        // Strip backslashes first so an escaped link such as "\/\/host\/path" is
        // also recognised as protocol-relative below.
        if (link.contains("\\")) {
            link = link.replace("\\", "");
        }
        if (link.startsWith("//")) {
            link = "https:" + link;
        }

        HttpGet httpGet = new HttpGet(link);
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");

        // Both the client and the response hold connections; close them even on failure.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpGet)) {
            System.out.println(response.getStatusLine());
            HttpEntity entity = response.getEntity();
            // The entity content is a stream and must be read exactly once.
            String html = EntityUtils.toString(entity);
            return Jsoup.parse(html);
        }
    }

    /**
     * Tells whether a link should be crawled: it must be a news page or the
     * index page, and it must not be a login page.
     *
     * @param link the link to test
     * @return {@code true} if the link should be fetched
     */
    public static boolean isInterestingLink(String link) {
        // Explicit parentheses: && binds tighter than ||, so without them the
        // login-page exclusion would not apply to news links.
        return (isNewsPage(link) || isIndexPage(link)) && isNotLoginPage(link);
    }

    /**
     * @param link the link to test
     * @return {@code true} if the link is exactly the sina.cn index URL
     */
    public static boolean isIndexPage(String link) {
        return "https://sina.cn".equals(link);
    }

    /**
     * @param link the link to test
     * @return {@code true} if the link contains {@code news.sina.cn}
     */
    public static boolean isNewsPage(String link) {
        return link.contains("news.sina.cn");
    }

    /**
     * @param link the link to test
     * @return {@code true} if the link does not contain {@code passport.sina.cn}
     */
    public static boolean isNotLoginPage(String link) {
        return !link.contains("passport.sina.cn");
    }
}