diff --git a/pom.xml b/pom.xml
index 3fd97d1..a80ae8d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -38,3 +38,10 @@
             <version>5.6.0</version>
             <scope>test</scope>
         </dependency>
+
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.12.2</version>
+        </dependency>
+
diff --git a/src/main/java/com/github/crawler/Main.java b/src/main/java/com/github/crawler/Main.java
--- a/src/main/java/com/github/crawler/Main.java
+++ b/src/main/java/com/github/crawler/Main.java
@@ -6,19 +6,116 @@
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
 import org.apache.http.util.EntityUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
 
+/**
+ * Single-threaded crawler that walks sina.cn pages and prints news article titles.
+ */
 public class Main {
+    /**
+     * Crawls outward from the sina.cn index page until no unvisited links remain.
+     *
+     * @param args ignored
+     * @throws IOException if fetching a page fails
+     */
     public static void main(String[] args) throws IOException {
+        // Links waiting to be crawled.
+        List<String> linkPool = new ArrayList<>();
+        // Links already taken from the pool, so each one is handled at most once.
+        Set<String> processedLinks = new HashSet<>();
+        linkPool.add("https://sina.cn");
+
+        while (!linkPool.isEmpty()) {
+            // Removing from the tail of an ArrayList avoids shifting elements.
+            String link = linkPool.remove(linkPool.size() - 1);
+            // Set.add returns false when the link was seen before.
+            if (!processedLinks.add(link) || !isInterestingLink(link)) {
+                continue;
+            }
+            Document doc = httpGetAndParseHtml(link);
+            doc.select("a").stream().map(aTag -> aTag.attr("href")).forEach(linkPool::add);
+            // News detail pages get stored; any other page only contributes links.
+            storeIntoDatabaseIfItIsNewsPages(doc);
+        }
+    }
+
+    /**
+     * Prints the text of the first child of every {@code <article>} element in the page.
+     * Pages without such elements are skipped. No database write happens here yet.
+     *
+     * @param doc parsed page to inspect
+     */
+    private static void storeIntoDatabaseIfItIsNewsPages(Document doc) {
+        for (Element articleTag : doc.select("article")) {
+            // child(0) throws on an element without children, so skip empty articles.
+            if (articleTag.children().isEmpty()) {
+                continue;
+            }
+            String title = articleTag.child(0).text();
+            System.out.println(title);
+        }
+    }
+
+    /**
+     * Downloads a page with a browser-like User-Agent and parses it with jsoup.
+     *
+     * @param link page address; backslashes are removed and protocol-relative links get https
+     * @return the parsed document
+     * @throws IOException if the request fails
+     */
+    private static Document httpGetAndParseHtml(String link) throws IOException {
+        // Presumably hrefs can carry escaped slashes ("https:\/\/..."); drop the backslashes.
+        if (link.contains("\\")) {
+            link = link.replace("\\", "");
+        }
+        // Protocol-relative links ("//host/path") need an explicit scheme.
+        if (link.startsWith("//")) {
+            link = "https:" + link;
+        }
+        HttpGet httpGet = new HttpGet(link);
+        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");
-        CloseableHttpClient httpclient = HttpClients.createDefault();
-        HttpGet httpGet = new HttpGet("https://sina.cn/");
-        try (CloseableHttpResponse response1 = httpclient.execute(httpGet)) {
-            System.out.println(response1.getStatusLine());
+        // Both the client and the response are closed when the block exits.
+        try (CloseableHttpClient httpclient = HttpClients.createDefault();
+             CloseableHttpResponse response1 = httpclient.execute(httpGet)) {
             HttpEntity entity1 = response1.getEntity();
-            // do something useful with the response body
-            // and ensure it is fully consumed
-            System.out.println(EntityUtils.toString(entity1));
+            // UTF-8 is only the fallback for responses that declare no charset.
+            String html = EntityUtils.toString(entity1, StandardCharsets.UTF_8);
+            return Jsoup.parse(html);
         }
     }
+
+    /**
+     * Decides whether a link is crawled: a news or index page that is not a login page.
+     *
+     * @param link raw href value
+     * @return {@code true} if the link should be fetched
+     */
+    public static boolean isInterestingLink(String link) {
+        // && binds tighter than ||, so the parentheses make the login check apply to both.
+        return (isNewsPage(link) || isIndexPage(link)) && isNotLoginPage(link);
+    }
+
+    /** Returns {@code true} only for the exact index address {@code https://sina.cn}. */
+    public static boolean isIndexPage(String link) {
+        return "https://sina.cn".equals(link);
+    }
+
+    /** Returns {@code true} if the link contains {@code news.sina.cn}. */
+    public static boolean isNewsPage(String link) {
+        return link.contains("news.sina.cn");
+    }
+
+    /** Returns {@code true} unless the link contains {@code passport.sina.cn}. */
+    public static boolean isNotLoginPage(String link) {
+        return !link.contains("passport.sina.cn");
+    }
 }