Skip to content

Commit 3944cb9

Browse files
rajat-garg and rajatgarg authored
[BAEL-9328] Add code for Web Crawler using WebMagic Library (#18712)
* [BAEL-9328] Add code for Web Crawler using WebMagic Library
* [BAEL-9328] Add Live Tests for Web Crawler using WebMagic Library

---------

Co-authored-by: rajatgarg <[email protected]>
1 parent cd29ace commit 3944cb9

File tree

4 files changed

+175
-0
lines changed

4 files changed

+175
-0
lines changed

libraries-4/pom.xml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,17 @@
9999
<artifactId>github-api</artifactId>
100100
<version>${github-api.version}</version>
101101
</dependency>
102+
103+
<dependency>
104+
<groupId>us.codecraft</groupId>
105+
<artifactId>webmagic-core</artifactId>
106+
<version>1.0.3</version>
107+
</dependency>
108+
<dependency>
109+
<groupId>us.codecraft</groupId>
110+
<artifactId>webmagic-extension</artifactId>
111+
<version>1.0.3</version>
112+
</dependency>
102113
</dependencies>
103114

104115
<properties>
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package com.baeldung.webmagic;
2+
3+
public class Book {
4+
private final String title;
5+
private final String price;
6+
7+
public Book(String title, String price) {
8+
this.title = title;
9+
this.price = price;
10+
}
11+
12+
public String getTitle() {
13+
return title;
14+
}
15+
16+
public String getPrice() {
17+
return price;
18+
}
19+
}
20+
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package com.baeldung.webmagic;
2+
3+
import us.codecraft.webmagic.Page;
4+
import us.codecraft.webmagic.Site;
5+
import us.codecraft.webmagic.Spider;
6+
import us.codecraft.webmagic.processor.PageProcessor;
7+
8+
import java.util.ArrayList;
9+
import java.util.Collections;
10+
import java.util.List;
11+
12+
public class BookScraper implements PageProcessor {
13+
14+
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
15+
private final List<Book> books = new ArrayList<>();
16+
17+
@Override
18+
public void process(Page page) {
19+
var bookNodes = page.getHtml().css("article.product_pod");
20+
21+
for (int i = 0; i < Math.min(10, bookNodes.nodes().size()); i++) {
22+
var book = bookNodes.nodes().get(i);
23+
24+
String title = book.css("h3 a", "title").get();
25+
String price = book.css(".price_color", "text").get();
26+
27+
books.add(new Book(title, price));
28+
}
29+
}
30+
31+
@Override
32+
public Site getSite() {
33+
return site;
34+
}
35+
36+
public List<Book> getBooks() {
37+
return Collections.unmodifiableList(books);
38+
}
39+
40+
public static void main(String[] args) {
41+
BookScraper bookScraper = new BookScraper();
42+
Spider.create(bookScraper)
43+
.addUrl("https://books.toscrape.com/")
44+
.thread(1)
45+
.run();
46+
47+
bookScraper.getBooks().forEach(book ->
48+
System.out.println("Title: " + book.getTitle() + " | Price: " + book.getPrice()));
49+
}
50+
}
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
package com.baeldung.webmagic;
2+
3+
import us.codecraft.webmagic.Spider;
4+
import org.junit.jupiter.api.Test;
5+
import java.util.List;
6+
import static org.junit.jupiter.api.Assertions.*;
7+
8+
public class BookScraperLiveTest {
9+
10+
@Test
11+
void whenScrapeABookSite_thenShouldReturnTitleAndPrice() {
12+
BookScraper scraper = new BookScraper();
13+
14+
Spider.create(scraper)
15+
.addUrl("https://books.toscrape.com/")
16+
.thread(1)
17+
.run();
18+
19+
List<Book> books = scraper.getBooks();
20+
21+
assertFalse(books.isEmpty(), "Expected to scrape at least one book.");
22+
assertTrue(books.size() <= 10, "Should not scrape more than 10 books.");
23+
24+
for (Book book : books) {
25+
assertNotNull(book.getTitle(), "Book title should not be null.");
26+
assertFalse(book.getTitle().isBlank(), "Book title should not be blank.");
27+
assertNotNull(book.getPrice(), "Book price should not be null.");
28+
assertTrue(book.getPrice().matches("£?\\d+(\\.\\d{2})?"), "Book price format seems invalid: " + book.getPrice());
29+
}
30+
}
31+
32+
@Test
33+
void whenScrapeBookSite_thenParseAndSortBookPrices() {
34+
BookScraper scraper = new BookScraper();
35+
Spider.create(scraper)
36+
.addUrl("https://books.toscrape.com/")
37+
.thread(1)
38+
.run();
39+
40+
List<Book> books = scraper.getBooks();
41+
assertFalse(books.isEmpty(), "No books were scraped.");
42+
43+
// Extract numerical prices from string (e.g., £51.77 -> 51.77)
44+
List<Double> prices = books.stream()
45+
.map(Book::getPrice)
46+
.map(p -> p.replace("£", ""))
47+
.map(Double::parseDouble)
48+
.toList();
49+
50+
List<Double> sorted = prices.stream().sorted((a, b) -> Double.compare(b, a)).toList();
51+
52+
assertEquals(sorted, prices.stream().sorted((a, b) -> Double.compare(b, a)).toList(),
53+
"Prices are not in descending order after sorting.");
54+
}
55+
56+
@Test
57+
void whenScrapeBookSite_thenBookTitlesShouldContainExpectedWords() {
58+
BookScraper scraper = new BookScraper();
59+
Spider.create(scraper)
60+
.addUrl("https://books.toscrape.com/")
61+
.thread(1)
62+
.run();
63+
64+
List<Book> books = scraper.getBooks();
65+
assertFalse(books.isEmpty(), "No books were scraped.");
66+
67+
boolean foundKeyword = books.stream()
68+
.map(Book::getTitle)
69+
.anyMatch(title -> title.toLowerCase().matches(".*\\b(book|story|novel|guide|life)\\b.*"));
70+
71+
assertTrue(foundKeyword, "No book titles contain expected keywords.");
72+
}
73+
74+
75+
@Test
76+
void whenScrapeBookSiteMultipleTimes_thenBookCountShouldStableBetweenRuns() {
77+
BookScraper scraper1 = new BookScraper();
78+
Spider.create(scraper1)
79+
.addUrl("https://books.toscrape.com/")
80+
.thread(1)
81+
.run();
82+
83+
BookScraper scraper2 = new BookScraper();
84+
Spider.create(scraper2)
85+
.addUrl("https://books.toscrape.com/")
86+
.thread(1)
87+
.run();
88+
89+
int count1 = scraper1.getBooks().size();
90+
int count2 = scraper2.getBooks().size();
91+
92+
assertEquals(count1, count2, "Book count is not stable between two runs.");
93+
}
94+
}

0 commit comments

Comments
 (0)