Skip to content

Commit 21d5c46

Browse files
authored
Merge pull request #10 from JadeCara/jade/webcrawl_wikipedia
webcrawler
2 parents 1465985 + 4eadd28 commit 21d5c46

File tree

3 files changed

+124
-0
lines changed

3 files changed

+124
-0
lines changed

module2/webcrawl-rayon/Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Manifest for the webcrawl-rayon example binary.
[package]
name = "webcrawl-rayon"
version = "0.1.0"
edition = "2021"

[dependencies]
# Client used by src/main.rs to look up Wikipedia pages by title.
wikipedia = "0.3.4"
# Provides the parallel iterators (`par_iter`) used by src/main.rs.
rayon = "1.7.0"

module2/webcrawl-rayon/Makefile

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
SHELL := /bin/bash

# Every target here is a command name, not a file; declaring them phony keeps
# a stray file called e.g. `build` or `test` from silently disabling a target.
.PHONY: help clean build run test lint format release all bump

# Prints every target that carries a `## description` suffix, sorted by name.
help: ## Show this help message
	@grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'

clean: ## Clean the project using cargo
	cargo clean

build: ## Build the project using cargo
	cargo build

run: ## Run the project using cargo
	cargo run

test: ## Run the tests using cargo
	cargo test

lint: ## Run the linter using cargo
	@rustup component add clippy 2> /dev/null
	cargo clippy

format: ## Format the code using cargo
	@rustup component add rustfmt 2> /dev/null
	cargo fmt

release: ## Build the project in release mode using cargo
	cargo build --release

all: format lint test run ## Format, lint, test and run the project
31+
32+
# Prompts for a new version and rewrites the `version = ...` line in Cargo.toml.
# `sed -i.bak` is accepted by both GNU and BSD sed; the backup file is removed
# with `rm -f` so the target does not fail when cleanup has nothing to delete.
# NOTE: the entered version is spliced into a sed expression unescaped, so it
# must not contain `/`, `&` or `"`.
bump: ## Bump the version of the project
	@echo "Current version is $$(cargo pkgid | cut -d# -f2)"
	@read -r -p "Enter the new version: " version; \
	if [ -z "$$version" ]; then echo "No version entered; aborting." >&2; exit 1; fi; \
	sed -i.bak -E "s/^version = .*/version = \"$$version\"/" Cargo.toml && rm -f Cargo.toml.bak
	@echo "Version bumped to $$(cargo pkgid | cut -d# -f2)"

module2/webcrawl-rayon/src/main.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
 * Uses the wikipedia crate to fetch pages
 * Processes page content
 * Collects timing metrics
 * Processes pages concurrently
 * Demonstrates crate usage and concurrency in Rust
 */
13+
14+
use rayon::prelude::*;
15+
use wikipedia::http::default::Client;
16+
use wikipedia::Page;
17+
use wikipedia::Wikipedia;
18+
19+
/// The pieces of a Wikipedia page that this program reports on.
#[derive(Debug, Clone, PartialEq, Eq)]
struct ProcessedPage {
    /// Title of the page, as returned by the Wikipedia client.
    title: String,
    /// Full text content of the page, as returned by the Wikipedia client.
    data: String,
}
23+
24+
/// Titles of the Wikipedia pages that `main` fetches and summarizes.
const PAGES: [&str; 9] = [
    "Giannis Antetokounmpo",
    "James Harden",
    "Russell Westbrook",
    "Stephen Curry",
    "Kevin Durant",
    "LeBron James",
    "Kobe Bryant",
    "Michael Jordan",
    "Shaquille O'Neal",
];
35+
36+
fn process_page(page: &Page<Client>) -> ProcessedPage {
37+
let title = page.get_title().unwrap();
38+
let content = page.get_content().unwrap();
39+
ProcessedPage {
40+
title,
41+
data: content,
42+
}
43+
}
44+
45+
//times how long it takes to process the pages and total time
46+
fn main() {
47+
//start timer
48+
let start = std::time::Instant::now();
49+
let wikipedia = Wikipedia::<Client>::default();
50+
let pages: Vec<_> = PAGES
51+
.par_iter() //parallel iterator
52+
.map(|&p| wikipedia.page_from_title(p.to_string()))
53+
.collect();
54+
55+
let processed_pages: Vec<ProcessedPage> = pages.par_iter().map(process_page).collect();
56+
for page in processed_pages {
57+
//time how long it takes to process each page
58+
let start_page = std::time::Instant::now();
59+
60+
println!("Title: {}", page.title.as_str());
61+
//grab first sentence of the page
62+
let first_sentence = page.data.split('.').next().unwrap();
63+
println!("First sentence: {}", first_sentence);
64+
//count the number of words in the page
65+
let word_count = page.data.split_whitespace().count();
66+
println!("Word count: {}", word_count);
67+
//prints time it took to process each page
68+
println!("Page time: {:?}", start_page.elapsed());
69+
}
70+
//descriptive statistics of: total time, average time per page, and total number of pages, as well as the number of threads used
71+
println!("Total time: {:?}", start.elapsed());
72+
println!(
73+
"Average time per page: {:?}",
74+
start.elapsed() / PAGES.len() as u32
75+
);
76+
println!("Total number of pages: {}", PAGES.len());
77+
println!("Number of threads: {}", rayon::current_num_threads());
78+
}

0 commit comments

Comments
 (0)