Merge pull request #10 from JadeCara/jade/webcrawl_wikipedia

JadeCara · web-flow · commit 21d5c4692747 · 2025-01-03T15:37:58.000-07:00
webcrawler
diff --git a/module2/webcrawl-rayon/Cargo.toml b/module2/webcrawl-rayon/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "webcrawl-rayon"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+wikipedia = "0.3.4" # client used by src/main.rs to look up page titles and content
+rayon = "1.7.0" # supplies the `par_iter` parallel iterators used in src/main.rs
diff --git a/module2/webcrawl-rayon/Makefile b/module2/webcrawl-rayon/Makefile
@@ -0,0 +1,38 @@
+SHELL := /bin/bash
+# No target in this Makefile produces a file named after itself, so every one
+# is declared phony; otherwise a stray file called e.g. `test` would disable it.
+.PHONY: help clean build run test lint format release all bump
+
+# `help` scans this Makefile for "target: <double-hash> description" lines and
+# prints them as a sorted, aligned, colourised list. A target only shows up in
+# that list if its rule line carries such a description.
+help: ## Show this help message
+	@grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
+
+clean: ## Clean the project using cargo
+	cargo clean
+
+build: ## Build the project using cargo
+	cargo build
+
+run: ## Run the project using cargo
+	cargo run
+
+test: ## Run the tests using cargo
+	cargo test
+
+lint: ## Run the linter using cargo
+	@rustup component add clippy 2> /dev/null
+	cargo clippy
+
+format: ## Format the code using cargo
+	@rustup component add rustfmt 2> /dev/null
+	cargo fmt
+
+release: ## Build the project in release mode using cargo
+	cargo build --release
+
+all: format lint test run ## Format, lint, test, then run the project
+
+# Prompts for a new version and rewrites the `version = ...` line of Cargo.toml.
+# `sed -i.bak` is used because the attached-suffix spelling is accepted by both
+# GNU and BSD sed; the backup it leaves behind is removed right after the edit.
+# An empty answer aborts instead of writing `version = ""`.
+bump: ## Bump the version of the project
+	@echo "Current version is $(shell cargo pkgid | cut -d# -f2)"
+	@read -p "Enter the new version: " version; \
+	if [ -z "$$version" ]; then echo "No version entered; Cargo.toml left unchanged." >&2; exit 1; fi; \
+	sed -i.bak -E "s/^version = .*/version = \"$$version\"/" Cargo.toml && rm -f Cargo.toml.bak
+	@echo "Version bumped to $$(cargo pkgid | cut -d# -f2)"
diff --git a/module2/webcrawl-rayon/src/main.rs b/module2/webcrawl-rayon/src/main.rs
@@ -0,0 +1,78 @@
+/*
+
+* Uses wikipedia crate to fetch pages
+
+* Processes page content
+
+* Collects timing metrics
+
+* Concurrent page processing
+
+* Shows crate usage and concurrency in Rust
+*/
+
+use rayon::prelude::*;
+use wikipedia::http::default::Client;
+use wikipedia::Page;
+use wikipedia::Wikipedia;
+
+struct ProcessedPage { // title and text of one page, as built by `process_page`
+    title: String, // value obtained from `Page::get_title`
+    data: String, // value obtained from `Page::get_content`
+}
+
+const PAGES: [&str; 9] = [ // titles handed to `Wikipedia::page_from_title` in `main`
+    "Giannis Antetokounmpo",
+    "James Harden",
+    "Russell Westbrook",
+    "Stephen Curry",
+    "Kevin Durant",
+    "LeBron James",
+    "Kobe Bryant",
+    "Michael Jordan",
+    "Shaquille O'Neal",
+];
+
+/// Retrieves the title and content of `page` and bundles them into a
+/// `ProcessedPage`.
+///
+/// # Panics
+///
+/// Panics if either the title or the content cannot be retrieved. The
+/// signature has no way to report failure, so the panic message states which
+/// lookup failed and includes the underlying error.
+fn process_page(page: &Page<Client>) -> ProcessedPage {
+    let title = page
+        .get_title()
+        .unwrap_or_else(|err| panic!("failed to get page title: {:?}", err));
+    // The title is already known here, so name the page in the message.
+    let data = page
+        .get_content()
+        .unwrap_or_else(|err| panic!("failed to get content of {:?}: {:?}", title, err));
+    ProcessedPage { title, data }
+}
+
+/// Processes every title in `PAGES` in parallel, prints a short summary of
+/// each page, and finishes with overall timing statistics.
+///
+/// "Page time" is the time spent in `process_page` for that page plus the
+/// time spent analysing its text; the cost of printing is excluded.
+///
+/// # Panics
+///
+/// Panics if `process_page` panics, i.e. if a page's title or content cannot
+/// be retrieved.
+fn main() {
+    let start = std::time::Instant::now();
+    let wikipedia = Wikipedia::<Client>::default();
+
+    // Time each page inside the parallel closure so the per-page figure
+    // reflects the work done for that page rather than the `println!` calls.
+    let processed_pages: Vec<(ProcessedPage, std::time::Duration)> = PAGES
+        .par_iter()
+        .map(|&title| {
+            let fetch_start = std::time::Instant::now();
+            let page = wikipedia.page_from_title(title.to_string());
+            let processed = process_page(&page);
+            (processed, fetch_start.elapsed())
+        })
+        .collect();
+
+    for (page, fetch_time) in &processed_pages {
+        let analysis_start = std::time::Instant::now();
+        // `split` always yields at least one item, so the fallback is never used.
+        let first_sentence = page.data.split('.').next().unwrap_or("");
+        let word_count = page.data.split_whitespace().count();
+        let page_time = *fetch_time + analysis_start.elapsed();
+
+        println!("Title: {}", page.title);
+        println!("First sentence: {}", first_sentence);
+        println!("Word count: {}", word_count);
+        println!("Page time: {:?}", page_time);
+    }
+
+    // Sample the clock once so the total and the average agree with each other.
+    let total = start.elapsed();
+    let page_count = u32::try_from(PAGES.len()).expect("page count fits in u32");
+    println!("Total time: {:?}", total);
+    println!("Average time per page: {:?}", total / page_count);
+    println!("Total number of pages: {}", PAGES.len());
+    println!("Number of threads: {}", rayon::current_num_threads());
+}