Skip to content

Commit 7cd21c2

Browse files
authored
update scraper (#684)
1 parent 1700471 commit 7cd21c2

File tree

11 files changed

+181
-36
lines changed

11 files changed

+181
-36
lines changed

.cspell.json

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,12 +21,14 @@
2121
"pkgs",
2222
"psql",
2323
"qiita",
24+
"replacen",
2425
"reqwest",
2526
"rustc",
2627
"safify",
2728
"stdenv",
2829
"supabase",
29-
"swiper"
30+
"swiper",
31+
"zenki"
3032
],
3133
"dictionaries": [
3234
"softwareTerms",
@@ -52,6 +54,7 @@
5254
"**/*.svg",
5355
"**/migration.sql",
5456
"**/data.json",
57+
"**/server/src/seeds/json/**",
5558
"**/Cargo.*",
5659
"scraper/target",
5760
"**/rust-toolchain.toml",

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,9 @@
33
/.direnv
44
/.husky
55

6+
.cache
7+
data.json
8+
69
# Logs
710
logs
811
*.log

biome.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
"bun.lockb",
2323
"server/target",
2424
"data.json",
25+
"server/src/seeds/json",
2526
"scraper/target",
2627
".next",
2728
"next-env.d.ts",

flake.lock

Lines changed: 9 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

flake.nix

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@
3232
};
3333
unstable = nixpkgs-unstable.legacyPackages.${system};
3434

35-
rust-bin = pkgs.rust-bin.fromRustupToolchainFile ./scraper/rust-toolchain.toml;
35+
rust-bin = pkgs.rust-bin.beta.latest.default; # pkgs.rust-bin.fromRustupToolchainFile ./scraper/rust-toolchain.toml;
3636
prisma = pkgs.callPackage ./server/prisma.nix {inherit prisma-utils;};
3737

3838
common = {
@@ -62,8 +62,7 @@
6262
};
6363
in {
6464
packages.scraper = pkgs.callPackage ./scraper {toolchain = rust-bin;};
65-
devShells.default = pkgs.mkShell common;
66-
devShells.scraper = pkgs.mkShell {
65+
devShells.default = pkgs.mkShell {
6766
inherit (common) env;
6867
packages =
6968
common.packages

scraper/sample.ts

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,14 @@
1+
[
2+
{
3+
name: "zenki",
4+
courses: [
5+
{
6+
name: "数理科学基礎",
7+
teacher: "(人名)",
8+
semester: "S1",
9+
period: "月曜2限、水曜1限",
10+
code: "30003 CAS-FC1871L1",
11+
},
12+
],
13+
},
14+
];

scraper/src/io.rs

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
use crate::types::*;
22
use anyhow::ensure;
3-
use sha2::{Digest, Sha256};
43
use tokio::fs;
54
use tokio::io::AsyncWriteExt;
65

@@ -10,13 +9,20 @@ pub async fn write_to(file: &mut fs::File, content: Entry) -> anyhow::Result<()>
109
Ok(())
1110
}
1211

13-
use crate::CACHE_DIR;
12+
use crate::cache_dir;
1413

1514
pub async fn request(url: &str) -> anyhow::Result<String> {
1615
println!("[request] sending request to {}", url);
1716

18-
let hash = Sha256::digest(url.as_bytes());
19-
let path = format!("{CACHE_DIR}/{:x}", hash);
17+
let cache_key = url
18+
.to_string()
19+
.replacen("/", "_", 1000)
20+
.replacen(":", "_", 1000)
21+
.replacen("?", "_", 1000)
22+
.replacen("&", "_", 1000)
23+
.replacen("=", "_", 1000)
24+
.to_string();
25+
let path = format!("{}/{cache_key}", cache_dir());
2026
if let Ok(bytes) = fs::read(&path).await {
2127
if let Ok(text) = String::from_utf8(bytes) {
2228
return Ok(text);

scraper/src/main.rs

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,13 +16,16 @@ use scraper::{Html, Selector};
1616
use urls::URLS;
1717

1818
const RESULT_FILE: &str = "./data.json";
19-
const CACHE_DIR: &str = "./.cache";
19+
20+
fn cache_dir() -> String {
21+
"./.cache".to_string()
22+
}
2023

2124
#[tokio::main(flavor = "multi_thread")]
2225
async fn main() {
2326
println!("[log] starting...");
2427

25-
let _ = fs::DirBuilder::new().create(CACHE_DIR).await;
28+
let _ = fs::DirBuilder::new().create(cache_dir()).await;
2629

2730
let mut file = fs::File::create(RESULT_FILE)
2831
.await
@@ -59,7 +62,8 @@ async fn get_courses_of(base_url: &str) -> Vec<Course> {
5962
futures::future::join_all(courses)
6063
.await
6164
.into_iter()
62-
.collect::<Vec<_>>()
65+
.flatten()
66+
.collect()
6367
}
6468

6569
lazy_static! {

scraper/src/parser.rs

Lines changed: 20 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
use anyhow::anyhow;
22
use lazy_static::lazy_static;
3-
use scraper::{Html, Selector};
3+
use scraper::{ElementRef, Html, Selector};
44

55
use crate::types::*;
66

@@ -17,19 +17,24 @@ lazy_static! {
1717
Selector::parse(".catalog-page-detail-table-cell.code-cell").unwrap();
1818
}
1919

20-
pub fn parse_course_info(html: Html) -> anyhow::Result<Course> {
21-
Ok(Course {
22-
name: select(&html, &NAME_SELECTOR, 1)?,
23-
teacher: select(&html, &TEACHER_SELECTOR, 1)?,
24-
semester: select_all(&html, &SEMESTER_SELECTOR, 1)?.join(","),
25-
period: select(&html, &PERIOD_SELECTOR, 1)?,
26-
code: select_all(&html, &CODE_SELECTOR, 1)?.join(" "),
27-
})
20+
pub fn parse_course_info(html: Html) -> anyhow::Result<Vec<Course>> {
21+
html.select(&Selector::parse(".catalog-page-detail-table-row").unwrap())
22+
.skip(1)
23+
.map(|el| {
24+
Ok(Course {
25+
name: select(&el, &NAME_SELECTOR)?,
26+
teacher: select(&el, &TEACHER_SELECTOR)?,
27+
semester: select_all(&el, &SEMESTER_SELECTOR)?.join(","),
28+
period: select(&el, &PERIOD_SELECTOR)?,
29+
code: select_all(&el, &CODE_SELECTOR)?.join(" "),
30+
})
31+
})
32+
.collect()
2833
}
2934

30-
fn select(html: &Html, selector: &Selector, nth: usize) -> anyhow::Result<String> {
31-
html.select(selector)
32-
.nth(nth)
35+
fn select(el: &ElementRef, selector: &Selector) -> anyhow::Result<String> {
36+
el.select(selector)
37+
.next()
3338
.ok_or(anyhow!(
3439
"Couldn't find matching element for selector {:?}",
3540
selector,
@@ -38,12 +43,12 @@ fn select(html: &Html, selector: &Selector, nth: usize) -> anyhow::Result<String
3843
}
3944

4045
fn select_all<'a>(
41-
html: &'a Html,
46+
html: &'a ElementRef,
4247
selector: &'static Selector,
43-
nth: usize,
48+
// nth: usize,
4449
) -> anyhow::Result<Vec<&'a str>> {
4550
html.select(selector)
46-
.nth(nth)
51+
.next()
4752
.ok_or(anyhow!(
4853
"Couldn't find matching element for selector {:?}",
4954
selector,

scraper/src/urls.rs

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,8 @@
1-
pub static URLS: [(&str, &str); 10] = [
1+
pub static URLS: [(&str, &str); 11] = [
2+
(
3+
"zenki",
4+
"https://catalog.he.u-tokyo.ac.jp/result?q=&type=all&faculty_id=&facet=%7B%22faculty_type%22%3A%5B%22jd%22%5D%7D&page=",
5+
),
26
(
37
"law",
48
"https://catalog.he.u-tokyo.ac.jp/result?type=ug&faculty_id=1&page=",

0 commit comments

Comments
 (0)