Skip to content

Commit 35787dc

Browse files
committed
feat(ads,tracking): add split ads and tracking list
1 parent 6520ea4 commit 35787dc

File tree

4 files changed

+106
-15
lines changed

4 files changed

+106
-15
lines changed

Cargo.lock

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_firewall"
3-
version = "2.28.7"
3+
version = "2.30.2"
44
authors = [
55
"j-mendez <jeff@spider.cloud>"
66
]

build.rs

Lines changed: 72 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1+
use hashbrown::HashSet;
12
use reqwest::blocking::Client;
23
use serde::Deserialize;
3-
use hashbrown::HashSet;
44
use std::env;
55
use std::fs;
66
use std::path::PathBuf;
@@ -19,6 +19,9 @@ struct GithubContent {
1919
fn main() -> std::io::Result<()> {
2020
let client = Client::new();
2121
let mut unique_entries = HashSet::new();
22+
let mut unique_ads_entries = HashSet::new();
23+
let mut unique_tracking_entries = HashSet::new();
24+
let mut unique_gambling_entries = HashSet::new();
2225

2326
// Fetch and process GitHub directory files
2427
let base_url = "https://api.github.com/repos/ShadowWhisperer/BlockLists/contents/RAW";
@@ -30,7 +33,20 @@ fn main() -> std::io::Result<()> {
3033

3134
let contents: Vec<GithubContent> = response.json().expect("Failed to parse JSON response");
3235

36+
let skip_list = vec![
37+
"Cryptocurrency",
38+
"Dating",
39+
"Fonts",
40+
"Microsoft",
41+
"Marketing",
42+
"Wild_Tracking",
43+
];
44+
3345
for item in contents {
46+
// ignore these websites.
47+
if skip_list.contains(&item.name.as_str()) {
48+
continue;
49+
}
3450
if item.content_type == "file" {
3551
let file_url = format!(
3652
"https://raw.githubusercontent.com/ShadowWhisperer/BlockLists/master/{}",
@@ -42,9 +58,30 @@ fn main() -> std::io::Result<()> {
4258
.expect("Failed to fetch file content");
4359

4460
let file_content = file_response.text().expect("Failed to read file content");
45-
for line in file_content.lines() {
46-
if !line.is_empty() {
47-
unique_entries.insert(line.to_string());
61+
62+
if item.name == "Wild_Tracking" || item.name == "Tracking" {
63+
for line in file_content.lines() {
64+
if !line.is_empty() {
65+
unique_tracking_entries.insert(line.to_string());
66+
}
67+
}
68+
} else if item.name == "Wild_Ads" || item.name == "Ads" {
69+
for line in file_content.lines() {
70+
if !line.is_empty() {
71+
unique_ads_entries.insert(line.to_string());
72+
}
73+
}
74+
} else if item.name == "Gambling" {
75+
for line in file_content.lines() {
76+
if !line.is_empty() {
77+
unique_gambling_entries.insert(line.to_string());
78+
}
79+
}
80+
} else {
81+
for line in file_content.lines() {
82+
if !line.is_empty() {
83+
unique_entries.insert(line.to_string());
84+
}
4885
}
4986
}
5087
}
@@ -68,23 +105,50 @@ fn main() -> std::io::Result<()> {
68105
}
69106
}
70107

71-
// Begin building the phf set from the unique entries
72108
let mut set = phf_codegen::Set::new();
73109

74110
for entry in unique_entries {
75111
set.entry(entry);
76112
}
77113

114+
let mut ads_set = phf_codegen::Set::new();
115+
116+
for entry in unique_ads_entries {
117+
ads_set.entry(entry);
118+
}
119+
120+
let mut tracking_set = phf_codegen::Set::new();
121+
122+
for entry in unique_tracking_entries {
123+
tracking_set.entry(entry);
124+
}
125+
126+
let mut gambling_set = phf_codegen::Set::new();
127+
128+
for entry in unique_gambling_entries {
129+
gambling_set.entry(entry);
130+
}
131+
132+
78133
// Write to destination
79134
let out_dir = env::var("OUT_DIR").unwrap();
80135
let dest_path = PathBuf::from(out_dir).join("bad_websites.rs");
81136

82137
fs::write(
83138
&dest_path,
84139
format!(
85-
"/// Bad websites that we should not crawl.\n\
86-
static BAD_WEBSITES: phf::Set<&'static str> = {};",
87-
set.build()
140+
"/// Bad websites that we should not connect to.\n\
141+
static BAD_WEBSITES: phf::Set<&'static str> = {};\n
142+
/// Ads websites that we should not connect to.\n\
143+
static ADS_WEBSITES: phf::Set<&'static str> = {};\n
144+
/// Tracking websites that we should not connect to.\n\
145+
static TRACKING_WEBSITES: phf::Set<&'static str> = {};\n
146+
/// Gambling websites that we should not connect to.\n\
147+
static GAMBLING_WEBSITES: phf::Set<&'static str> = {};",
148+
set.build(),
149+
ads_set.build(),
150+
tracking_set.build(),
151+
gambling_set.build()
88152
),
89153
)?;
90154

src/lib.rs

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,25 @@
11
include!(concat!(env!("OUT_DIR"), "/bad_websites.rs"));
22

3-
/// The url is in the bad website.
3+
/// The url is in the bad list.
44
pub fn is_bad_website_url(host: &str) -> bool {
55
BAD_WEBSITES.contains(&host)
66
}
77

8+
/// The url is in the ads list.
9+
pub fn is_ad_website_url(host: &str) -> bool {
10+
ADS_WEBSITES.contains(&host)
11+
}
12+
13+
/// The url is in the tracking list.
14+
pub fn is_tracking_website_url(host: &str) -> bool {
15+
TRACKING_WEBSITES.contains(&host)
16+
}
17+
18+
/// The url is in the gambling list.
19+
pub fn is_gambling_website_url(host: &str) -> bool {
20+
GAMBLING_WEBSITES.contains(&host)
21+
}
22+
823
#[cfg(test)]
924
mod tests {
1025
use super::*;
@@ -31,4 +46,16 @@ mod tests {
3146
let bad_website = "10minutesto1.net";
3247
assert!(is_bad_website_url(bad_website.to_lowercase().as_str()));
3348
}
49+
50+
#[test]
fn test_is_ad_website_url() {
    // The literal is already a `&str`, so it is passed straight through;
    // `.as_str()` is not a stable method on `str` and would not compile.
    let bad_website = "admob.google.com";
    assert!(is_ad_website_url(bad_website));
}
55+
56+
#[test]
57+
fn test_is_tracking_website_url() {
58+
let bad_website = "2.atlasroofing.com";
59+
assert!(is_tracking_website_url(bad_website.as_str()));
60+
}
3461
}

0 commit comments

Comments
 (0)