1+ use hashbrown:: HashSet ;
12use reqwest:: blocking:: Client ;
23use serde:: Deserialize ;
3- use hashbrown:: HashSet ;
44use std:: env;
55use std:: fs;
66use std:: path:: PathBuf ;
@@ -19,6 +19,9 @@ struct GithubContent {
1919fn main ( ) -> std:: io:: Result < ( ) > {
2020 let client = Client :: new ( ) ;
2121 let mut unique_entries = HashSet :: new ( ) ;
22+ let mut unique_ads_entries = HashSet :: new ( ) ;
23+ let mut unique_tracking_entries = HashSet :: new ( ) ;
24+ let mut unique_gambling_entries = HashSet :: new ( ) ;
2225
2326 // Fetch and process GitHub directory files
2427 let base_url = "https://api.github.com/repos/ShadowWhisperer/BlockLists/contents/RAW" ;
@@ -30,7 +33,20 @@ fn main() -> std::io::Result<()> {
3033
3134 let contents: Vec < GithubContent > = response. json ( ) . expect ( "Failed to parse JSON response" ) ;
3235
36+ let skip_list = vec ! [
37+ "Cryptocurrency" ,
38+ "Dating" ,
39+ "Fonts" ,
40+ "Microsoft" ,
41+ "Marketing" ,
42+ "Wild_Tracking" ,
43+ ] ;
44+
3345 for item in contents {
46+ // Skip directory entries whose names appear in `skip_list`.
47+ if skip_list. contains ( & item. name . as_str ( ) ) {
48+ continue ;
49+ }
3450 if item. content_type == "file" {
3551 let file_url = format ! (
3652 "https://raw.githubusercontent.com/ShadowWhisperer/BlockLists/master/{}" ,
@@ -42,9 +58,30 @@ fn main() -> std::io::Result<()> {
4258 . expect ( "Failed to fetch file content" ) ;
4359
4460 let file_content = file_response. text ( ) . expect ( "Failed to read file content" ) ;
45- for line in file_content. lines ( ) {
46- if !line. is_empty ( ) {
47- unique_entries. insert ( line. to_string ( ) ) ;
61+
62+ if item. name == "Wild_Tracking" || item. name == "Tracking" {
63+ for line in file_content. lines ( ) {
64+ if !line. is_empty ( ) {
65+ unique_tracking_entries. insert ( line. to_string ( ) ) ;
66+ }
67+ }
68+ } else if item. name == "Wild_Ads" || item. name == "Ads" {
69+ for line in file_content. lines ( ) {
70+ if !line. is_empty ( ) {
71+ unique_ads_entries. insert ( line. to_string ( ) ) ;
72+ }
73+ }
74+ } else if item. name == "Gambling" {
75+ for line in file_content. lines ( ) {
76+ if !line. is_empty ( ) {
77+ unique_gambling_entries. insert ( line. to_string ( ) ) ;
78+ }
79+ }
80+ } else {
81+ for line in file_content. lines ( ) {
82+ if !line. is_empty ( ) {
83+ unique_entries. insert ( line. to_string ( ) ) ;
84+ }
4885 }
4986 }
5087 }
@@ -68,23 +105,50 @@ fn main() -> std::io::Result<()> {
68105 }
69106 }
70107
71- // Begin building the phf set from the unique entries
72108 let mut set = phf_codegen:: Set :: new ( ) ;
73109
74110 for entry in unique_entries {
75111 set. entry ( entry) ;
76112 }
77113
114+ let mut ads_set = phf_codegen:: Set :: new ( ) ;
115+
116+ for entry in unique_ads_entries {
117+ ads_set. entry ( entry) ;
118+ }
119+
120+ let mut tracking_set = phf_codegen:: Set :: new ( ) ;
121+
122+ for entry in unique_tracking_entries {
123+ tracking_set. entry ( entry) ;
124+ }
125+
126+ let mut gambling_set = phf_codegen:: Set :: new ( ) ;
127+
128+ for entry in unique_gambling_entries {
129+ gambling_set. entry ( entry) ;
130+ }
131+
132+
78133 // Write to destination
79134 let out_dir = env:: var ( "OUT_DIR" ) . unwrap ( ) ;
80135 let dest_path = PathBuf :: from ( out_dir) . join ( "bad_websites.rs" ) ;
81136
82137 fs:: write (
83138 & dest_path,
84139 format ! (
85- "/// Bad websites that we should not crawl.\n \
86- static BAD_WEBSITES: phf::Set<&'static str> = {};",
87- set. build( )
140+ "/// Bad websites that we should not connect to.\n \
141+ static BAD_WEBSITES: phf::Set<&'static str> = {};\n
142+ /// Ads websites that we should not connect to.\n \
143+ static ADS_WEBSITES: phf::Set<&'static str> = {};\n
144+ /// Tracking websites that we should not connect to.\n \
145+ static TRACKING_WEBSITES: phf::Set<&'static str> = {};\n
146+ /// Gambling websites that we should not connect to.\n \
147+ static GAMBLING_WEBSITES: phf::Set<&'static str> = {};",
148+ set. build( ) ,
149+ ads_set. build( ) ,
150+ tracking_set. build( ) ,
151+ gambling_set. build( )
88152 ) ,
89153 ) ?;
90154
0 commit comments