diff --git a/topics/web-scraper/Cargo.lock b/topics/web-scraper/Cargo.lock new file mode 100644 index 0000000..bdbf716 --- /dev/null +++ b/topics/web-scraper/Cargo.lock @@ -0,0 +1,2183 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "addr2line" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.3", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.60.2", +] + +[[package]] +name = "anyhow" +version = "1.0.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "backtrace" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-link", +] + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" + +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + +[[package]] +name = "cc" +version = "1.2.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" + +[[package]] +name = "clap" +version = "4.5.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "clap_lex" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cssparser" +version = "0.31.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.11.3", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn 2.0.106", +] + +[[package]] +name = "derive_more" +version = "0.99.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "dtoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + +[[package]] +name = "ego-tree" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642" + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "find-msvc-tools" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-core", + "futures-task", + "pin-project-lite", + "pin-utils", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getopts" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "getrandom" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.1+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi 0.14.7+wasi-0.2.4", +] + +[[package]] +name = "gimli" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" + +[[package]] +name = "h2" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "0.14.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2 0.5.10", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-tls" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +dependencies = [ + "bytes", + "hyper", + "native-tls", + "tokio", + "tokio-native-tls", +] + +[[package]] +name = "icu_collections" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" + +[[package]] +name = "icu_properties" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "potential_utf", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" + +[[package]] +name = "icu_provider" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +dependencies = [ + "displaydoc", + "icu_locale_core", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexmap" +version = "2.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "io-uring" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" +dependencies = [ + "bitflags 2.9.4", + "cfg-if", + "libc", +] + +[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "js-sys" +version = "0.3.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.177" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" + +[[package]] +name = "linux-raw-sys" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" + +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf 0.10.1", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", +] + +[[package]] +name = "mio" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +dependencies = [ + "libc", + "wasi 0.11.1+wasi-snapshot-preview1", + "windows-sys 0.59.0", +] + +[[package]] +name = "native-tls" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + +[[package]] +name = "openssl" +version = "0.10.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8" +dependencies = [ + "bitflags 2.9.4", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + +[[package]] +name = "openssl-sys" +version = "0.9.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90096e2e47630d78b7d1c20952dc621f957103f8bc2c8359ec81290d75238571" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared 0.10.0", +] + +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared 0.11.3", + "rand", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher 0.3.11", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher 1.0.1", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "potential_utf" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" +dependencies = [ + "zerovec", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + +[[package]] +name = "proc-macro2" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.16", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags 2.9.4", +] + +[[package]] +name = "reqwest" +version = "0.11.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-tls", + "ipnet", + "js-sys", + "log", + "mime", + "native-tls", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "system-configuration", + "tokio", + "tokio-native-tls", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "winreg", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" + +[[package]] +name = "rustix" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" +dependencies = [ + "bitflags 2.9.4", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls-pemfile" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + "base64", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "schannel" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "scraper" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c95a930e03325234c18c7071fd2b60118307e025d6fff3e12745ffbf63a3d29c" +dependencies = [ + "ahash", + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "once_cell", + "selectors", + "smallvec", + "tendril", +] + +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags 2.9.4", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "selectors" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" +dependencies = [ + "bitflags 2.9.4", + "cssparser", + "derive_more", + "fxhash", + "log", + "new_debug_unreachable", + "phf 0.10.1", + "phf_codegen", + "precomputed-hash", + "servo_arc", + "smallvec", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "serde_json" +version = "1.0.145" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", + "serde_core", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "servo_arc" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" +dependencies = [ + "stable_deref_trait", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "signal-hook-registry" +version = "1.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" +dependencies = [ + "libc", +] + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + +[[package]] +name = "slab" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "socket2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "socket2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +dependencies = [ + "libc", + "windows-sys 0.59.0", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 0.11.3", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tempfile" +version = "3.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" +dependencies = [ + "fastrand", + "getrandom 0.3.3", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + +[[package]] +name = "tinystr" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tokio" +version = "1.47.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" +dependencies = [ + "backtrace", + "bytes", + "io-uring", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "slab", + "socket2 0.6.0", + "tokio-macros", + "windows-sys 0.59.0", +] + +[[package]] +name = "tokio-macros" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-test" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2468baabc3311435b55dd935f702f42cd1b8abb7e754fb7dfb16bd36aa88f9f7" +dependencies = [ + "async-stream", + "bytes", + "futures-core", + "tokio", + "tokio-stream", +] + +[[package]] +name = "tokio-util" +version = "0.7.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +dependencies = [ + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "unicode-ident" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "url" +version = "2.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasi" +version = "0.14.7+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c" +dependencies = [ + "wasip2", +] + +[[package]] +name = "wasip2" +version = "1.0.1+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn 2.0.106", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webcrawl" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "reqwest", + "scraper", + "tempfile", + "tokio", + "tokio-test", + "url", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + +[[package]] +name = "wit-bindgen" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + +[[package]] +name = "writeable" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" + +[[package]] +name = "yoke" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] diff --git a/topics/web-scraper/Cargo.toml b/topics/web-scraper/Cargo.toml new file mode 100644 index 0000000..d85ed72 --- /dev/null +++ b/topics/web-scraper/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "webcrawl" +version = "0.1.0" +edition = "2024" + +[dependencies] +clap = { version = "4.0", features = ["derive"] } +url = "2.4" +tokio = { version = "1.0", features = ["full"] } +reqwest = { version = "0.11", features = ["json"] } +anyhow = "1.0" +scraper = "0.17" + +[dev-dependencies] +tempfile = "3.0" +tokio-test = "0.4" diff --git a/topics/web-scraper/docs/architecture.md b/topics/web-scraper/docs/architecture.md new file mode 100644 index 0000000..b644eb4 --- /dev/null +++ b/topics/web-scraper/docs/architecture.md @@ -0,0 +1,158 @@ +# Web Scraper Architecture + +## Project Definition + +### What is it? +The web scraper is a command-line application written in Rust that recursively downloads and processes web pages starting from given URLs. It crawls websites by following links, downloads pages concurrently using multiple async workers, and stores them locally in a depth-organized directory structure that maintains domain hierarchy while tracking the crawling depth of each page. + +### Goals +- **Concurrent web crawling**: Download multiple pages simultaneously using async/await and tokio +- **Recursive link following**: Discover and follow links up to a specified depth with same-domain filtering +- **Depth-organized storage**: Organize downloaded content in folders that track crawling depth (depth_0, depth_1, etc.) +- **Command-line interface**: Provide an intuitive CLI with configurable output directory, depth, and worker count +- **Robust error handling**: Gracefully handle network errors, invalid URLs, and file system issues with detailed logging + +## Components and Modules + +### 1. CLI Module (`cli.rs`) +**Purpose**: Handle command-line argument parsing and validation. +- Parse command-line arguments (URL, output directory, depth, concurrency) +- Validate input parameters +- Display help information + +### 2. Crawler Engine (`crawler.rs`) +**Purpose**: Core crawling logic and coordination using SimpleCrawler. +- Manage the crawling queue and visited URLs HashSet for deduplication +- Coordinate multiple async worker tasks via mpsc channels +- Implement depth-limited crawling with round-robin work distribution +- Handle graceful worker shutdown and result processing + +### 3. Downloader Module (`downloader.rs`) +**Purpose**: Handle HTTP requests and page downloading with async support. +- Make asynchronous HTTP requests using reqwest with 30-second timeout +- Custom user-agent and proper error handling +- Return page content and metadata for processing + +### 4. Parser Module (`parser.rs`) +**Purpose**: Extract links from downloaded HTML pages with filtering. +- Parse HTML content using scraper crate with CSS selectors +- Extract and normalize URLs from anchor tags (``) +- Filter to same-domain links only (excludes external sites) +- Remove URL fragments and handle duplicates + +### 5. Storage Module (`storage.rs`) +**Purpose**: Manage file system operations with depth-based organization. +- Create hierarchical directory structures organized by crawling depth +- Save downloaded pages to depth-specific folders (depth_0, depth_1, etc.) +- Handle file naming conflicts and path sanitization +- Convert URLs to appropriate file paths maintaining domain structure + +### 6. Worker Module (`worker.rs`) +**Purpose**: Handle concurrent downloading tasks with message-passing coordination. +- Define WorkItem and WorkResult message types for communication +- Implement async workers that process URLs from a shared channel +- Coordinate downloader, parser, and storage operations +- Handle round-robin work distribution through mpsc channels + +## Module Interactions + +``` + CLI + | + v + Crawler ←→ Worker Pool + | | + v v + Parser ←→ Downloader + | | + v v + Storage +``` + +1. **CLI** parses arguments and initializes the **Crawler** +2. **Crawler** creates a pool of **Workers** and manages the crawling queue +3. **Workers** use the **Downloader** to fetch pages +4. Downloaded content is processed by the **Parser** to extract links +5. **Storage** saves pages and creates directory structure +6. New links are fed back to the **Crawler** queue + +### Architecture Justification + +This modular design provides: +- **Separation of concerns**: Each module has a single responsibility +- **Testability**: Modules can be unit tested independently (29 comprehensive unit tests included) +- **Concurrency**: Async worker-based design enables efficient parallel processing +- **Extensibility**: Easy to add features like robots.txt support or different output formats +- **Error isolation**: Failures in one component don't crash the entire application + +### Key Technologies +- **Rust 2024 Edition**: Memory-safe systems programming with excellent async support +- **Tokio**: Async runtime for concurrent operations and channels +- **Reqwest**: HTTP client for reliable web requests with timeout handling +- **Scraper**: HTML parsing with CSS selector support +- **Clap**: Command-line argument parsing with derive macros +- **Anyhow**: Unified error handling across all modules + +## Usage + +### Installation +To use the `webcrawl` command directly from anywhere in your system: + +```bash +# Install to ~/.cargo/bin (make sure it's in your PATH) +cargo install --path . + +# Then you can use webcrawl directly +webcrawl --output ./crawled_url --depth 10 https://example.com +``` + +### Basic Usage +```bash +# Crawl a website with default settings +webcrawl https://example.com + +# Specify output directory and depth +webcrawl --output ./crawled_data --depth 3 https://example.com + +# Control concurrency +webcrawl --output ./output --depth 2 --workers 5 https://example.com +``` + +### Command-line Options +- ``: Starting URL to crawl (required) +- `--output, -o`: Output directory for downloaded pages (default: "./crawled") +- `--depth, -d`: Maximum crawling depth (default: 2) +- `--workers, -w`: Number of concurrent workers (default: 4) +- `--help, -h`: Display help information + +### Output Structure +The downloaded pages are organized in a hierarchical structure based on crawling depth and URL structure: + +``` +output/ +├── depth_0/ +│ └── example.com/ +│ └── index.html # Root page (depth 0) +├── depth_1/ +│ └── example.com/ +│ ├── about/ +│ │ └── index.html # /about page (depth 1) +│ └── products/ +│ └── index.html # /products page (depth 1) +└── depth_2/ + └── example.com/ + ├── about/ + │ └── team/ + │ └── index.html # /about/team page (depth 2) + └── products/ + └── software/ + └── index.html # /products/software page (depth 2) +``` + +This depth-based organization allows easy tracking of how deep each page was discovered in the crawling process and provides clear separation between different crawling levels. + +### Example Usage Scenarios + +1. **Website backup**: `webcrawl --depth 5 --output ./backup https://mysite.com` +2. **Content analysis**: `webcrawl --depth 2 --workers 8 https://news.site.com` +3. **Link validation**: `webcrawl --depth 1 https://example.com` (shallow crawl) diff --git a/topics/web-scraper/src/cli.rs b/topics/web-scraper/src/cli.rs new file mode 100644 index 0000000..8447d90 --- /dev/null +++ b/topics/web-scraper/src/cli.rs @@ -0,0 +1,128 @@ +use clap::Parser; +use std::path::PathBuf; +use url::Url; + +/// A web crawler that downloads pages and follows links +#[derive(Parser, Debug)] +#[command(name = "webcrawl")] +#[command(about = "A concurrent web scraper that downloads pages and follows links")] +pub struct Args { + /// The starting URL to crawl + #[arg(value_parser = parse_url)] + pub url: Url, + + /// Output directory for downloaded pages + #[arg(short, long, default_value = "./crawled")] + pub output: PathBuf, + + /// Maximum crawling depth + #[arg(short, long, default_value = "2")] + pub depth: usize, + + /// Number of concurrent workers + #[arg(short, long, default_value = "4")] + pub workers: usize, +} + +fn parse_url(s: &str) -> Result { + Url::parse(s) +} + +#[cfg(test)] +mod tests { + use super::*; + use clap::Parser; + + #[test] + fn test_parse_url_valid() { + let result = parse_url("https://example.com"); + assert!(result.is_ok()); + assert_eq!(result.unwrap().as_str(), "https://example.com/"); + } + + #[test] + fn test_parse_url_invalid() { + let result = parse_url("not-a-url"); + assert!(result.is_err()); + } + + #[test] + fn test_args_with_defaults() { + let args = Args::try_parse_from(&["webcrawl", "https://example.com"]).unwrap(); + + assert_eq!(args.url.as_str(), "https://example.com/"); + assert_eq!(args.output, PathBuf::from("./crawled")); + assert_eq!(args.depth, 2); + assert_eq!(args.workers, 4); + } + + #[test] + fn test_args_with_custom_values() { + let args = Args::try_parse_from(&[ + "webcrawl", + "--output", + "./custom_output", + "--depth", + "5", + "--workers", + "8", + "https://test.com", + ]) + .unwrap(); + + assert_eq!(args.url.as_str(), "https://test.com/"); + assert_eq!(args.output, PathBuf::from("./custom_output")); + assert_eq!(args.depth, 5); + assert_eq!(args.workers, 8); + } + + #[test] + fn test_args_short_flags() { + let args = Args::try_parse_from(&[ + "webcrawl", + "-o", + "./short_output", + "-d", + "3", + "-w", + "6", + "https://short.com", + ]) + .unwrap(); + + assert_eq!(args.url.as_str(), "https://short.com/"); + assert_eq!(args.output, PathBuf::from("./short_output")); + assert_eq!(args.depth, 3); + assert_eq!(args.workers, 6); + } + + #[test] + fn test_args_invalid_url() { + let result = Args::try_parse_from(&["webcrawl", "invalid-url"]); + assert!(result.is_err()); + } + + #[test] + fn test_args_missing_url() { + let result = Args::try_parse_from(&["webcrawl"]); + assert!(result.is_err()); + } + + #[test] + fn test_args_invalid_depth() { + let result = + Args::try_parse_from(&["webcrawl", "--depth", "not-a-number", "https://example.com"]); + assert!(result.is_err()); + } + + #[test] + fn test_args_invalid_workers() { + let result = Args::try_parse_from(&[ + "webcrawl", + "--workers", + "not-a-number", + "https://example.com", + ]); + assert!(result.is_err()); + } +} diff --git a/topics/web-scraper/src/crawler.rs b/topics/web-scraper/src/crawler.rs new file mode 100644 index 0000000..7cdf898 --- /dev/null +++ b/topics/web-scraper/src/crawler.rs @@ -0,0 +1,147 @@ +use crate::storage::Storage; +use crate::worker::{WorkItem, WorkResult, Worker}; +use anyhow::Result; +use std::collections::{HashSet, VecDeque}; +use tokio::sync::mpsc; +use url::Url; + +/// Simple crawler coordinator that manages workers +pub struct SimpleCrawler { + storage: Storage, + max_depth: usize, + num_workers: usize, + visited: HashSet, + queue: VecDeque, +} + +impl SimpleCrawler { + /// Create a new simple crawler + pub fn new(storage: Storage, max_depth: usize, num_workers: usize, start_url: Url) -> Self { + let mut queue = VecDeque::new(); + queue.push_back(WorkItem { + url: start_url, + depth: 0, + }); + + SimpleCrawler { + storage, + max_depth, + num_workers, + visited: HashSet::new(), + queue, + } + } + + /// Start the crawling process + pub async fn crawl(&mut self) -> Result<()> { + println!( + "Starting crawler with {} workers, max depth {}", + self.num_workers, self.max_depth + ); + + // Initialize storage + self.storage.init().await?; + + // Create channels for worker communication + let (result_sender, mut result_receiver) = mpsc::channel::(100); + + // Create individual work channels and start workers + let mut work_senders = Vec::new(); + let mut worker_handles = Vec::new(); + + for i in 0..self.num_workers { + let (work_sender, work_receiver) = mpsc::channel::(10); + work_senders.push(work_sender); + + let worker_storage = Storage::new(self.storage.base_path.clone()); + let worker = Worker::new(i, worker_storage)?; + let result_tx = result_sender.clone(); + + let handle = tokio::spawn(async move { + worker.run(work_receiver, result_tx).await; + }); + worker_handles.push(handle); + } + + // Drop the original result sender so we can detect when all workers finish + drop(result_sender); + + let mut active_work = 0; + let mut total_processed = 0; + let mut current_worker = 0; + + // Main crawling loop + loop { + // Send work items to workers (round-robin) + while let Some(work_item) = self.queue.pop_front() { + if self.visited.contains(&work_item.url) { + continue; + } + + self.visited.insert(work_item.url.clone()); + + // Send to next worker (round-robin) + if work_senders[current_worker].send(work_item).await.is_err() { + // Worker channel is closed, break the loop + break; + } + + current_worker = (current_worker + 1) % self.num_workers; + active_work += 1; + } + + // If no active work and queue is empty, we're done + if active_work == 0 { + break; + } + + // Process results from workers + if let Some(result) = result_receiver.recv().await { + active_work -= 1; + total_processed += 1; + + if result.success { + println!( + "Completed: {} (depth {}) - {} links found", + result.url, + result.depth, + result.links.len() + ); + + // Add new links to queue if we haven't reached max depth + if result.depth < self.max_depth { + for link in result.links { + if !self.visited.contains(&link) { + self.queue.push_back(WorkItem { + url: link, + depth: result.depth + 1, + }); + } + } + } + } else { + println!( + "Failed: {} (depth {}) - {}", + result.url, + result.depth, + result.error.unwrap_or_else(|| "Unknown error".to_string()) + ); + } + } else { + // All workers have finished + break; + } + } + + // Close all work channels to signal workers to stop + drop(work_senders); + + // Wait for all workers to finish + for handle in worker_handles { + let _ = handle.await; + } + + println!("Crawling complete! Processed {} pages.", total_processed); + Ok(()) + } +} diff --git a/topics/web-scraper/src/downloader.rs b/topics/web-scraper/src/downloader.rs new file mode 100644 index 0000000..f44f76f --- /dev/null +++ b/topics/web-scraper/src/downloader.rs @@ -0,0 +1,44 @@ +use anyhow::Result; +use reqwest::Client; +use std::time::Duration; +use url::Url; + +/// HTTP client for downloading web pages +pub struct Downloader { + client: Client, +} + +/// Downloaded page data +pub struct Page { + pub url: Url, + pub content: String, +} + +impl Downloader { + /// Create a new downloader with reasonable defaults + pub fn new() -> Result { + let client = Client::builder() + .timeout(Duration::from_secs(30)) + .user_agent("webcrawl/0.1.0") + .build()?; + + Ok(Downloader { client }) + } + + /// Download a page from the given URL + pub async fn download(&self, url: Url) -> Result { + println!("Downloading: {}", url); + + let response = self.client.get(url.clone()).send().await?; + + if !response.status().is_success() { + return Err(anyhow::anyhow!("HTTP error {}: {}", response.status(), url)); + } + + let content = response.text().await?; + + println!("Downloaded {} bytes from {}", content.len(), url); + + Ok(Page { url, content }) + } +} diff --git a/topics/web-scraper/src/main.rs b/topics/web-scraper/src/main.rs new file mode 100644 index 0000000..983f492 --- /dev/null +++ b/topics/web-scraper/src/main.rs @@ -0,0 +1,34 @@ +use anyhow::Result; +use clap::Parser; + +mod cli; +mod crawler; +mod downloader; +mod parser; +mod storage; +mod worker; + +use cli::Args; +use crawler::SimpleCrawler; +use storage::Storage; + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + + println!("Starting web crawler..."); + println!("Target URL: {}", args.url); + println!("Output directory: {}", args.output.display()); + println!("Max depth: {}", args.depth); + println!("Workers: {}", args.workers); + println!(); + + // Create and start the concurrent crawler + let storage = Storage::new(args.output); + let mut crawler = SimpleCrawler::new(storage, args.depth, args.workers, args.url); + + crawler.crawl().await?; + + println!("Web crawler completed successfully!"); + Ok(()) +} diff --git a/topics/web-scraper/src/parser.rs b/topics/web-scraper/src/parser.rs new file mode 100644 index 0000000..3aebe92 --- /dev/null +++ b/topics/web-scraper/src/parser.rs @@ -0,0 +1,166 @@ +use anyhow::Result; +use scraper::{Html, Selector}; +use url::Url; + +/// HTML parser for extracting links from pages +pub struct Parser { + link_selector: Selector, +} + +impl Parser { + /// Create a new parser + pub fn new() -> Result { + let link_selector = Selector::parse("a[href]") + .map_err(|e| anyhow::anyhow!("Failed to create CSS selector: {:?}", e))?; + + Ok(Parser { link_selector }) + } + + /// Extract all links from an HTML page + pub fn extract_links(&self, html: &str, base_url: &Url) -> Result> { + let document = Html::parse_document(html); + let mut links = Vec::new(); + + for element in document.select(&self.link_selector) { + if let Some(href) = element.value().attr("href") { + // Skip empty hrefs and fragments + if href.is_empty() || href.starts_with('#') { + continue; + } + + // Try to resolve the URL relative to the base + match base_url.join(href) { + Ok(url) => { + // Only include HTTP/HTTPS URLs from the same domain + if url.scheme() == "http" || url.scheme() == "https" { + if let Some(base_host) = base_url.host_str() { + if let Some(url_host) = url.host_str() { + if base_host == url_host { + links.push(url); + } + } + } + } + } + Err(_) => { + // Skip invalid URLs + continue; + } + } + } + } + + // Remove duplicates + links.sort(); + links.dedup(); + + Ok(links) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parser_creation() { + let parser = Parser::new(); + assert!(parser.is_ok()); + } + + #[test] + fn test_extract_links_basic() { + let parser = Parser::new().unwrap(); + let html = r##" + + + Link 1 + Link 2 + + + "##; + let base_url = Url::parse("https://example.com").unwrap(); + + let links = parser.extract_links(html, &base_url).unwrap(); + + assert_eq!(links.len(), 2); + assert!(links.contains(&Url::parse("https://example.com/page1").unwrap())); + assert!(links.contains(&Url::parse("https://example.com/page2").unwrap())); + } + + #[test] + fn test_extract_links_filters_external() { + let parser = Parser::new().unwrap(); + let html = r##" + + + Internal Link + External Link + Same Domain + + + "##; + let base_url = Url::parse("https://example.com").unwrap(); + + let links = parser.extract_links(html, &base_url).unwrap(); + + assert_eq!(links.len(), 2); + assert!(links.contains(&Url::parse("https://example.com/internal").unwrap())); + assert!(links.contains(&Url::parse("https://example.com/same-domain").unwrap())); + assert!( + !links + .iter() + .any(|url| url.host_str() == Some("external.com")) + ); + } + + #[test] + fn test_extract_links_skips_fragments() { + let parser = Parser::new().unwrap(); + let html = r##" + + + Valid Link + Fragment + Empty + + + "##; + let base_url = Url::parse("https://example.com").unwrap(); + + let links = parser.extract_links(html, &base_url).unwrap(); + + assert_eq!(links.len(), 1); + assert_eq!(links[0], Url::parse("https://example.com/page1").unwrap()); + } + + #[test] + fn test_extract_links_removes_duplicates() { + let parser = Parser::new().unwrap(); + let html = r##" + + + Link 1 + Link 1 Again + Link 2 + + + "##; + let base_url = Url::parse("https://example.com").unwrap(); + + let links = parser.extract_links(html, &base_url).unwrap(); + + assert_eq!(links.len(), 2); + } + + #[test] + fn test_extract_links_empty_html() { + let parser = Parser::new().unwrap(); + let html = ""; + let base_url = Url::parse("https://example.com").unwrap(); + + let links = parser.extract_links(html, &base_url).unwrap(); + + assert_eq!(links.len(), 0); + } +} diff --git a/topics/web-scraper/src/storage.rs b/topics/web-scraper/src/storage.rs new file mode 100644 index 0000000..4886c38 --- /dev/null +++ b/topics/web-scraper/src/storage.rs @@ -0,0 +1,203 @@ +use anyhow::Result; +use std::path::PathBuf; +use tokio::fs; +use url::Url; + +/// File system storage manager +pub struct Storage { + pub base_path: PathBuf, +} + +impl Storage { + /// Create a new storage manager + pub fn new(base_path: PathBuf) -> Self { + Storage { base_path } + } + + /// Initialize the storage directory + pub async fn init(&self) -> Result<()> { + fs::create_dir_all(&self.base_path).await?; + println!( + "Initialized storage directory: {}", + self.base_path.display() + ); + Ok(()) + } + + /// Save a page to the appropriate location + pub async fn save_page(&self, url: &Url, content: &str, depth: usize) -> Result { + let file_path = self.url_to_path(url, depth)?; + + // Create parent directories + if let Some(parent) = file_path.parent() { + fs::create_dir_all(parent).await?; + } + + // Write the content + fs::write(&file_path, content).await?; + + println!("Saved: {} -> {}", url, file_path.display()); + Ok(file_path) + } + + /// Convert a URL to a file path following the hierarchical structure + fn url_to_path(&self, url: &Url, depth: usize) -> Result { + let mut path = self.base_path.clone(); + + // Add depth folder for organization + path.push(format!("depth_{}", depth)); + + // Add host + if let Some(host) = url.host_str() { + path.push(sanitize_filename(host)); + } + + // Add path components + let url_path = url.path(); + if url_path != "/" && !url_path.is_empty() { + for segment in url_path.split('/').filter(|s| !s.is_empty()) { + path.push(sanitize_filename(segment)); + } + } + + // Ensure we have a filename with .html extension + if path.is_dir() || path.to_string_lossy().ends_with('/') || url_path == "/" { + path.push("index.html"); + } else if !path.to_string_lossy().ends_with(".html") { + path.set_extension("html"); + } + + Ok(path) + } +} + +/// Sanitize a string to be safe for use as a filename +fn sanitize_filename(input: &str) -> String { + input + .chars() + .map(|c| match c { + '/' | '\\' | ':' | '*' | '?' | '"' | '<' | '>' | '|' => '_', + c if c.is_control() => '_', + c => c, + }) + .collect::() + .trim_matches('.') + .to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_storage_creation() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + assert_eq!(storage.base_path, temp_dir.path()); + } + + #[tokio::test] + async fn test_storage_init() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().join("test_output")); + + let result = storage.init().await; + assert!(result.is_ok()); + assert!(storage.base_path.exists()); + } + + #[tokio::test] + async fn test_save_page() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + storage.init().await.unwrap(); + + let url = Url::parse("https://example.com/page1").unwrap(); + let content = "Test content"; + + let file_path = storage.save_page(&url, content, 0).await.unwrap(); + + assert!(file_path.exists()); + let saved_content = tokio::fs::read_to_string(&file_path).await.unwrap(); + assert_eq!(saved_content, content); + } + + #[test] + fn test_url_to_path_root() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + + let url = Url::parse("https://example.com/").unwrap(); + let path = storage.url_to_path(&url, 0).unwrap(); + + let expected = temp_dir + .path() + .join("depth_0") + .join("example.com") + .join("index.html"); + assert_eq!(path, expected); + } + + #[test] + fn test_url_to_path_with_path() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + + let url = Url::parse("https://example.com/blog/post1").unwrap(); + let path = storage.url_to_path(&url, 1).unwrap(); + + let expected = temp_dir + .path() + .join("depth_1") + .join("example.com") + .join("blog") + .join("post1.html"); + assert_eq!(path, expected); + } + + #[test] + fn test_url_to_path_depth_organization() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + + let url = Url::parse("https://example.com/test").unwrap(); + + let path_depth_0 = storage.url_to_path(&url, 0).unwrap(); + let path_depth_2 = storage.url_to_path(&url, 2).unwrap(); + + assert!(path_depth_0.to_string_lossy().contains("depth_0")); + assert!(path_depth_2.to_string_lossy().contains("depth_2")); + } + + #[test] + fn test_sanitize_filename() { + assert_eq!(sanitize_filename("normal"), "normal"); + assert_eq!(sanitize_filename("with/slash"), "with_slash"); + assert_eq!(sanitize_filename("with:colon"), "with_colon"); + assert_eq!(sanitize_filename("with*star"), "with_star"); + assert_eq!(sanitize_filename("with?question"), "with_question"); + assert_eq!(sanitize_filename("with\"quote"), "with_quote"); + assert_eq!(sanitize_filename("with"), "with_greater_"); + assert_eq!(sanitize_filename("with|pipe"), "with_pipe"); + assert_eq!(sanitize_filename(".hidden."), "hidden"); + } + + #[tokio::test] + async fn test_save_page_creates_directories() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + storage.init().await.unwrap(); + + let url = Url::parse("https://example.com/deep/nested/path").unwrap(); + let content = "test content"; + + let file_path = storage.save_page(&url, content, 1).await.unwrap(); + + assert!(file_path.exists()); + assert!(file_path.parent().unwrap().exists()); + + let saved_content = tokio::fs::read_to_string(&file_path).await.unwrap(); + assert_eq!(saved_content, content); + } +} diff --git a/topics/web-scraper/src/worker.rs b/topics/web-scraper/src/worker.rs new file mode 100644 index 0000000..b13c495 --- /dev/null +++ b/topics/web-scraper/src/worker.rs @@ -0,0 +1,216 @@ +use crate::downloader::Downloader; +use crate::parser::Parser as HtmlParser; +use crate::storage::Storage; +use anyhow::Result; +use tokio::sync::mpsc; +use url::Url; + +/// Message types for worker communication +#[derive(Debug, Clone)] +pub struct WorkItem { + pub url: Url, + pub depth: usize, +} + +#[derive(Debug)] +pub struct WorkResult { + pub url: Url, + pub depth: usize, + pub links: Vec, + pub success: bool, + pub error: Option, +} + +/// Worker that downloads and processes web pages +pub struct Worker { + id: usize, + downloader: Downloader, + parser: HtmlParser, + storage: Storage, +} + +impl Worker { + /// Create a new worker + pub fn new(id: usize, storage: Storage) -> Result { + let downloader = Downloader::new()?; + let parser = HtmlParser::new()?; + + Ok(Worker { + id, + downloader, + parser, + storage, + }) + } + + /// Start the worker loop + pub async fn run( + mut self, + mut receiver: mpsc::Receiver, + sender: mpsc::Sender, + ) { + println!("Worker {} started", self.id); + + while let Some(work_item) = receiver.recv().await { + let result = self.process_work_item(work_item).await; + + if let Err(e) = sender.send(result).await { + eprintln!("Worker {} failed to send result: {}", self.id, e); + break; + } + } + + println!("Worker {} shutting down", self.id); + } + + /// Process a single work item + async fn process_work_item(&mut self, work_item: WorkItem) -> WorkResult { + let url = work_item.url.clone(); + let depth = work_item.depth; + + match self.download_and_process(&work_item).await { + Ok(links) => WorkResult { + url, + depth, + links, + success: true, + error: None, + }, + Err(e) => { + let error_msg = format!("{}", e); + eprintln!( + "Worker {} failed to process {}: {}", + self.id, url, error_msg + ); + WorkResult { + url, + depth, + links: Vec::new(), + success: false, + error: Some(error_msg), + } + } + } + } + + /// Download and process a single page + async fn download_and_process(&mut self, work_item: &WorkItem) -> Result> { + // Download the page + let page = self.downloader.download(work_item.url.clone()).await?; + + // Save the page + self.storage + .save_page(&page.url, &page.content, work_item.depth) + .await?; + + // Extract links + let links = self.parser.extract_links(&page.content, &page.url)?; + + println!( + "Worker {} processed {} (depth {}) - found {} links", + self.id, + page.url, + work_item.depth, + links.len() + ); + + Ok(links) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_work_item_creation() { + let url = Url::parse("https://example.com").unwrap(); + let work_item = WorkItem { + url: url.clone(), + depth: 1, + }; + + assert_eq!(work_item.url, url); + assert_eq!(work_item.depth, 1); + } + + #[test] + fn test_work_result_success() { + let url = Url::parse("https://example.com").unwrap(); + let links = vec![Url::parse("https://example.com/page1").unwrap()]; + + let result = WorkResult { + url: url.clone(), + depth: 0, + links: links.clone(), + success: true, + error: None, + }; + + assert_eq!(result.url, url); + assert_eq!(result.depth, 0); + assert_eq!(result.links, links); + assert!(result.success); + assert!(result.error.is_none()); + } + + #[test] + fn test_work_result_failure() { + let url = Url::parse("https://example.com").unwrap(); + let error_msg = "Network error".to_string(); + + let result = WorkResult { + url: url.clone(), + depth: 1, + links: Vec::new(), + success: false, + error: Some(error_msg.clone()), + }; + + assert_eq!(result.url, url); + assert_eq!(result.depth, 1); + assert!(result.links.is_empty()); + assert!(!result.success); + assert_eq!(result.error, Some(error_msg)); + } + + #[tokio::test] + async fn test_worker_creation() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + + let worker = Worker::new(0, storage); + assert!(worker.is_ok()); + assert_eq!(worker.unwrap().id, 0); + } + + #[test] + fn test_work_item_clone() { + let url = Url::parse("https://example.com").unwrap(); + let work_item = WorkItem { + url: url.clone(), + depth: 2, + }; + let cloned = work_item.clone(); + + assert_eq!(work_item.url, cloned.url); + assert_eq!(work_item.depth, cloned.depth); + } + + #[test] + fn test_work_result_debug() { + let url = Url::parse("https://example.com").unwrap(); + let result = WorkResult { + url, + depth: 1, + links: Vec::new(), + success: true, + error: None, + }; + + let debug_str = format!("{:?}", result); + assert!(debug_str.contains("WorkResult")); + assert!(debug_str.contains("example.com")); + } +}