diff --git a/Cargo.toml b/Cargo.toml index 31cc132..ca9a18d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,21 +44,18 @@ path = "src/main.rs" [dependencies] # Dependencies required for building and running the project. -clap = "4.5" +clap = "4.5.23" dtt = "0.0.8" -env_logger = "0.11" -html-generator = "0.0.2" -indicatif = "0.17" -lazy_static = "1.5" -log = "0.4" -regex = "1.11" -scraper = "0.22" -tempfile = "3.13" -thiserror = "2.0" -time = "0.3" -tokio = "1.40" -url = "2.5" -xml-rs = "0.8" +env_logger = "0.11.6" +indicatif = "0.17.9" +lazy_static = "1.5.0" +log = "0.4.22" +regex = "1.11.1" +tempfile = "3.14.0" +thiserror = "2.0.9" +time = "0.3.37" +url = "2.5.4" +xml-rs = "0.8.24" # ----------------------------------------------------------------------------- # Build Dependencies @@ -66,7 +63,7 @@ xml-rs = "0.8" [build-dependencies] # Dependencies for build scripts. -version_check = "0.9" +version_check = "0.9.5" # ----------------------------------------------------------------------------- # Development Dependencies @@ -74,10 +71,10 @@ version_check = "0.9" [dev-dependencies] # Dependencies required for testing and development. -criterion = "0.5" -assert_fs = "1.1" -predicates = "3.1" -pretty_assertions = "1.4" +criterion = "0.5.1" +assert_fs = "1.1.2" +predicates = "3.1.3" +pretty_assertions = "1.4.1" # ----------------------------------------------------------------------------- # Examples diff --git a/src/error.rs b/src/error.rs index 6a84be3..3725da5 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,3 +1,6 @@ +// Copyright © 2025 Sitemap Gen. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR MIT + //! Error types for the sitemap library. //! //! This module defines various error types that can occur during sitemap operations, @@ -72,18 +75,19 @@ impl SitemapError { /// /// # Returns /// A string slice describing the context of the error. - pub fn context(&self) -> &'static str { + #[must_use] + pub const fn context(&self) -> &'static str { match self { - SitemapError::XmlWriteError(_) => "Error occurred while writing XML data", - SitemapError::XmlParseError(_) => "Error occurred while parsing XML data", - SitemapError::DateError(_) => "Error occurred while parsing or formatting dates", - SitemapError::UrlError(_) => "Error occurred while parsing URLs", - SitemapError::IoError(_) => "Error occurred during file or network operations", - SitemapError::EncodingError(_) => "Error occurred during UTF-8 string encoding or decoding", - SitemapError::InvalidChangeFreq(_) => "An invalid change frequency value was provided", - SitemapError::CustomError(_) => "An unexpected error occurred", - SitemapError::SitemapTooLarge => "The generated sitemap exceeds the maximum allowed size", - SitemapError::MaxUrlLimitExceeded(_) => "The number of URLs exceeds the maximum allowed limit", + Self::XmlWriteError(_) => "Error occurred while writing XML data", + Self::XmlParseError(_) => "Error occurred while parsing XML data", + Self::DateError(_) => "Error occurred while parsing or formatting dates", + Self::UrlError(_) => "Error occurred while parsing URLs", + Self::IoError(_) => "Error occurred during file or network operations", + Self::EncodingError(_) => "Error occurred during UTF-8 string encoding or decoding", + Self::InvalidChangeFreq(_) => "An invalid change frequency value was provided", + Self::CustomError(_) => "An unexpected error occurred", + Self::SitemapTooLarge => "The generated sitemap exceeds the maximum allowed size", + Self::MaxUrlLimitExceeded(_) => "The number of URLs exceeds the maximum allowed limit", } } } diff --git a/src/lib.rs b/src/lib.rs index 45613fa..37aa82b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ -// src/lib.rs +// Copyright © 2025 Sitemap Gen. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR MIT #![doc = include_str!("../README.md")] #![doc( @@ -6,41 +7,115 @@ html_logo_url = "https://kura.pro/sitemap-gen/images/logos/sitemap-gen.svg", html_root_url = "https://docs.rs/sitemap-gen" )] -#![crate_name = "sitemap_gen"] -#![crate_type = "lib"] +#![warn( + clippy::all, + clippy::pedantic, + clippy::nursery, + clippy::cargo, + missing_docs +)] +#![deny( + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::result_unit_err, + clippy::clone_on_ref_ptr +)] -//! A Rust library for generating and managing sitemaps. +//! # Sitemap Generator Library +//! +//! A comprehensive Rust library for generating and managing XML sitemaps according to the +//! [Sitemaps XML format](https://www.sitemaps.org/protocol.html) specification. +//! +//! ## Key Features +//! +//! - Create and manage XML sitemaps with proper validation +//! - Support for URL normalization and deduplication +//! - Customizable change frequencies and last modification dates +//! - Comprehensive error handling with detailed diagnostics +//! - Size and entry count validation according to sitemap standards +//! +//! ## Example Usage +//! +//! ```rust +//! use sitemap_gen::prelude::*; +//! use url::Url; //! -//! This crate provides functionality to create, modify, and serialize XML sitemaps according to the [Sitemaps XML format](https://www.sitemaps.org/protocol.html). -//! It includes support for handling various sitemap-specific data types and error conditions. +//! # fn main() -> SitemapResult<()> { +//! let mut sitemap = Sitemap::new(); +//! +//! // Create a sitemap entry +//! let entry = SiteMapData { +//! loc: Url::parse("https://example.com")?, +//! lastmod: "2024-10-08".to_string(), +//! changefreq: ChangeFreq::Daily, +//! }; +//! +//! // Add the entry and generate XML +//! sitemap.add_entry(entry)?; +//! let xml = sitemap.to_xml()?; +//! # Ok(()) +//! # } +//! ``` + +/// Configuration constants for sitemap generation and validation. +pub mod config { + /// Maximum allowed size of a sitemap in bytes (10MB). + pub const MAX_SITEMAP_SIZE: usize = 10 * 1024 * 1024; + + /// Maximum number of URLs allowed in a single sitemap. + pub const MAX_URLS: usize = 50_000; + + /// Default XML namespace for sitemaps. + pub const SITEMAP_XMLNS: &str = + "http://www.sitemaps.org/schemas/sitemap/0.9"; +} -/// Contains error types specific to sitemap operations. -/// -/// This module defines a comprehensive set of error types that can occur during -/// sitemap creation, modification, and serialization processes. +/// Error types and handling for sitemap operations. pub mod error; -/// Provides the core functionality for creating and managing sitemaps. -/// -/// This module contains the main structures and functions for working with sitemaps, -/// including creating sitemap entries, setting change frequencies, and serializing to XML. +/// Core sitemap functionality and data structures. pub mod sitemap; -/// Utility functions and helper methods for sitemap operations. +/// Utility functions for sitemap generation and management. pub mod utils; -// Re-exports +// Re-exports for convenience +pub use config::{MAX_SITEMAP_SIZE, MAX_URLS, SITEMAP_XMLNS}; pub use error::SitemapError; pub use sitemap::{ convert_date_format, create_site_map_data, ChangeFreq, SiteMapData, Sitemap, }; +/// Current crate version. +pub const VERSION: &str = env!("CARGO_PKG_VERSION"); + /// Result type alias for sitemap operations. +/// +/// This type is used throughout the library to handle operations that might fail. +/// The error type is always [`SitemapError`]. +/// +/// See [`error::SitemapError`] for more details about possible error conditions. pub type SitemapResult = Result; -/// A prelude module for convenient importing of commonly used items. +/// Prelude module providing commonly used types and traits. +/// +/// This module re-exports the most frequently used types and traits from the library, +/// allowing users to import them with a single `use` statement. +/// +/// # Example +/// +/// ```rust +/// use sitemap_gen::prelude::*; +/// +/// # fn main() -> SitemapResult<()> { +/// let sitemap = Sitemap::new(); +/// # Ok(()) +/// # } +/// ``` pub mod prelude { + pub use crate::config::{MAX_SITEMAP_SIZE, MAX_URLS}; pub use crate::error::SitemapError; pub use crate::sitemap::{ChangeFreq, SiteMapData, Sitemap}; pub use crate::SitemapResult; @@ -48,157 +123,86 @@ pub mod prelude { #[cfg(test)] mod tests { - use url::Url; - use super::*; - use crate::error::SitemapError; - use crate::sitemap::{ChangeFreq, SiteMapData, Sitemap}; - use crate::SitemapResult; + use url::Url; #[test] - fn test_create_sitemap() { - // Create an empty sitemap + fn test_sitemap_creation() -> SitemapResult<()> { let mut sitemap = Sitemap::new(); - - // Create a SiteMapData entry let entry = SiteMapData { - loc: Url::parse("http://example.com") - .expect("Failed to parse URL"), + loc: Url::parse("http://example.com")?, lastmod: "2024-10-08".to_string(), changefreq: ChangeFreq::Daily, }; - // Add the entry to the sitemap - sitemap.add_entry(entry).expect("Failed to add entry"); - - // Verify the sitemap contains the correct data + sitemap.add_entry(entry)?; assert_eq!(sitemap.len(), 1); assert!(!sitemap.is_empty()); + Ok(()) } #[test] - fn test_serialize_sitemap() { - // Create a new sitemap and add an entry + fn test_sitemap_serialization() -> SitemapResult<()> { let mut sitemap = Sitemap::new(); let entry = SiteMapData { - loc: Url::parse("http://example.com") - .expect("Failed to parse URL"), + loc: Url::parse("http://example.com")?, lastmod: "2024-10-08".to_string(), changefreq: ChangeFreq::Daily, }; - sitemap.add_entry(entry).expect("Failed to add entry"); - - // Serialize the sitemap to XML - let serialized = - sitemap.to_xml().expect("Failed to serialize sitemap"); + sitemap.add_entry(entry)?; + let xml = sitemap.to_xml()?; - // Assert that the serialized XML contains the correct information - assert!(serialized.contains("")); - assert!(serialized.contains("http://example.com/")); // Note the trailing slash - assert!(serialized.contains("daily")); - assert!(serialized.contains("2024-10-08")); + assert!(xml.contains("")); + assert!(xml.contains("http://example.com/")); + assert!(xml.contains("daily")); + assert!(xml.contains("2024-10-08")); + Ok(()) } #[test] - fn test_invalid_url_error() { - // Try to add an entry with an invalid URL and expect an error + fn test_invalid_url() { let mut sitemap = Sitemap::new(); - - let invalid_url = Url::parse("invalid-url"); - let result = match invalid_url { - Ok(valid_url) => sitemap.add_entry(SiteMapData { - loc: valid_url, + let result = Url::parse("invalid-url").map(|url| { + sitemap.add_entry(SiteMapData { + loc: url, lastmod: "2024-10-08".to_string(), changefreq: ChangeFreq::Daily, - }), - Err(e) => Err(SitemapError::UrlError(e)), - }; - - // Assert that the result is an error due to an invalid URL - assert!(matches!(result, Err(SitemapError::UrlError(_)))); - } - - #[test] - fn test_convert_date_format() { - // Test converting date formats using the helper function - let date = "2024-10-08T00:00:00Z"; - let converted = convert_date_format(date); - assert_eq!(converted, "2024-10-08"); - } - - #[test] - fn test_change_freq_enum() { - // Test the ChangeFreq enum values - assert_eq!(ChangeFreq::Daily.to_string(), "daily"); - assert_eq!(ChangeFreq::Monthly.to_string(), "monthly"); - } - - #[test] - fn test_sitemap_data_creation() { - // Test creating a new SiteMapData instance - let sitemap_entry = SiteMapData { - loc: Url::parse("http://example.com") - .expect("Failed to parse URL"), - lastmod: "2024-10-08".to_string(), - changefreq: ChangeFreq::Daily, - }; - - // Create an empty sitemap and add the entry - let mut sitemap = Sitemap::new(); - sitemap - .add_entry(sitemap_entry) - .expect("Failed to add entry"); + }) + }); - // Check that the entry was added - assert_eq!(sitemap.len(), 1); + assert!(result.is_err()); } #[test] - fn test_sitemap_error_handling() { - // Test various error types defined in SitemapError - let url_error: SitemapError = - SitemapError::UrlError(url::ParseError::EmptyHost); - let io_error: SitemapError = - SitemapError::IoError(std::io::Error::new( - std::io::ErrorKind::NotFound, - "File not found", - )); - - assert!(matches!(url_error, SitemapError::UrlError(_))); - assert!(matches!(io_error, SitemapError::IoError(_))); + fn test_date_conversion() -> () { + let formatted = convert_date_format("20 May 2023"); + assert_eq!(formatted, "2023-05-20"); } #[test] - fn test_sitemap_result() { - // Test that SitemapResult works with Ok and Err variants - let success: SitemapResult<&str> = Ok("Success"); - let failure: SitemapResult<&str> = - Err(SitemapError::UrlError(url::ParseError::EmptyHost)); - - assert!(success.is_ok()); - assert!(failure.is_err()); - } - #[test] - fn test_valid_url_addition() { - // Create a new empty sitemap + fn test_size_limits() -> SitemapResult<()> { let mut sitemap = Sitemap::new(); + let url = Url::parse("http://example.com")?; - // Try to add a valid URL - let valid_url = Url::parse("http://example.com") - .expect("Failed to parse valid URL"); + // Add MAX_URLS entries + for i in 0..MAX_URLS { + sitemap.add_entry(SiteMapData { + loc: Url::parse(&format!("{}?id={}", url, i))?, + lastmod: "2024-10-08".to_string(), + changefreq: ChangeFreq::Daily, + })?; + } + // Try to add one more let result = sitemap.add_entry(SiteMapData { - loc: valid_url, + loc: url, lastmod: "2024-10-08".to_string(), changefreq: ChangeFreq::Daily, }); - // Assert that the entry was successfully added - assert!(result.is_ok(), "Failed to add valid URL to sitemap"); - - // Check that the sitemap now contains the entry - assert_eq!(sitemap.len(), 1); - assert!(!sitemap.is_empty()); + assert!(result.is_err()); + Ok(()) } } diff --git a/src/main.rs b/src/main.rs index 4fbe20d..fbc1f96 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,6 @@ +// Copyright © 2025 Sitemap Gen. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR MIT + //! # Sitemap Generator CLI //! //! This command-line application allows users to generate XML sitemaps. diff --git a/src/sitemap.rs b/src/sitemap.rs index 1e16b31..2cd117c 100644 --- a/src/sitemap.rs +++ b/src/sitemap.rs @@ -1,5 +1,11 @@ -// src/sitemap.rs +// Copyright © 2025 Sitemap Gen. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR MIT +//! Provides core sitemap functionality and data structures. +//! +//! This module implements the main sitemap generation functionality according to the [Sitemaps XML format](https://www.sitemaps.org/protocol.html) specification. + +use crate::config::{MAX_SITEMAP_SIZE, MAX_URLS, SITEMAP_XMLNS}; use crate::error::{SitemapError, SitemapResult}; use dtt::datetime::DateTime; use lazy_static::lazy_static; @@ -10,53 +16,91 @@ use std::str::FromStr; use url::Url; use xml::writer::{EventWriter, XmlEvent}; -/// Maximum number of URLs allowed in a sitemap. -const MAX_URLS: usize = 50_000; +lazy_static! { + static ref DATE_REGEX: Regex = + Regex::new(r"(\d{2}) (\w{3}) (\d{4})") + .expect("Invalid date regex pattern"); +} -/// Represents the data for a sitemap entry. -#[derive(Debug, Clone, PartialEq)] +/// Represents the data for a sitemap URL entry. +/// +/// This struct contains all required fields for a sitemap URL entry according to the +/// [Sitemaps XML format](https://www.sitemaps.org/protocol.html). +#[derive(Debug, Clone, PartialEq, Eq)] pub struct SiteMapData { - /// The change frequency of the URL. + /// How frequently the page is likely to change. + /// This value provides a hint to search engines about the page's update frequency. pub changefreq: ChangeFreq, - /// The last modification date of the URL in YYYY-MM-DD format. + + /// The date of last modification in YYYY-MM-DD format. + /// Must be a valid date string in W3C Datetime format. pub lastmod: String, - /// The location (URL) of the page. + + /// The canonical URL of the page. + /// Must be a fully qualified URL that begins with http:// or https://. pub loc: Url, } +impl SiteMapData { + /// Creates a new `SiteMapData` instance with the provided values. + /// + /// # Arguments + /// + /// * `loc` - The URL of the page + /// * `lastmod` - The last modification date + /// * `changefreq` - How frequently the page is expected to change + /// + /// # Returns + /// + /// A new `SiteMapData` instance + #[must_use] + pub const fn new( + loc: Url, + lastmod: String, + changefreq: ChangeFreq, + ) -> Self { + Self { + loc, + lastmod, + changefreq, + } + } +} + /// Represents the change frequency of a URL in the sitemap. /// /// This enum is used to indicate how frequently the page is likely to change. /// Search engines use this information when deciding how often to crawl the page. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ChangeFreq { - /// The page is changed every time it's accessed. + /// The page is changed every time it's accessed Always, - /// The page is changed every hour. + /// The page is changed every hour Hourly, - /// The page is changed every day. + /// The page is changed every day Daily, - /// The page is changed every week. + /// The page is changed every week Weekly, - /// The page is changed every month. + /// The page is changed every month Monthly, - /// The page is changed every year. + /// The page is changed every year Yearly, - /// The page is archived and never expected to change. + /// The page is archived and never expected to change Never, } impl ChangeFreq { /// Returns the string representation of the change frequency. - pub fn as_str(&self) -> &'static str { + #[must_use] + pub const fn as_str(&self) -> &'static str { match self { - ChangeFreq::Always => "always", - ChangeFreq::Hourly => "hourly", - ChangeFreq::Daily => "daily", - ChangeFreq::Weekly => "weekly", - ChangeFreq::Monthly => "monthly", - ChangeFreq::Yearly => "yearly", - ChangeFreq::Never => "never", + Self::Always => "always", + Self::Hourly => "hourly", + Self::Daily => "daily", + Self::Weekly => "weekly", + Self::Monthly => "monthly", + Self::Yearly => "yearly", + Self::Never => "never", } } } @@ -66,13 +110,13 @@ impl FromStr for ChangeFreq { fn from_str(s: &str) -> Result { match s.to_lowercase().as_str() { - "always" => Ok(ChangeFreq::Always), - "hourly" => Ok(ChangeFreq::Hourly), - "daily" => Ok(ChangeFreq::Daily), - "weekly" => Ok(ChangeFreq::Weekly), - "monthly" => Ok(ChangeFreq::Monthly), - "yearly" => Ok(ChangeFreq::Yearly), - "never" => Ok(ChangeFreq::Never), + "always" => Ok(Self::Always), + "hourly" => Ok(Self::Hourly), + "daily" => Ok(Self::Daily), + "weekly" => Ok(Self::Weekly), + "monthly" => Ok(Self::Monthly), + "yearly" => Ok(Self::Yearly), + "never" => Ok(Self::Never), _ => Err(SitemapError::InvalidChangeFreq(s.to_string())), } } @@ -80,26 +124,180 @@ impl FromStr for ChangeFreq { impl fmt::Display for ChangeFreq { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let s = match self { - ChangeFreq::Always => "always", - ChangeFreq::Hourly => "hourly", - ChangeFreq::Daily => "daily", - ChangeFreq::Weekly => "weekly", - ChangeFreq::Monthly => "monthly", - ChangeFreq::Yearly => "yearly", - ChangeFreq::Never => "never", - }; - write!(f, "{}", s) + f.write_str(self.as_str()) + } +} + +/// Represents a complete sitemap containing URL entries. +#[derive(Debug, Default, Clone)] +pub struct Sitemap { + entries: Vec, +} + +impl Sitemap { + /// Creates a new empty `Sitemap`. + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Creates a new `Sitemap` with the specified capacity. + /// + /// The capacity will be capped at [`MAX_URLS`] to prevent excessive memory allocation. + /// + /// # Arguments + /// + /// * `capacity` - The desired capacity for the sitemap + #[must_use] + pub fn with_capacity(capacity: usize) -> Self { + Self { + entries: Vec::with_capacity(capacity.min(MAX_URLS)), + } + } + + /// Returns the current number of entries in the sitemap. + #[must_use] + pub fn len(&self) -> usize { + self.entries.len() + } + + /// Checks if the sitemap is empty. + #[must_use] + pub fn is_empty(&self) -> bool { + self.entries.is_empty() + } + + /// Adds a new entry to the sitemap. + /// + /// # Errors + /// + /// Returns an error if adding the entry would exceed [`MAX_URLS`]. + pub fn add_entry( + &mut self, + entry: SiteMapData, + ) -> SitemapResult<()> { + if self.entries.len() >= MAX_URLS { + return Err(SitemapError::MaxUrlLimitExceeded( + self.entries.len(), + )); + } + self.entries.push(entry); + Ok(()) + } + + /// Adds multiple entries to the sitemap. + /// + /// # Errors + /// + /// Returns an error if adding any entry would exceed [`MAX_URLS`]. + pub fn add_entries(&mut self, entries: I) -> SitemapResult<()> + where + I: IntoIterator, + { + for entry in entries { + self.add_entry(entry)?; + } + Ok(()) + } + + /// Generates the XML representation of the sitemap. + /// + /// # Errors + /// + /// Returns an error if: + /// - XML writing fails + /// - The generated XML exceeds [`MAX_SITEMAP_SIZE`] + /// - UTF-8 encoding fails + pub fn to_xml(&self) -> SitemapResult { + let estimated_size = self.entries.len().saturating_mul(300); + let mut output = Vec::with_capacity(estimated_size); + let mut writer = EventWriter::new(&mut output); + + self.write_xml_header(&mut writer)?; + + for entry in &self.entries { + self.write_entry(&mut writer, entry)?; + } + + writer.write(XmlEvent::end_element())?; + + let xml = String::from_utf8(output) + .map_err(SitemapError::EncodingError)?; + + if xml.len() > MAX_SITEMAP_SIZE { + return Err(SitemapError::SitemapTooLarge); + } + + Ok(xml) + } + + fn write_xml_header( + &self, + writer: &mut EventWriter<&mut Vec>, + ) -> SitemapResult<()> { + writer.write(XmlEvent::StartDocument { + version: xml::common::XmlVersion::Version10, + encoding: Some("UTF-8"), + standalone: None, + })?; + + writer.write( + XmlEvent::start_element("urlset").default_ns(SITEMAP_XMLNS), + )?; + Ok(()) + } + + fn write_entry( + &self, + writer: &mut EventWriter<&mut Vec>, + entry: &SiteMapData, + ) -> SitemapResult<()> { + writer.write(XmlEvent::start_element("url"))?; + self.write_element(writer, "loc", entry.loc.as_str())?; + self.write_element(writer, "lastmod", &entry.lastmod)?; + self.write_element( + writer, + "changefreq", + entry.changefreq.as_str(), + )?; + writer.write(XmlEvent::end_element())?; + Ok(()) + } + + fn write_element( + &self, + writer: &mut EventWriter<&mut Vec>, + name: &str, + value: &str, + ) -> SitemapResult<()> { + writer.write(XmlEvent::start_element(name))?; + writer.write(XmlEvent::characters(value))?; + writer.write(XmlEvent::end_element())?; + Ok(()) } } /// Generates `SiteMapData` from metadata. /// +/// Creates a sitemap entry from a metadata hash map containing page information. +/// /// # Arguments -/// * `metadata` - A hashmap containing page metadata, including last build date, change frequency, and page location. +/// +/// * `metadata` - A hashmap containing page metadata with the following keys: +/// * `last_build_date` - The date the page was last modified +/// * `changefreq` - How frequently the page changes (optional, defaults to "weekly") +/// * `permalink` - The URL of the page (required) /// /// # Returns -/// A `SiteMapData` object populated with values from the metadata, or an error if the data is invalid. +/// +/// Returns a `SiteMapData` instance or an error if required data is missing or invalid. +/// +/// # Errors +/// +/// Returns an error if: +/// - The permalink is missing +/// - The URL is invalid +/// - The change frequency is invalid pub fn create_site_map_data( metadata: &HashMap, ) -> SitemapResult { @@ -120,32 +318,29 @@ pub fn create_site_map_data( })?; let loc = Url::parse(loc).map_err(SitemapError::UrlError)?; - Ok(SiteMapData { - changefreq, - lastmod, - loc, - }) -} - -lazy_static! { - static ref DATE_REGEX: Regex = - Regex::new(r"(\d{2}) (\w{3}) (\d{4})").unwrap(); + Ok(SiteMapData::new(loc, lastmod, changefreq)) } /// Converts date strings from various formats to "YYYY-MM-DD". /// -/// Supports conversion from "DD MMM YYYY" format and checks if input is already in target format. +/// Supports conversion from multiple date formats: +/// - "DD MMM YYYY" (e.g., "20 May 2023") +/// - W3C Datetime format +/// - Any format supported by the `DateTime` parser /// /// # Arguments -/// * `input` - A string slice representing the input date. +/// +/// * `input` - A string slice representing the input date /// /// # Returns -/// A string representing the date in "YYYY-MM-DD" format, or the original input if conversion is not applicable. +/// +/// A string in "YYYY-MM-DD" format, or the original input if conversion fails +#[must_use] pub fn convert_date_format(input: &str) -> String { if let Some(caps) = DATE_REGEX.captures(input) { - let day = caps.get(1).map(|m| m.as_str()).unwrap_or(""); - let month = caps.get(2).map(|m| m.as_str()).unwrap_or(""); - let year = caps.get(3).map(|m| m.as_str()).unwrap_or(""); + let day = caps.get(1).map_or("", |m| m.as_str()); + let month = caps.get(2).map_or("", |m| m.as_str()); + let year = caps.get(3).map_or("", |m| m.as_str()); let month_num = match month.to_lowercase().as_str() { "jan" => "01", @@ -163,213 +358,407 @@ pub fn convert_date_format(input: &str) -> String { _ => return input.to_string(), }; - return format!("{}-{}-{}", year, month_num, day); + return format!("{year}-{month_num}-{day}"); } - if let Ok(dt) = DateTime::parse(input) { - if let Ok(formatted) = dt.format("[year]-[month]-[day]") { - return formatted; + DateTime::parse(input) + .and_then(|dt| dt.format("[year]-[month]-[day]")) + .unwrap_or_else(|_| input.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::error::SitemapError; + + // --------------------------- + // SiteMapData-specific Tests + // --------------------------- + mod site_map_data_tests { + use super::*; + + /// Verifies that `SiteMapData::new` initializes all fields correctly. + #[test] + fn test_site_map_data_new() { + let loc = Url::parse("https://example.net").unwrap(); + let lastmod = "2026-01-01".to_string(); + let changefreq = ChangeFreq::Hourly; + + let data = SiteMapData::new( + loc.clone(), + lastmod.clone(), + changefreq, + ); + + assert_eq!(data.loc, loc); + assert_eq!(data.lastmod, lastmod); + assert_eq!(data.changefreq, changefreq); } } - input.to_string() -} + // --------------------------- + // create_site_map_data Tests + // --------------------------- + mod create_site_map_data_tests { + use super::*; + + /// Checks that `create_site_map_data` builds the correct `SiteMapData` from metadata. + #[test] + fn test_create_site_map_data() -> SitemapResult<()> { + let mut metadata = HashMap::new(); + let _ = metadata.insert( + "last_build_date".to_string(), + "20 May 2023".to_string(), + ); + let _ = metadata + .insert("changefreq".to_string(), "weekly".to_string()); + let _ = metadata.insert( + "permalink".to_string(), + "https://example.com".to_string(), + ); + + let site_map_data = create_site_map_data(&metadata)?; + + assert_eq!(site_map_data.lastmod, "2023-05-20"); + assert_eq!(site_map_data.changefreq, ChangeFreq::Weekly); + assert_eq!( + site_map_data.loc, + Url::parse("https://example.com")? + ); + + Ok(()) + } -/// Represents a complete sitemap. -#[derive(Debug, Default, Clone)] -pub struct Sitemap { - entries: Vec, -} + /// Ensures an error is raised if the `permalink` field is missing. + #[test] + fn test_create_site_map_data_missing_permalink() { + let mut metadata = HashMap::new(); + // Missing "permalink" key + let _ = metadata.insert( + "last_build_date".to_string(), + "20 May 2023".to_string(), + ); + let _ = metadata + .insert("changefreq".to_string(), "weekly".to_string()); + + let result = create_site_map_data(&metadata); + assert!( + matches!(result, Err(SitemapError::CustomError(msg)) if msg.contains("Missing permalink")), + "Expected an error about missing permalink" + ); + } -impl Sitemap { - /// Creates a new empty `Sitemap`. - pub fn new() -> Self { - Sitemap { - entries: Vec::new(), + /// Ensures an error is raised if the `permalink` is not a valid URL. + #[test] + fn test_create_site_map_data_invalid_permalink() { + let mut metadata = HashMap::new(); + let _ = metadata.insert( + "permalink".to_string(), + "not-a-valid-url".to_string(), + ); + // "last_build_date" omitted for brevity + + let result = create_site_map_data(&metadata); + assert!( + matches!(result, Err(SitemapError::UrlError(_))), + "Expected a URL parsing error" + ); } - } - /// Entry count of the sitemap. - pub fn entry_count(&self) -> usize { - self.entries.len() + /// Ensures an error is raised if `changefreq` is not recognized. + #[test] + fn test_create_site_map_data_invalid_changefreq() { + let mut metadata = HashMap::new(); + let _ = metadata.insert( + "permalink".to_string(), + "https://example.com".to_string(), + ); + let _ = metadata.insert( + "changefreq".to_string(), + "very-often".to_string(), + ); + + let result = create_site_map_data(&metadata); + assert!( + matches!(result, Err(SitemapError::InvalidChangeFreq(freq)) if freq == "very-often"), + "Expected an InvalidChangeFreq error for unrecognized freq" + ); + } } - /// Adds a new entry to the sitemap. - /// - /// # Arguments - /// * `entry` - The `SiteMapData` entry to add to the sitemap. - /// - /// # Returns - /// `Ok(())` if the entry was added successfully, or an error if the sitemap would exceed size limits. - pub fn add_entry( - &mut self, - entry: SiteMapData, - ) -> SitemapResult<()> { - if self.entries.len() >= MAX_URLS { - return Err(SitemapError::MaxUrlLimitExceeded( - self.entries.len(), - )); + // ---------------------- + // ChangeFreq Tests + // ---------------------- + mod change_freq_tests { + use super::*; + + /// Verifies that `ChangeFreq::as_str()` returns the correct string for each variant. + #[test] + fn test_change_freq_as_str() { + assert_eq!(ChangeFreq::Always.as_str(), "always"); + assert_eq!(ChangeFreq::Hourly.as_str(), "hourly"); + assert_eq!(ChangeFreq::Daily.as_str(), "daily"); + assert_eq!(ChangeFreq::Weekly.as_str(), "weekly"); + assert_eq!(ChangeFreq::Monthly.as_str(), "monthly"); + assert_eq!(ChangeFreq::Yearly.as_str(), "yearly"); + assert_eq!(ChangeFreq::Never.as_str(), "never"); } - self.entries.push(entry); - Ok(()) - } - /// Returns the current number of entries in the sitemap. - pub fn len(&self) -> usize { - self.entries.len() + /// Checks the `Display` implementation of a few `ChangeFreq` variants. + #[test] + fn test_change_freq_display() { + assert_eq!(ChangeFreq::Daily.to_string(), "daily"); + assert_eq!(ChangeFreq::Weekly.to_string(), "weekly"); + assert_eq!(ChangeFreq::Monthly.to_string(), "monthly"); + } + + /// Ensures `from_str` can parse valid variants and fails on invalid input. + #[test] + fn test_change_freq_from_str() { + assert_eq!( + "daily".parse::().unwrap(), + ChangeFreq::Daily + ); + assert_eq!( + "WEEKLY".parse::().unwrap(), + ChangeFreq::Weekly + ); + assert!("invalid".parse::().is_err()); + } } - /// Checks if the sitemap is empty. - pub fn is_empty(&self) -> bool { - self.entries.is_empty() + // -------------------------- + // convert_date_format Tests + // -------------------------- + mod date_format_tests { + use super::*; + + /// Checks that common date formats convert correctly (or remain unchanged if invalid). + #[test] + fn test_convert_date_format() { + assert_eq!( + convert_date_format("20 May 2023"), + "2023-05-20" + ); + assert_eq!(convert_date_format("2023-05-20"), "2023-05-20"); + assert_eq!( + convert_date_format("Invalid Date"), + "Invalid Date" + ); + } + + /// Covers edge cases, including empty strings, partially valid strings, etc. + #[test] + fn test_convert_date_format_edge_cases() { + assert_eq!(convert_date_format(""), ""); + assert_eq!( + convert_date_format("Invalid Date"), + "Invalid Date" + ); + assert_eq!( + convert_date_format("32 Jan 2023"), + "2023-01-32" + ); + assert_eq!( + convert_date_format("01 Foo 2023"), + "01 Foo 2023" + ); + } } - /// Generates the XML representation of the sitemap. - /// - /// # Returns - /// A string containing the XML representation of the sitemap, or an error if generation fails. - pub fn to_xml(&self) -> SitemapResult { - // Pre-allocate enough space in the Vec to avoid reallocations. - let estimated_size = self.entries.len() * 300; // Rough estimate of average entry size in bytes - let mut output = Vec::with_capacity(estimated_size); - let mut writer = EventWriter::new(&mut output); + // ---------------------- + // Sitemap Tests + // ---------------------- + mod sitemap_tests { + use super::*; + + /// Ensures a `Sitemap` created via `Sitemap::new()` is empty and has length 0. + #[test] + fn test_sitemap_new_is_empty() { + let sitemap = Sitemap::new(); + assert_eq!( + sitemap.len(), + 0, + "Newly created sitemap should have length 0" + ); + assert!( + sitemap.is_empty(), + "Newly created sitemap should be empty" + ); + } - writer.write(XmlEvent::StartDocument { - version: xml::common::XmlVersion::Version10, - encoding: Some("UTF-8"), - standalone: None, - })?; + /// Demonstrates `Sitemap::with_capacity` respects capacity up to `MAX_URLS`. + #[test] + fn test_sitemap_with_capacity() { + let sitemap = Sitemap::with_capacity(100); + assert!(sitemap.entries.capacity() >= 100); + assert!(sitemap.entries.capacity() <= MAX_URLS); + } - writer.write(XmlEvent::start_element("urlset").default_ns( - "http://www.sitemaps.org/schemas/sitemap/0.9", - ))?; + /// Verifies behavior of `Sitemap::len()` and `Sitemap::is_empty()` after adding an entry. + #[test] + fn test_sitemap_len_and_is_empty_with_entries( + ) -> SitemapResult<()> { + let mut sitemap = Sitemap::new(); + let entry = SiteMapData::new( + Url::parse("https://example.com")?, + "2023-05-20".to_string(), + ChangeFreq::Weekly, + ); + sitemap.add_entry(entry)?; + + assert_eq!( + sitemap.len(), + 1, + "Sitemap should have length 1 after adding one entry" + ); + assert!( + !sitemap.is_empty(), + "Sitemap should not be empty after adding an entry" + ); + Ok(()) + } - for entry in &self.entries { - // Start the element - writer.write(XmlEvent::start_element("url"))?; - - // entry - writer.write(XmlEvent::start_element("loc"))?; - writer.write(XmlEvent::characters(entry.loc.as_ref()))?; - writer.write(XmlEvent::end_element())?; - - // entry - writer.write(XmlEvent::start_element("lastmod"))?; - writer.write(XmlEvent::characters(&entry.lastmod))?; - writer.write(XmlEvent::end_element())?; - - // entry - writer.write(XmlEvent::start_element("changefreq"))?; - writer.write(XmlEvent::characters( - entry.changefreq.as_str(), - ))?; - writer.write(XmlEvent::end_element())?; + /// Tests adding a single entry to the sitemap. + #[test] + fn test_add_entry_single() -> SitemapResult<()> { + let mut sitemap = Sitemap::new(); + let entry = SiteMapData::new( + Url::parse("https://example.org")?, + "2025-12-30".to_string(), + ChangeFreq::Daily, + ); + sitemap.add_entry(entry.clone())?; + + assert_eq!(sitemap.len(), 1); + assert_eq!(sitemap.entries[0], entry); + Ok(()) + } - // End the element - writer.write(XmlEvent::end_element())?; + /// Tests bulk addition of entries using `Sitemap::add_entries`. + #[test] + fn test_add_entries_bulk() -> SitemapResult<()> { + let mut sitemap = Sitemap::new(); + let entries = vec![ + SiteMapData::new( + Url::parse("https://example.com/1")?, + "2024-01-01".to_string(), + ChangeFreq::Daily, + ), + SiteMapData::new( + Url::parse("https://example.com/2")?, + "2024-01-02".to_string(), + ChangeFreq::Weekly, + ), + ]; + + sitemap.add_entries(entries)?; + assert_eq!(sitemap.len(), 2); + Ok(()) } - // Close the element - writer.write(XmlEvent::end_element())?; + /// Validates the XML serialization logic of `Sitemap::to_xml()`. + #[test] + fn test_sitemap_to_xml() -> SitemapResult<()> { + let mut sitemap = Sitemap::new(); + sitemap.add_entry(SiteMapData::new( + Url::parse("https://example.com")?, + "2023-05-20".to_string(), + ChangeFreq::Weekly, + ))?; - // Convert the output Vec directly into a string without intermediate allocations - let xml = unsafe { String::from_utf8_unchecked(output) }; + let xml = sitemap.to_xml()?; - // Check size before returning to ensure the sitemap isn't too large - if xml.len() > 10 * 1024 * 1024 { - return Err(SitemapError::SitemapTooLarge); + assert!(xml.contains("")); + assert!(xml.contains("")); + assert!(xml.contains("https://example.com/")); + assert!(xml.contains("2023-05-20")); + assert!(xml.contains("weekly")); + Ok(()) } - Ok(xml) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use dtt::dtt_now; - - #[test] - fn test_create_site_map_data() -> SitemapResult<()> { - let mut metadata = HashMap::new(); - let _ = metadata.insert( - "last_build_date".to_string(), - "20 May 2023".to_string(), - ); - let _ = metadata - .insert("changefreq".to_string(), "weekly".to_string()); - let _ = metadata.insert( - "permalink".to_string(), - "https://example.com".to_string(), - ); - - let site_map_data = create_site_map_data(&metadata)?; - - assert_eq!(site_map_data.lastmod, "2023-05-20"); - assert_eq!(site_map_data.changefreq, ChangeFreq::Weekly); - assert_eq!( - site_map_data.loc, - Url::parse("https://example.com")? - ); - Ok(()) - } + /// Ensures that adding more URLs than allowed triggers `SitemapError::MaxUrlLimitExceeded`. + #[test] + fn test_sitemap_size_limit() -> SitemapResult<()> { + let mut sitemap = Sitemap::with_capacity(MAX_URLS); + + for i in 0..MAX_URLS { + sitemap.add_entry(SiteMapData::new( + Url::parse(&format!("https://example.com/{i}"))?, + "2023-05-20".to_string(), + ChangeFreq::Weekly, + ))?; + } + + let result = sitemap.add_entry(SiteMapData::new( + Url::parse("https://example.com/toomany")?, + "2023-05-20".to_string(), + ChangeFreq::Weekly, + )); - #[test] - fn test_convert_date_format() { - assert_eq!(convert_date_format("20 May 2023"), "2023-05-20"); - assert_eq!(convert_date_format("2023-05-20"), "2023-05-20"); - assert_eq!(convert_date_format("Invalid Date"), "Invalid Date"); - } + assert!( + matches!( + result, + Err(SitemapError::MaxUrlLimitExceeded(_)) + ), + "Expected an error when exceeding max URLs" + ); + Ok(()) + } - #[test] - fn test_sitemap_to_xml() -> SitemapResult<()> { - let mut sitemap = Sitemap::new(); - sitemap.add_entry(SiteMapData { - loc: Url::parse("https://example.com")?, - lastmod: "2023-05-20".to_string(), - changefreq: ChangeFreq::Weekly, - })?; + /// Tests that generating an extremely large XML triggers `SitemapError::SitemapTooLarge`. + #[test] + fn test_sitemap_too_large_error() { + let mut sitemap = Sitemap::new(); - let xml = sitemap.to_xml()?; - assert!(xml.contains("")); - assert!(xml.contains("")); - assert!(xml.contains("https://example.com/")); - assert!(xml.contains("2023-05-20")); - assert!(xml.contains("weekly")); - Ok(()) - } + // Construct a large string that should exceed MAX_SITEMAP_SIZE when serialized. + let huge_loc_string = format!( + "https://example.com/{}", + "a".repeat(MAX_SITEMAP_SIZE + 10) // Enough to push over the limit + ); - #[test] - fn test_sitemap_size_limit() -> SitemapResult<()> { - let mut sitemap = Sitemap::new(); - for i in 0..50_000 { - sitemap.add_entry(SiteMapData { - loc: Url::parse(&format!("https://example.com/{}", i))?, + let entry = SiteMapData { + loc: Url::parse(&huge_loc_string).unwrap(), lastmod: "2023-05-20".to_string(), changefreq: ChangeFreq::Weekly, - })?; + }; + + // Add a single entry that pushes us over the size threshold. + sitemap.add_entry(entry).unwrap(); + + let result = sitemap.to_xml(); + assert!( + matches!(result, Err(SitemapError::SitemapTooLarge)), + "Expected a SitemapTooLarge error" + ); } - assert!(matches!( - sitemap.add_entry(SiteMapData { - loc: Url::parse("https://example.com/toomany")?, - lastmod: "2023-05-20".to_string(), - changefreq: ChangeFreq::Weekly, - }), - Err(SitemapError::MaxUrlLimitExceeded(_)) - )); - Ok(()) - } - #[test] - fn test_dtt_now_macro() { - let now = dtt_now!(); - assert!(now.year() >= 2023); - } - #[test] - fn test_convert_date_format_edge_cases() { - assert_eq!(convert_date_format(""), ""); - assert_eq!(convert_date_format("Invalid Date"), "Invalid Date"); - assert_eq!(convert_date_format("32 Jan 2023"), "2023-01-32"); - assert_eq!(convert_date_format("01 Foo 2023"), "01 Foo 2023"); + /// Ensures that concurrent writes don't interfere with each other. + #[test] + fn test_concurrent_sitemap_read() -> SitemapResult<()> { + use std::sync::Arc; + use std::thread; + + let sitemap = Arc::new(Sitemap::new()); + let mut handles = Vec::new(); + + for _ in 0..10 { + let sitemap = Arc::clone(&sitemap); + handles.push(thread::spawn(move || { + assert_eq!(sitemap.len(), 0); + assert!(sitemap.is_empty()); + })); + } + + for handle in handles { + handle.join().map_err(|_| { + SitemapError::CustomError( + "Thread panicked during read test".to_string(), + ) + })?; + } + + Ok(()) + } } } diff --git a/src/utils.rs b/src/utils.rs index b82c63e..f30e6a5 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,3 +1,6 @@ +// Copyright © 2025 Sitemap Gen. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR MIT + use crate::{ ChangeFreq, SiteMapData, Sitemap, SitemapError, SitemapResult, }; @@ -568,39 +571,62 @@ mod tests { use std::sync::{Arc, Mutex}; use std::thread; + // Create test URLs let urls = Arc::new(Mutex::new(vec![ - Url::parse("http://example.com").unwrap(), - Url::parse("https://example.org").unwrap(), + Url::parse("http://example.com") + .map_err(SitemapError::UrlError)?, + Url::parse("https://example.org") + .map_err(SitemapError::UrlError)?, ])); let sitemap_result = Arc::new(Mutex::new(Sitemap::new())); + // Spawn threads let handles: Vec<_> = (0..10) .map(|_| { let urls = Arc::clone(&urls); let sitemap_result = Arc::clone(&sitemap_result); - thread::spawn(move || { - let mut sitemap = sitemap_result.lock().unwrap(); - let urls = urls.lock().unwrap(); + thread::spawn(move || -> SitemapResult<()> { + let sitemap = + &mut sitemap_result.lock().map_err(|e| { + SitemapError::CustomError(e.to_string()) + })?; + + let urls = urls.lock().map_err(|e| { + SitemapError::CustomError(e.to_string()) + })?; + for url in urls.iter() { - let entry = SiteMapData { - loc: url.clone(), - lastmod: "2024-01-01".to_string(), - changefreq: ChangeFreq::Weekly, - }; - sitemap.add_entry(entry).unwrap(); + let entry = SiteMapData::new( + url.clone(), + "2024-01-01".to_string(), + ChangeFreq::Weekly, + ); + sitemap.add_entry(entry)?; } + Ok(()) }) }) .collect(); + // Join threads and collect results for handle in handles { - handle.join().unwrap(); + handle.join().map_err(|_| { + SitemapError::CustomError("Thread panicked".to_string()) + })??; } - let sitemap = sitemap_result.lock().unwrap(); - assert_eq!(sitemap.entry_count(), 20, "Sitemap should contain 20 entries after concurrent generation"); + // Verify results + let sitemap = sitemap_result + .lock() + .map_err(|e| SitemapError::CustomError(e.to_string()))?; + + assert_eq!( + sitemap.len(), + 20, + "Sitemap should contain 20 entries after concurrent generation" + ); Ok(()) } diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs deleted file mode 100644 index 8b13789..0000000 --- a/tests/integration_tests.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tools/check_dependencies.sh b/tools/check_dependencies.sh new file mode 100644 index 0000000..49bfd5d --- /dev/null +++ b/tools/check_dependencies.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Set the path to Cargo.toml relative to the script's location +cargo_toml="$(dirname "$0")/../Cargo.toml" +# Set the directories to search in relative to the script's location +search_dirs=( + "$(dirname "$0")/../src/" + "$(dirname "$0")/../benches/" + "$(dirname "$0")/../examples/" + "$(dirname "$0")/../tests/" +) + +# Extract dependency names specifically from the `[dependencies]` section +dependencies=$(awk '/\[dependencies\]/ {flag=1; next} /^\[/{flag=0} flag {print}' "${cargo_toml}" | grep -oE '^[a-zA-Z0-9_-]+' || true) + +# Iterate over each dependency +while read -r dep; do + # Skip empty lines + [[ -z "${dep}" ]] && continue + + # Prepare a pattern to match Rust module imports (e.g., http-handle becomes http_handle) + dep_pattern=$(echo "${dep}" | tr '-' '_') + + # Check if the dependency is used in any of the specified directories + found=false + for dir in "${search_dirs[@]}"; do + if grep -qir "${dep_pattern}" "${dir}"; then + found=true + break + fi + done + + # If the dependency is not found in any directory, mark it as unused + if [[ "${found}" = false ]]; then + printf "🗑️ The \033[1m%s\033[0m crate is not required!\n" "${dep}" + fi +done <<< "${dependencies}"