diff --git a/Cargo.toml b/Cargo.toml index 8146fff..272ead7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,26 +1,34 @@ [package] authors = ["T.J. Telan "] -categories = ["parser-implementations", "encoding"] -description = "A parser for git repo urls based on url crate" +categories = ["parser-implementations"] +description = "A parser for urls used by git" documentation = "https://docs.rs/git-url-parse" -edition = "2021" -keywords = ["git", "url", "parsing", "normalize"] +edition = "2024" +keywords = ["git", "url", "parser"] license = "MIT" name = "git-url-parse" readme = "README.md" repository = "https://github.com/tjtelan/git-url-parse-rs" version = "0.4.6" -rust-version = "1.82" +rust-version = "1.85" [features] -default = [] -tracing = ["dep:tracing"] +default = ["url"] +# Enable Serialize/Deserialize on structs with `serde` crate +serde = ["dep:serde"] +# Enable debugging logging with `log` crate +log = ["dep:log"] +# Enable url parsing validation with `url` crate +url = ["dep:url"] [dependencies] -tracing = { version = "0.1", optional = true } -url = { version = "2.2" } -strum = { version = "^0.27", features = ["derive"] } -thiserror = "^2.0" +nom = "8" +getset = "0.1" +thiserror = "2" +serde = { version = "1", features = ["derive"], optional = true } +log = { version = "0.4", optional = true } +url = { version = "2.5", optional = true } [dev-dependencies] -env_logger = "^0.11" +env_logger = "0.11" +log = "0.4" diff --git a/README.md b/README.md index 5b5b31f..45e6d66 100644 --- a/README.md +++ b/README.md @@ -1,95 +1,117 @@ -# git-url-parse - [![Crates.io](https://img.shields.io/crates/v/git-url-parse)](https://crates.io/crates/git-url-parse) -![Crates.io MSRV](https://img.shields.io/crates/msrv/git-url-parse?label=rust-version) +[![Crates.io Total Downloads](https://img.shields.io/crates/d/git-url-parse?label=Crates.io%20Downloads)](https://crates.io/crates/git-url-parse) +![Crates.io 
MSRV](https://img.shields.io/crates/msrv/git-url-parse?label=Min%20Supported%20Rust%20version) [![Github actions CI status](https://github.com/tjtelan/git-url-parse-rs/actions/workflows/ci.yml/badge.svg)](https://github.com/tjtelan/git-url-parse-rs/actions/workflows/ci.yml) [![docs.rs](https://docs.rs/git-url-parse/badge.svg)](https://docs.rs/git-url-parse/) [![License](https://img.shields.io/github/license/tjtelan/git-url-parse-rs)](LICENSE) ![Maintenance](https://img.shields.io/maintenance/passively-maintained/2025) -Supports common protocols as specified by the [Pro Git book](https://git-scm.com/book/en/v2) +--- -See: [4.1 Git on the Server - The Protocols](https://git-scm.com/book/en/v2/Git-on-the-Server-The-Protocols) + -Supports parsing SSH/HTTPS repo urls for: -* Github -* Bitbucket -* Azure Devops +# Git Url Parse -See [tests/parse.rs](tests/parse.rs) for expected output for a variety of inputs. +Parses urls used by git (e.g. `git clone <url>`) ---- +## Features + +- 🔍 Parses `git clone` compatible urls into [`GitUrl`](https://docs.rs/git-url-parse/latest/git_url_parse/types/struct.GitUrl.html) + - Supports multiple Git URL schemes (SSH, HTTP, HTTPS, File) + - Inspired by [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986) with adaptations to support Git urls + +- 🏗️ Host provider info extraction + - Easy to implement trait [`GitProvider`](https://docs.rs/git-url-parse/latest/git_url_parse/types/provider/trait.GitProvider.html) for custom provider parsing + - Built-in support for multiple Git hosting providers + * [Generic](https://docs.rs/git-url-parse/latest/git_url_parse/types/provider/struct.GenericProvider.html) (`git@host:owner/repo.git` style urls) + * [GitLab](https://docs.rs/git-url-parse/latest/git_url_parse/types/provider/struct.GitLabProvider.html) + * [Azure DevOps](https://docs.rs/git-url-parse/latest/git_url_parse/types/provider/struct.AzureDevOpsProvider.html) + +## Quick Example + +```rust +use git_url_parse::{GitUrl, 
GitUrlParseError}; +use git_url_parse::types::provider::GitProvider; +use git_url_parse::types::provider::GenericProvider; + +fn main() -> Result<(), git_url_parse::GitUrlParseError> { + let http_url = GitUrl::parse("https://github.com/tjtelan/git-url-parse-rs.git")?; + + // Extract basic URL components + assert_eq!(http_url.host(), Some("github.com")); + assert_eq!(http_url.path(), "/tjtelan/git-url-parse-rs.git"); + + // Support ssh-based urls as well + let ssh_url = GitUrl::parse("git@github.com:tjtelan/git-url-parse-rs.git")?; + + assert_eq!(ssh_url.scheme(), Some("ssh")); + assert_eq!(ssh_url.host(), Some("github.com")); + assert_eq!(ssh_url.path(), "tjtelan/git-url-parse-rs.git"); + + // Extract provider-specific information + // Built-in support for Github (Generic), Gitlab, Azure Devops style urls + let provider : GenericProvider = ssh_url.provider_info()?; + assert_eq!(provider.owner(), "tjtelan"); + assert_eq!(provider.repo(), "git-url-parse-rs"); + + // Implement your own provider + #[derive(Debug, Clone, PartialEq, Eq)] + struct CustomProvider; + + impl GitProvider, GitUrlParseError> for CustomProvider { + fn from_git_url(_url: &GitUrl) -> Result { + // Your custom provider parsing here + Ok(Self) + } + } + + let custom_provider: CustomProvider = ssh_url.provider_info()?; + let expected = CustomProvider; + assert_eq!(custom_provider, expected); + + Ok(()) +} +``` -URLs that use the `ssh://` protocol (implicitly or explicitly) undergo a small normalization process in order to be parsed. +## Limitations -Internally uses `Url::parse()` from the [Url](https://crates.io/crates/url) crate after normalization. + Intended only for git repo urls. Url spec [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986) is not fully implemented. 
-## Examples +- No support for: + - Query parameters + - Fragment identifiers + - Percent-encoding + - Complex IP address formats -### Run example with debug output +## Install ```shell -$ RUST_LOG=git_url_parse cargo run --example multi -$ RUST_LOG=git_url_parse cargo run --example trim_auth +cargo add git-url-parse ``` -### Simple usage and output +### Cargo Features -```bash -$ cargo run --example readme -``` +#### `log` +Enable for internal `debug!` output from [log](https://docs.rs/log/latest) +#### `serde` +Enable for [serde](https://docs.rs/serde/latest/) `Serialize`/`Deserialize` on [`GitUrl`](https://docs.rs/git-url-parse/latest/git_url_parse/types/struct.GitUrl.html) +#### `url` +(**enabled by default**) -```rust -use git_url_parse::GitUrl; +Uses [url](https://docs.rs/url/latest/) during parsing for full url validation -fn main() { - println!("SSH: {:#?}", GitUrl::parse("git@github.com:tjtelan/git-url-parse-rs.git")); - println!("HTTPS: {:#?}", GitUrl::parse("https://github.com/tjtelan/git-url-parse-rs")); -} -``` + + +## Migration from 0.4.x and earlier + +This crate was one of my first serious projects in Rust. Because I was still learning, it had some maintenance problems and was a bit awkward to use. In version 0.5, I rewrote most of it to fix those issues. + +The [`GitUrl`](https://docs.rs/git-url-parse/latest/git_url_parse/types/struct.GitUrl.html) struct is only meant to handle parsing urls used by `git`, which the [url](https://docs.rs/url/latest/url) crate doesn't handle. The recent updates make it so the input string is parsed and internally stored into a simple string slice (`&str`). And, instead of exposing all the internal fields of the struct, those details are hidden, and we use methods to interact with it. 
+ +The [`GitProvider`](https://docs.rs/git-url-parse/latest/git_url_parse/types/provider/trait.GitProvider.html) trait helps extract common pieces of information that are often found in different url patterns using the [`GitUrl::provider_info`](https://docs.rs/git-url-parse/latest/git_url_parse/types/struct.GitUrl.html#method.provider_info) method. Several example provider parsers are included to show how this works. The result of [`GitUrl::parse`](https://docs.rs/git-url-parse/latest/git_url_parse/types/struct.GitUrl.html#method.parse) is more straightforward to use, but the internal details are hidden, and working with provider-specific information at the git host level is more specialized. + +The most common pattern for git url paths, like `/owner/repo.git`, is handled by [`GenericProvider`](https://docs.rs/git-url-parse/latest/git_url_parse/types/provider/struct.GenericProvider.html). + +There's also [`AzureDevOpsProvider`](https://docs.rs/git-url-parse/latest/git_url_parse/types/provider/struct.AzureDevOpsProvider.html), which is designed for Azure DevOps urls that follow the `org`, `project`, `repo` pattern. 
-### Example Output -```bash -SSH: Ok( - GitUrl { - host: Some( - "github.com", - ), - name: "git-url-parse-rs", - owner: Some( - "tjtelan", - ), - organization: None, - fullname: "tjtelan/git-url-parse-rs", - scheme: Ssh, - user: Some( - "git", - ), - token: None, - port: None, - path: "tjtelan/git-url-parse-rs.git", - git_suffix: true, - scheme_prefix: false, - }, -) -HTTPS: Ok( - GitUrl { - host: Some( - "github.com", - ), - name: "git-url-parse-rs", - owner: Some( - "tjtelan", - ), - organization: None, - fullname: "tjtelan/git-url-parse-rs", - scheme: Https, - user: None, - token: None, - port: None, - path: "/tjtelan/git-url-parse-rs", - git_suffix: false, - scheme_prefix: true, - }, -) -``` \ No newline at end of file +Finally, there's a new supported provider called [`GitLabProvider`](https://docs.rs/git-url-parse/latest/git_url_parse/types/provider/struct.GitLabProvider.html), which is for GitLab urls. It supports the common `owner/repo` pattern shared with [`GenericProvider`](https://docs.rs/git-url-parse/latest/git_url_parse/types/provider/struct.GenericProvider.html), and also handles GitLab's subgroups. diff --git a/cliff.toml b/cliff.toml index 347c148..a4d290c 100644 --- a/cliff.toml +++ b/cliff.toml @@ -1,7 +1,9 @@ -# configuration file for git-cliff (0.1.0) +# git-cliff ~ configuration file +# https://git-cliff.org/docs/configuration [changelog] -# changelog header +# A Tera template to be rendered as the changelog's header. +# See https://keats.github.io/tera/docs/#introduction header = """ # Changelog\n All notable changes to this project will be documented in this file. @@ -9,88 +11,96 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).\n """ -# template for the changelog body -# https://tera.netlify.app/docs/#introduction +# A Tera template to be rendered for each release in the changelog. +# See https://keats.github.io/tera/docs/#introduction body = """ -{% if version %}\ - ## [{{ version | trim_start_matches(pat="v") }}](https://github.com/tjtelan/git-url-parse-rs/tree/{{version}}) - {{ timestamp | date(format="%Y-%m-%d") }} -{% else %}\ - ## [UNRELEASED] -{% endif %}\ - -{% for group, commits in commits - | filter(attribute="merge_commit", value=false) - | unique(attribute="message") - | group_by(attribute="group") %} +{% set package = "git-url-parse" %} +{%- macro remote_url() -%} + https://github.com/{{ remote.github.owner }}/{{ remote.github.repo }} +{%- endmacro -%} + +{% if version -%} + ## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }} +{% else -%} + ## [Unreleased] +{% endif -%} + +{% for group, commits in commits | group_by(attribute="group") %} ### {{ group | upper_first }} - {% for commit in commits %} - - {{ commit.message | upper_first | split(pat="\n") | first }}\ + {%- for commit in commits %} + - {{ commit.message | split(pat="\n") | first | upper_first | trim }}\ + {% if commit.remote.username %} by @{{ commit.remote.username }}{%- endif -%} + {% if commit.remote.pr_number %} in \ + [#{{ commit.remote.pr_number }}]({{ self::remote_url() }}/pull/{{ commit.remote.pr_number }}) \ + {%- endif -%} {% endfor %} -{% endfor %}\n +{% endfor %} + +{%- if github.contributors | filter(attribute="is_first_time", value=true) | length != 0 %} + ### New Contributors +{%- endif -%} + +{% for contributor in github.contributors | filter(attribute="is_first_time", value=true) %} + * @{{ contributor.username }} made their first contribution + {%- if contributor.pr_number %} in \ + [#{{ 
contributor.pr_number }}]({{ self::remote_url() }}/pull/{{ contributor.pr_number }}) \ + {%- endif %} +{%- endfor %}\n\n """ -# remove the leading and trailing whitespace from the template -trim = true -# changelog footer +# A Tera template to be rendered as the changelog's footer. +# See https://keats.github.io/tera/docs/#introduction footer = """ +{%- macro remote_url() -%} + https://github.com/{{ remote.github.owner }}/{{ remote.github.repo }} +{%- endmacro -%} + +{% for release in releases -%} + {% if release.version -%} + {% if release.previous.version -%} + [{{ release.version | trim_start_matches(pat="v") }}]: \ + {{ self::remote_url() }}/compare/{{ release.previous.version }}..{{ release.version }} + {% endif -%} + {% else -%} + [unreleased]: {{ self::remote_url() }}/compare/{{ release.previous.version }}..HEAD + {% endif -%} + \n +{% endfor %} """ +# Remove leading and trailing whitespaces from the changelog's body. +trim = true [git] -# parse the commits based on https://www.conventionalcommits.org +# Parse commits according to the conventional commits specification. +# See https://www.conventionalcommits.org conventional_commits = true -# filter out the commits that are not conventional +# Exclude commits that do not match the conventional commits specification. filter_unconventional = false - +# An array of regex based parsers to modify commit messages prior to further processing. commit_preprocessors = [ - { pattern = "\\(#([0-9]+)\\)", replace = "([#${1}](https://github.com/tjtelan/git-url-parse-rs/issues/${1}))" }, + # Remove issue numbers. + { pattern = '\((\w+\s)?#([0-9]+)\)', replace = "" }, ] - -# regex for parsing and grouping commits +# An array of regex based parsers for extracting data from the commit message. +# Assigns commits to groups. +# Optionally sets the commit's scope and can decide to exclude commits from further processing. 
commit_parsers = [ - - { message = ".*[Bb]ump", group = "Noise", skip = true }, - { message = ".*[Rr]evert", group = "Noise", skip = true }, - { message = ".*[Cl]ippy", group = "Noise", skip = true }, - { message = "^Merge pull request", group = "Noise", skip = true }, - - { message = "^test", group = "Fixed" }, - { message = "^.*[Ff]ix", group = "Fixed" }, - { message = "^[Rr]esolve", group = "Fixed" }, - - { message = "[Cc]ompile", group = "CI" }, - { message = "[Pp]ublish", group = "CI" }, - - { message = ".*[Dd]eprecate", group = "Removed" }, - { message = "^[Dd]isable", group = "Removed" }, - - { message = ".*[Aa]dd", group = "Added" }, - { message = ".*[Ss]upport", group = "Added" }, - { message = ".*[Mm]ake", group = "Added" }, - - { message = ".*[Rr]emove", group = "Removed" }, - { message = ".*[Dd]elete", group = "Removed" }, - { message = ".*[Dd]isable", group = "Removed" }, - - { message = "[Rr]elease", group = "CI" }, - { message = ".*[Ll]og", group = "CI" }, - { message = ".*[Bb]uild", group = "CI" }, - - { message = ".*[Uu]pdate", group = "Changed" }, - - { message = ".*[Cc]lean", group = "Other" }, - { message = ".*[Rr]efactor", group = "Other" }, - { message = "^.*", group = "Other" }, - + { message = "^[a|A]dd", group = "Added" }, + { message = "^[s|S]upport", group = "Added" }, + { message = "^[r|R]emove", group = "Removed" }, + { message = "^.*: add", group = "Added" }, + { message = "^.*: support", group = "Added" }, + { message = "^.*: remove", group = "Removed" }, + { message = "^.*: delete", group = "Removed" }, + { message = "^test", group = "Fixed" }, + { message = "^fix", group = "Fixed" }, + { message = "^.*: fix", group = "Fixed" }, + { message = "^.*", group = "Changed" }, ] -# filter out the commits that are not matched by commit parsers -filter_commits = true -# glob pattern for matching git tags -tag_pattern = "v[0-9]*" -# regex for skipping tags -skip_tags = ".*-rc|.*-alpha|.*-beta" -# regex for ignoring tags -ignore_tags = "" -# 
sort the tags chronologically -date_order = false -# sort the commits inside sections by oldest/newest order -sort_commits = "oldest" +# Exclude commits that are not matched by any commit parser. +filter_commits = false +# Order releases topologically instead of chronologically. +topo_order = false +# Order of commits in each group/release within the changelog. +# Allowed values: newest, oldest +sort_commits = "newest" \ No newline at end of file diff --git a/examples/multi.rs b/examples/multi.rs index 529913f..ce59157 100644 --- a/examples/multi.rs +++ b/examples/multi.rs @@ -4,8 +4,10 @@ fn main() -> Result<(), GitUrlParseError> { env_logger::init(); let test_vec = vec![ - "https://github.com/tjtelan/orbitalci.git", - "git@github.com:tjtelan/orbitalci.git", + "https://github.com/tjtelan/git-url-parse-rs.git", + "git@github.com:tjtelan/git-url-parse-rs.git", + "git@hostname:22/path/to/repo.git", + "ssh://git@github.com:22/asdf/asdf.git", "https://token:x-oauth-basic@host.xz/path/to/repo.git/", "https://x-token-auth:token@host.xz/path/to/repo.git/", "git+ssh://git@some-host.com/and-the-path/name", @@ -15,6 +17,7 @@ fn main() -> Result<(), GitUrlParseError> { "~/path/to/repo.git/", "./path/to/repo.git/", "./path/to/repo.git", + "/path/to/repo.git", "../test_repo", "..\\test_repo", "git@ssh.dev.azure.com:v3/CompanyName/ProjectName/RepoName", @@ -24,7 +27,7 @@ fn main() -> Result<(), GitUrlParseError> { ]; for test_url in test_vec { - let parsed = GitUrl::parse(test_url)?; + let parsed = GitUrl::parse(test_url).unwrap(); println!("Original: {}", test_url); println!("Parsed: {}", parsed); println!("{:?}\n", parsed); diff --git a/examples/trim_auth.rs b/examples/trim_auth.rs index f51f8c9..a685035 100644 --- a/examples/trim_auth.rs +++ b/examples/trim_auth.rs @@ -25,7 +25,7 @@ fn main() -> Result<(), GitUrlParseError> { println!("Original: {}", test_url); println!( "Parsed + Trimmed: {}\n", - GitUrl::parse(test_url)?.trim_auth() + 
GitUrl::parse(test_url).unwrap().trim_auth() ); } Ok(()) diff --git a/src/lib.rs b/src/lib.rs index 773f35d..f97944a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,522 +1,99 @@ -use std::fmt; -use std::str::FromStr; -use strum::{Display, EnumString, VariantNames}; -use thiserror::Error; -use url::Url; - -#[cfg(feature = "tracing")] -use tracing::debug; - -/// Supported uri schemes for parsing -#[derive(Debug, PartialEq, Eq, EnumString, VariantNames, Clone, Display, Copy)] -#[strum(serialize_all = "kebab_case")] -pub enum Scheme { - /// Represents `file://` url scheme - File, - /// Represents `ftp://` url scheme - Ftp, - /// Represents `ftps://` url scheme - Ftps, - /// Represents `git://` url scheme - Git, - /// Represents `git+ssh://` url scheme - #[strum(serialize = "git+ssh")] - GitSsh, - /// Represents `http://` url scheme - Http, - /// Represents `https://` url scheme - Https, - /// Represents `ssh://` url scheme - Ssh, - /// Represents No url scheme - Unspecified, -} - -/// GitUrl represents an input url that is a url used by git -/// Internally during parsing the url is sanitized and uses the `url` crate to perform -/// the majority of the parsing effort, and with some extra handling to expose -/// metadata used my many git hosting services -#[derive(Debug, PartialEq, Eq, Clone)] -pub struct GitUrl { - /// The fully qualified domain name (FQDN) or IP of the repo - pub host: Option, - /// The name of the repo - pub name: String, - /// The owner/account/project name - pub owner: Option, - /// The organization name. 
Supported by Azure DevOps - pub organization: Option, - /// The full name of the repo, formatted as "owner/name" - pub fullname: String, - /// The git url scheme - pub scheme: Scheme, - /// The authentication user - pub user: Option, - /// The oauth token (could appear in the https urls) - pub token: Option, - /// The non-conventional port where git service is hosted - pub port: Option, - /// The path to repo w/ respect to user + hostname - pub path: String, - /// Indicate if url uses the .git suffix - pub git_suffix: bool, - /// Indicate if url explicitly uses its scheme - pub scheme_prefix: bool, -} - -/// Build the printable GitUrl from its components -impl fmt::Display for GitUrl { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let scheme_prefix = match self.scheme_prefix { - true => format!("{}://", self.scheme), - false => String::new(), - }; - - let auth_info = match self.scheme { - Scheme::Ssh | Scheme::Git | Scheme::GitSsh => { - if let Some(user) = &self.user { - format!("{}@", user) - } else { - String::new() - } - } - Scheme::Http | Scheme::Https => match (&self.user, &self.token) { - (Some(user), Some(token)) => format!("{}:{}@", user, token), - (Some(user), None) => format!("{}@", user), - (None, Some(token)) => format!("{}@", token), - (None, None) => String::new(), - }, - _ => String::new(), - }; - - let host = match &self.host { - Some(host) => host.to_string(), - None => String::new(), - }; - - let port = match &self.port { - Some(p) => format!(":{}", p), - None => String::new(), - }; - - let path = match &self.scheme { - Scheme::Ssh => { - if self.port.is_some() { - format!("/{}", &self.path) - } else { - format!(":{}", &self.path) - } - } - _ => self.path.to_string(), - }; - - let git_url_str = format!("{}{}{}{}{}", scheme_prefix, auth_info, host, port, path); - - write!(f, "{}", git_url_str) - } -} - -impl Default for GitUrl { - fn default() -> Self { - GitUrl { - host: None, - name: "".to_string(), - owner: None, - organization: 
None, - fullname: "".to_string(), - scheme: Scheme::Unspecified, - user: None, - token: None, - port: None, - path: "".to_string(), - git_suffix: false, - scheme_prefix: false, - } - } -} - -impl FromStr for GitUrl { - type Err = GitUrlParseError; - - fn from_str(s: &str) -> Result { - GitUrl::parse(s) - } -} - -impl GitUrl { - /// Returns `GitUrl` after removing `user` and `token` values - /// Intended use-case is for non-destructive printing GitUrl excluding any embedded auth info - pub fn trim_auth(&self) -> GitUrl { - let mut new_giturl = self.clone(); - new_giturl.user = None; - new_giturl.token = None; - new_giturl - } - - /// Returns a `Result` after normalizing and parsing `url` for metadata - pub fn parse(url: &str) -> Result { - // Normalize the url so we can use Url crate to process ssh urls - let normalized = normalize_url(url)?; - - // Some pre-processing for paths - let scheme = if let Ok(scheme) = Scheme::from_str(normalized.scheme()) { - scheme - } else { - return Err(GitUrlParseError::UnsupportedScheme( - normalized.scheme().to_string(), - )); - }; - if normalized.path().is_empty() { - return Err(GitUrlParseError::EmptyPath); - } - - // Normalized ssh urls can always have their first '/' removed - let urlpath = match &scheme { - Scheme::Ssh => { - // At the moment, we're relying on url::Url's parse() behavior to not duplicate - // the leading '/' when we normalize - normalized.path()[1..].to_string() - } - _ => normalized.path().to_string(), - }; - - let git_suffix_check = &urlpath.ends_with(".git"); - - // Parse through path for name,owner,organization - // Support organizations for Azure Devops - #[cfg(feature = "tracing")] - debug!("The urlpath: {:?}", &urlpath); - - // Most git services use the path for metadata in the same way, so we're going to separate - // the metadata - // ex. 
github.com/accountname/reponame - // owner = accountname - // name = reponame - // - // organizations are going to be supported on a per-host basis - let splitpath = &urlpath.rsplit_terminator('/').collect::>(); - - #[cfg(feature = "tracing")] - debug!("rsplit results for metadata: {:?}", splitpath); - - let name = splitpath[0].trim_end_matches(".git").to_string(); - - // TODO: I think here is where we want to update the url pattern identification step.. I want to be able to have a hint that the user can pass - - let (owner, organization, fullname) = match &scheme { - // We're not going to assume anything about metadata from a filepath - Scheme::File => (None::, None::, name.clone()), - _ => { - let mut fullname: Vec<&str> = Vec::new(); - - // TODO: Add support for parsing out orgs from these urls - let hosts_w_organization_in_path = ["dev.azure.com", "ssh.dev.azure.com"]; - //vec!["dev.azure.com", "ssh.dev.azure.com", "visualstudio.com"]; - - let host_str = if let Some(host) = normalized.host_str() { - host - } else { - return Err(GitUrlParseError::UnsupportedUrlHostFormat); - }; - - match hosts_w_organization_in_path.contains(&host_str) { - true => { - #[cfg(feature = "tracing")] - debug!("Found a git provider with an org"); - - // The path differs between git:// and https:// schemes - - match &scheme { - // Example: "git@ssh.dev.azure.com:v3/CompanyName/ProjectName/RepoName", - Scheme::Ssh => { - // Organization - fullname.push(splitpath[2]); - // Project/Owner name - fullname.push(splitpath[1]); - // Repo name - fullname.push(splitpath[0]); - - ( - Some(splitpath[1].to_string()), - Some(splitpath[2].to_string()), - fullname.join("/"), - ) - } - // Example: "https://CompanyName@dev.azure.com/CompanyName/ProjectName/_git/RepoName", - Scheme::Https => { - // Organization - fullname.push(splitpath[3]); - // Project/Owner name - fullname.push(splitpath[2]); - // Repo name - fullname.push(splitpath[0]); - - ( - Some(splitpath[2].to_string()), - 
Some(splitpath[3].to_string()), - fullname.join("/"), - ) - } - - // TODO: I'm not sure if I want to support throwing this error long-term - _ => return Err(GitUrlParseError::UnexpectedScheme), - } - } - false => { - if !url.starts_with("ssh") && splitpath.len() < 2 { - return Err(GitUrlParseError::UnexpectedFormat); - } - - let position = match splitpath.len() { - 0 => return Err(GitUrlParseError::UnexpectedFormat), - 1 => 0, - _ => 1, - }; - - // push owner - fullname.push(splitpath[position]); - // push name - fullname.push(name.as_str()); - - ( - Some(splitpath[position].to_string()), - None::, - fullname.join("/"), - ) - } - } - } - }; - - let final_host = match scheme { - Scheme::File => None, - _ => normalized.host_str().map(|h| h.to_string()), - }; - - let final_path = match scheme { - Scheme::File => { - if let Some(host) = normalized.host_str() { - format!("{}{}", host, urlpath) - } else { - urlpath - } - } - _ => urlpath, - }; - - Ok(GitUrl { - host: final_host, - name, - owner, - organization, - fullname, - scheme, - user: match normalized.username().to_string().len() { - 0 => None, - _ => Some(normalized.username().to_string()), - }, - token: normalized.password().map(|p| p.to_string()), - port: normalized.port(), - path: final_path, - git_suffix: *git_suffix_check, - scheme_prefix: url.contains("://") || url.starts_with("git:"), - }) - } -} - -/// `normalize_ssh_url` takes in an ssh url that separates the login info -/// from the path into with a `:` and replaces it with `/`. 
-/// -/// Prepends `ssh://` to url -/// -/// Supports absolute and relative paths -fn normalize_ssh_url(url: &str) -> Result { - let u = url.split(':').collect::>(); - - match u.len() { - 2 => { - #[cfg(feature = "tracing")] - debug!("Normalizing ssh url: {:?}", u); - normalize_url(&format!("ssh://{}/{}", u[0], u[1])) - } - 3 => { - #[cfg(feature = "tracing")] - debug!("Normalizing ssh url with ports: {:?}", u); - normalize_url(&format!("ssh://{}:{}/{}", u[0], u[1], u[2])) - } - _default => Err(GitUrlParseError::UnsupportedSshUrlFormat), - } -} - -/// `normalize_file_path` takes in a filepath and uses `Url::from_file_path()` to parse -/// -/// Prepends `file://` to url -#[cfg(any(unix, windows, target_os = "redox", target_os = "wasi"))] -fn normalize_file_path(filepath: &str) -> Result { - let fp = Url::from_file_path(filepath); - - match fp { - Ok(path) => Ok(path), - Err(_e) => { - if let Ok(file_url) = normalize_url(&format!("file://{}", filepath)) { - Ok(file_url) - } else { - Err(GitUrlParseError::FileUrlNormalizeFailedSchemeAdded) - } - } - } -} - -#[cfg(target_arch = "wasm32")] -fn normalize_file_path(_filepath: &str) -> Result { - unreachable!() -} - -/// `normalize_url` takes in url as `&str` and takes an opinionated approach to identify -/// `ssh://` or `file://` urls that require more information to be added so that -/// they can be parsed more effectively by `url::Url::parse()` -pub fn normalize_url(url: &str) -> Result { - #[cfg(feature = "tracing")] - debug!("Processing: {:?}", &url); - - // TODO: Should this be extended to check for any whitespace? - // Error if there are null bytes within the url - // https://github.com/tjtelan/git-url-parse-rs/issues/16 - if url.contains('\0') { - return Err(GitUrlParseError::FoundNullBytes); - } - - // We're going to remove any trailing slash before running through Url::parse - let trim_url = url.trim_end_matches('/'); - - // TODO: Remove support for this form when I go to next major version. 
- // I forget what it supports, and it isn't obvious after searching for examples - // normalize short git url notation: git:host/path - let url_to_parse = if trim_url.starts_with("git:") && !trim_url.starts_with("git://") { - trim_url.replace("git:", "git://") - } else { - trim_url.to_string() - }; - - let url_parse = Url::parse(&url_to_parse); - - Ok(match url_parse { - Ok(u) => { - match Scheme::from_str(u.scheme()) { - Ok(_p) => u, - Err(_e) => { - // Catch case when an ssh url is given w/o a user - #[cfg(feature = "tracing")] - debug!("Scheme parse fail. Assuming a userless ssh url"); - if let Ok(ssh_url) = normalize_ssh_url(trim_url) { - ssh_url - } else { - return Err(GitUrlParseError::SshUrlNormalizeFailedNoScheme); - } - } - } - } - - // If we're here, we're only looking for Scheme::Ssh or Scheme::File - // TODO: Add test for this - Err(url::ParseError::RelativeUrlWithoutBase) => { - // Assuming we have found Scheme::Ssh if we can find an "@" before ":" - // Otherwise we have Scheme::File - //let re = Regex::new(r"^\S+(@)\S+(:).*$").with_context(|| { - // "Failed to build ssh git url regex for testing against url".to_string() - //})?; - - match is_ssh_url(trim_url) { - true => { - #[cfg(feature = "tracing")] - debug!("Scheme::SSH match for normalization"); - normalize_ssh_url(trim_url)? - } - false => { - #[cfg(feature = "tracing")] - debug!("Scheme::File match for normalization"); - normalize_file_path(trim_url)? - } - } - } - Err(err) => { - return Err(GitUrlParseError::from(err)); - } - }) -} - -// Valid ssh `url` for cloning have a usernames, -// but we don't require it classification or parsing purposes -// However a path must be specified with a `:` -fn is_ssh_url(url: &str) -> bool { - // if we do not have a path - if !url.contains(':') { - return false; - } - - // if we have a username, expect it before the path (Are usernames with colons valid?) 
- if let (Some(at_pos), Some(colon_pos)) = (url.find('@'), url.find(':')) { - if colon_pos < at_pos { - return false; - } - - // Make sure we provided a username, and not just `@` - let parts: Vec<&str> = url.split('@').collect(); - return parts.len() == 2 || parts[0].is_empty(); - } - - // it's an ssh url if we have a domain:path pattern - let parts: Vec<&str> = url.split(':').collect(); - - // FIXME: I am not sure how to validate a url with a port - //if parts.len() != 3 && !parts[0].is_empty() && !parts[1].is_empty() && !parts[2].is_empty() { - // return false; - //} - - // This should also handle if a port is specified - // no port example: ssh://user@domain:path/to/repo.git - // port example: ssh://user@domain:port/path/to/repo.git - parts.len() == 2 && parts[0].is_empty() && parts[1].is_empty() -} - -#[derive(Error, Debug, PartialEq, Eq)] -pub enum GitUrlParseError { - #[error("Error from Url crate: {0}")] - UrlParseError(#[from] url::ParseError), - - #[error("No url scheme was found, then failed to normalize as ssh url.")] - SshUrlNormalizeFailedNoScheme, - - #[error("No url scheme was found, then failed to normalize as ssh url after adding 'ssh://'")] - SshUrlNormalizeFailedSchemeAdded, - - #[error("Failed to normalize as ssh url after adding 'ssh://'")] - SshUrlNormalizeFailedSchemeAddedWithPorts, - - #[error("No url scheme was found, then failed to normalize as file url.")] - FileUrlNormalizeFailedNoScheme, - - #[error( - "No url scheme was found, then failed to normalize as file url after adding 'file://'" - )] - FileUrlNormalizeFailedSchemeAdded, - - #[error("Git Url not in expected format")] - UnexpectedFormat, - - // FIXME: Keep an eye on this error for removal - #[error("Git Url for host using unexpected scheme")] - UnexpectedScheme, - - #[error("Scheme unsupported: {0}")] - UnsupportedScheme(String), - #[error("Host from Url cannot be str or does not exist")] - UnsupportedUrlHostFormat, - #[error("Git Url not in expected format for SSH")] - 
UnsupportedSshUrlFormat, - #[error("Normalized URL has no path")] - EmptyPath, - - #[error("Found null bytes within input url before parsing")] - FoundNullBytes, -} +#![deny(missing_docs)] +#![deny(clippy::missing_docs_in_private_items)] +#![allow(rustdoc::redundant_explicit_links)] // for cargo-rdme + +//! # Git Url Parse +//! +//! Parses urls used by git (e.g. `git clone <url>`) +//! +//! ## Features +//! +//! - 🔍 Parses `git clone` compatible urls into [`GitUrl`](crate::types::GitUrl) +//! - Supports multiple Git URL schemes (SSH, HTTP, HTTPS, File) +//! - Inspired by [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986) with adaptations to support Git urls +//! +//! - 🏗️ Host provider info extraction +//! - Easy to implement trait [`GitProvider`](crate::types::provider::GitProvider) for custom provider parsing +//! - Built-in support for multiple Git hosting providers +//! * [Generic](crate::types::provider::GenericProvider) (`git@host:owner/repo.git` style urls) +//! * [GitLab](crate::types::provider::GitLabProvider) +//! * [Azure DevOps](crate::types::provider::AzureDevOpsProvider) +//! +//! ## Quick Example +//! +//! ```rust +//! use git_url_parse::{GitUrl, GitUrlParseError}; +//! use git_url_parse::types::provider::GitProvider; +//! use git_url_parse::types::provider::GenericProvider; +//! +//! fn main() -> Result<(), git_url_parse::GitUrlParseError> { +//! let http_url = GitUrl::parse("https://github.com/tjtelan/git-url-parse-rs.git")?; +//! +//! // Extract basic URL components +//! assert_eq!(http_url.host(), Some("github.com")); +//! assert_eq!(http_url.path(), "/tjtelan/git-url-parse-rs.git"); +//! +//! // Support ssh-based urls as well +//! let ssh_url = GitUrl::parse("git@github.com:tjtelan/git-url-parse-rs.git")?; +//! +//! assert_eq!(ssh_url.scheme(), Some("ssh")); +//! assert_eq!(ssh_url.host(), Some("github.com")); +//! assert_eq!(ssh_url.path(), "tjtelan/git-url-parse-rs.git"); +//! +//! // Extract provider-specific information +//! 
// Built-in support for Github (Generic), Gitlab, Azure Devops style urls +//! let provider : GenericProvider = ssh_url.provider_info()?; +//! assert_eq!(provider.owner(), "tjtelan"); +//! assert_eq!(provider.repo(), "git-url-parse-rs"); +//! +//! // Implement your own provider +//! #[derive(Debug, Clone, PartialEq, Eq)] +//! struct CustomProvider; +//! +//! impl GitProvider<GitUrl<'_>, GitUrlParseError> for CustomProvider { +//! fn from_git_url(_url: &GitUrl) -> Result<Self, GitUrlParseError> { +//! // Your custom provider parsing here +//! Ok(Self) +//! } +//! } +//! +//! let custom_provider: CustomProvider = ssh_url.provider_info()?; +//! let expected = CustomProvider; +//! assert_eq!(custom_provider, expected); +//! +//! Ok(()) +//! } +//! ``` +//! +//! ## Limitations +//! +//! Intended only for git repo urls. Url spec [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986) is not fully implemented. +//! +//! - No support for: +//! - Query parameters +//! - Fragment identifiers +//! - Percent-encoding +//! - Complex IP address formats +//! +//! ## Install +//! +//! ```shell +//! cargo add git-url-parse +//! ``` +//! +//! ### Cargo Features +//! +//! #### `log` +//! Enable for internal `debug!` output from [log](https://docs.rs/log/latest) +//! #### `serde` +//! Enable for [serde](https://docs.rs/serde/latest/) `Serialize`/`Deserialize` on [`GitUrl`](crate::types::GitUrl) +//! #### `url` +//! (**enabled by default**) +//! +//! Uses [url](https://docs.rs/url/latest/) during parsing for full url validation +//! + +pub mod types; + +/// Re-exports +pub use types::{GitUrl, GitUrlParseError}; diff --git a/src/types/error.rs b/src/types/error.rs new file mode 100644 index 0000000..7d24608 --- /dev/null +++ b/src/types/error.rs @@ -0,0 +1,64 @@ +//! # GitUrl error handling +//! +//! 
Error struct to use as Err for parsing Git urls + +use thiserror::Error; + +/// Internal error type for `GitUrl` for parsing errors +#[derive(Error, Debug, PartialEq, Eq)] +pub enum GitUrlParseError { + #[cfg(feature = "url")] + /// Error originating from the `url` crate during validation + #[error("Error from Url crate: {0}")] + UrlParseError(#[from] url::ParseError), + + /// Parsing error converted from `nom` crate + #[error("Nom crate parsing error: {0}")] + NomParseError(String), + + /// Git url must contain a non-empty path + #[error("Git Url must have a path")] + InvalidPathEmpty, + + /// Invalid port number detected + #[error("Invalid port number")] + InvalidPortNumber, + + /// Passwords are only supported in HTTP-like urls + #[error("Password only supported by httplike urls")] + InvalidPasswordUnsupported, + + /// File-like url must follow filesystem path patterns + #[error("Filelike urls expect only scheme and/or path")] + InvalidFilePattern, + + /// `GitUrl` not supported by the [`GitProvider`](crate::types::provider::GitProvider) + #[error("GitUrl not supported by provider")] + ProviderUnsupported, + + /// Detected null bytes in the input url + #[error("Found null bytes within input url before parsing")] + FoundNullBytes, + + /// Failed to extract provider-specific info from url + #[error("Provider info parse failed: {0}")] + ProviderParseFail(String), + + /// Catch-all error for unexpected failures during parsing + #[error("Unexpected error occurred during parsing")] + UnexpectedError, +} + +/// Converts `nom` errors carrying `(remaining input, ErrorKind)` into [`GitUrlParseError`] +impl<'a> From<nom::Err<(&'a str, nom::error::ErrorKind)>> for GitUrlParseError { + fn from(err: nom::Err<(&'a str, nom::error::ErrorKind)>) -> Self { + match err { + // `Error` and `Failure` carry the same payload and are reported identically + nom::Err::Error((input, kind)) | nom::Err::Failure((input, kind)) => { + GitUrlParseError::NomParseError(format!("Parse error at: {input}, kind: {kind:?}",)) + } + // `Incomplete` has no input position to report + nom::Err::Incomplete(_) => GitUrlParseError::UnexpectedError, + } + } +} diff --git 
a/src/types/mod.rs b/src/types/mod.rs new file mode 100644 index 0000000..adbe307 --- /dev/null +++ b/src/types/mod.rs @@ -0,0 +1,334 @@ +//! # GitUrl internal types +//! +//! Internal types and parsing logic for Git urls +//! + +mod error; +mod spec; +use spec::*; +pub mod provider; + +pub use error::GitUrlParseError; + +use core::str; +use std::fmt; + +use getset::{CloneGetters, CopyGetters, Setters}; +#[cfg(feature = "log")] +use log::debug; +use nom::Finish; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +/// Assigned as a label during parsing for different Git URL types. +/// Some printing or `GitProvider` parsing behavior are influenced by this type. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub(crate) enum GitUrlParseHint { + /// The default status + #[default] + Unknown, + /// When `ssh` is in the scheme, or a `:` is used as initial path separator + Sshlike, + /// When `file` is in scheme, or filesystem-like relative paths + Filelike, + /// Default network scheme if not `ssh`. If `:` is used as initial path separator in the userinfo + Httplike, +} + +/// Represents a parsed Git repository url +/// +/// GitUrl is an input url used by git. +/// Parsing of the url inspired by rfc3986, but does not strictly cover the spec +/// Optional, but by default, uses the `url` crate to perform a final validation of the parsing effort +#[derive(Clone, CopyGetters, CloneGetters, Debug, Default, Setters, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct GitUrl<'url> { + /// scheme name (i.e. `scheme://`) + #[getset(get_copy = "pub", set = "pub(crate)")] + scheme: Option<&'url str>, + /// user name userinfo + #[getset(get_copy = "pub", set = "pub(crate)")] + user: Option<&'url str>, + /// password userinfo provided with `user` (i.e. `user`:`password`@...) 
+ #[getset(get_copy = "pub", set = "pub(crate)")] + password: Option<&'url str>, + /// The hostname or IP of the repo host + #[getset(get_copy = "pub")] + host: Option<&'url str>, + /// The port number of the repo host, if specified + #[getset(get_copy = "pub")] + port: Option, + /// File or network path to repo + #[getset(get_copy = "pub", set = "pub(crate)")] + path: &'url str, + /// If we should print `scheme://` from input or derived during parsing + #[getset(get_copy = "pub", set = "pub(crate)")] + print_scheme: bool, + /// Pattern style of url derived during parsing + #[getset(get_copy = "pub(crate)")] + hint: GitUrlParseHint, +} + +/// Build the printable GitUrl from its components +impl fmt::Display for GitUrl<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let git_url_str = self.display(); + + write!(f, "{git_url_str}",) + } +} + +impl<'url> GitUrl<'url> { + /// Wrapper function for the default output mode via [`Display`](std::fmt::Display) trait + fn display(&self) -> String { + self.build_string(false) + } + + /// Wrapper function for printing a url for the [`url`](https://docs.rs/url/latest/url/) crate + #[cfg(feature = "url")] + fn url_compat_display(&self) -> String { + self.build_string(true) + } + + /// This method rebuilds the printable GitUrl from its components. 
+ /// `url_compat` results in output that can be parsed by the `url` crate + fn build_string(&self, url_compat: bool) -> String { + // Print `scheme://` only when requested (`print_scheme`) or needed for `url` crate compatibility + let scheme = match self.scheme() { + Some(scheme) if self.print_scheme() || url_compat => format!("{scheme}://"), + _ => String::new(), + }; + + let auth_info = match (self.user(), self.password()) { + (Some(user), Some(password)) => format!("{user}:{password}@"), + (Some(user), None) => format!("{user}@",), + (None, Some(password)) => format!("{password}@"), + (None, None) => String::new(), + }; + + let host = self.host().unwrap_or_default(); + + let (port, path) = match (self.hint(), self.port(), self.path()) { + (GitUrlParseHint::Httplike, Some(port), path) => { + // Only sshlike paths have their leading separator stripped by `parse()`, + // so avoid printing `host:port//path` when the path already starts with '/' + let separator = if path.starts_with('/') { "" } else { "/" }; + (format!(":{port}"), format!("{separator}{path}")) + } + (GitUrlParseHint::Httplike, None, path) => (String::new(), path.to_string()), + (GitUrlParseHint::Sshlike, Some(port), path) => { + (format!(":{port}"), format!("/{path}")) + } + (GitUrlParseHint::Sshlike, None, path) => { + // Without a port the host and path are joined by ':', or by '/' for the `url` crate + let separator = if url_compat { '/' } else { ':' }; + (String::new(), format!("{separator}{path}")) + } + (GitUrlParseHint::Filelike, None, path) => (String::new(), path.to_string()), + _ => (String::new(), String::new()), + }; + + format!("{scheme}{auth_info}{host}{port}{path}") + } +} + +impl<'url> GitUrl<'url> { + /// Returns `GitUrl` after removing all user info values + pub fn trim_auth(&self) -> GitUrl { + let mut new_giturl = self.clone(); + new_giturl.set_user(None); + new_giturl.set_password(None); + #[cfg(feature = "log")] + debug!("{new_giturl:?}"); + new_giturl + } + + /// Returns a `Result` after parsing `input` for metadata + /// + /// ``` + /// # use git_url_parse::GitUrl; + /// # use git_url_parse::types::provider::GenericProvider; + /// # fn main() -> Result<(), git_url_parse::GitUrlParseError> { + /// let http_url = 
GitUrl::parse("https://github.com/tjtelan/git-url-parse-rs.git")?; + /// let ssh_url = GitUrl::parse("git@github.com:tjtelan/git-url-parse-rs.git")?; + /// # Ok(()) + /// # } + /// ``` + pub fn parse(input: &'url str) -> Result { + // Error if there are null bytes within the url + // https://github.com/tjtelan/git-url-parse-rs/issues/16 + if input.contains('\0') { + return Err(GitUrlParseError::FoundNullBytes); + } + + let (_input, url_spec_parser) = UrlSpecParser::parse(input).finish().unwrap_or_default(); + + let mut scheme = url_spec_parser.scheme(); + let user = url_spec_parser.hier_part().authority().userinfo().user(); + let password = url_spec_parser.hier_part().authority().userinfo().token(); + let host = url_spec_parser.hier_part().authority().host(); + let port = url_spec_parser.hier_part().authority().port(); + let mut path = url_spec_parser.hier_part().path(); + + // We will respect whether scheme was initially set + let print_scheme = scheme.is_some(); + + // Take a moment to identify the type of url we have + // We use the GitUrlParseHint to validate or adjust formatting path, if necessary + let hint = if let Some(scheme) = scheme { + if scheme.contains("ssh") { + GitUrlParseHint::Sshlike + } else { + match scheme.to_lowercase().as_str() { + "file" => GitUrlParseHint::Filelike, + _ => GitUrlParseHint::Httplike, + } + } + } else if user.is_none() + && password.is_none() + && host.is_none() + && port.is_none() + && !path.is_empty() + { + // if we only have a path => file + GitUrlParseHint::Filelike + } else if user.is_some() && password.is_some() { + // If we have a user and password => http + GitUrlParseHint::Httplike + } else if path.starts_with(':') { + // If path starts with a colon => ssh + GitUrlParseHint::Sshlike + } else { + GitUrlParseHint::Unknown + }; + + // If we found an ssh url, we should adjust the path. 
+ // Skip the first character + // Done through `chars()` rather than `&path[1..]`, which panics when the path is empty + // (presumably the case when nothing parseable follows the scheme, e.g. `ssh://`) + if hint == GitUrlParseHint::Sshlike { + scheme = Some("ssh"); + let mut path_chars = path.chars(); + path_chars.next(); + path = path_chars.as_str(); + } + + if hint == GitUrlParseHint::Filelike { + scheme = Some("file"); + } + + let git_url = GitUrl { + scheme, + user, + password, + host, + port, + path, + print_scheme, + hint, + }; + + git_url.is_valid()?; + + Ok(git_url) + } + + /// Extracts provider-specific info from this url by calling `T::from_git_url` + /// + /// ``` + /// use git_url_parse::GitUrl; + /// use git_url_parse::types::provider::GenericProvider; + /// + /// # fn main() -> Result<(), git_url_parse::GitUrlParseError> { + /// let ssh_url = GitUrl::parse("git@github.com:tjtelan/git-url-parse-rs.git")?; + /// let provider : GenericProvider = ssh_url.provider_info()?; + /// # assert_eq!(provider.owner(), "tjtelan"); + /// # assert_eq!(provider.repo(), "git-url-parse-rs"); + /// + /// # Ok(()) + /// # } + /// ``` + pub fn provider_info<T>(&self) -> Result<T, GitUrlParseError> + where + T: provider::GitProvider<GitUrl<'url>, GitUrlParseError>, + { + T::from_git_url(self) + } + + /// This is called as the last step before returning a `GitUrl` to the user + fn is_valid(&self) -> Result<(), GitUrlParseError> { + // Last chance validation + + #[cfg(feature = "log")] + debug!("Validating parsing results {self:#?}"); + + if self.path().is_empty() { + return Err(GitUrlParseError::InvalidPathEmpty); + } + + // There's an edge case we don't properly cover: ssh urls using ports + absolute paths + // https://mslinn.com/git/040-git-urls.html - describes this pattern, if we decide to parse for it + + // only ssh paths start with ':' + if self.hint() != GitUrlParseHint::Sshlike && self.path.starts_with(':') { + #[cfg(feature = "log")] + { + debug!("{:?}", self.hint()); + debug!("{:?}", self.path()); + debug!("Only sshlike url path starts with ':'"); + debug!("path starts with ':'? 
{}", self.path.starts_with(':')); + } + + return Err(GitUrlParseError::InvalidPortNumber); + } + + // if we are not httplike, we shouldn't have passwords + if self.hint() != GitUrlParseHint::Httplike && self.password().is_some() { + #[cfg(feature = "log")] + { + debug!("{:?}", self.hint()); + debug!( + "password support only for httplike url: {:?}", + self.password() + ); + } + return Err(GitUrlParseError::InvalidPasswordUnsupported); + } + + // if we are filelike, we should only have paths + if self.hint() == GitUrlParseHint::Filelike + && (self.user().is_some() + || self.password().is_some() + || self.host().is_some() + || self.port().is_some() + || self.path().is_empty()) + { + #[cfg(feature = "log")] + { + debug!( + "Only scheme and path expected to have values set for filelike urls {:?}", + self + ); + } + return Err(GitUrlParseError::InvalidFilePattern); + } + + #[cfg(feature = "url")] + { + // Since we don't fully implement any spec, we'll rely on the url crate + // Build the string once; a library must not print to stdout, so only log it + let url_compat = self.url_compat_display(); + #[cfg(feature = "log")] + debug!("Validating with url crate: {url_compat:?}"); + let _u = url::Url::parse(&url_compat)?; + } + + Ok(()) + } +} diff --git a/src/types/provider/mod.rs b/src/types/provider/mod.rs new file mode 100644 index 0000000..3176556 --- /dev/null +++ b/src/types/provider/mod.rs @@ -0,0 +1,318 @@ +//! # Git URL Providers +//! +//! Provides extraction of Git host service info from `GitUrl`s. +//! +//! ## Supported Providers +//! +//! - [Generic Git repositories](crate::types::provider::GenericProvider) +//! - [Azure DevOps](crate::types::provider::AzureDevOpsProvider) +//! - [GitLab](crate::types::provider::GitLabProvider) +//! 
- Custom (via [`GitProvider`] trait) + +use crate::types::GitUrlParseHint; +use crate::{GitUrl, GitUrlParseError}; + +use getset::{CloneGetters, CopyGetters}; +use nom::Parser; +use nom::bytes::complete::{is_not, tag, take_until}; +use nom::combinator::opt; +use nom::sequence::{preceded, separated_pair, terminated}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +/// Secondary parser called by [`GitUrl::provider_info()`] to extract Git host provider info from url +/// +/// ``` +/// // Custom trait example +/// +/// use git_url_parse::{GitUrl, GitUrlParseError}; +/// use git_url_parse::types::provider::GitProvider; +/// +/// #[derive(Debug, Clone, PartialEq, Eq)] +/// struct MyCustomProvider; +/// +/// impl GitProvider, GitUrlParseError> for MyCustomProvider { +/// fn from_git_url(_url: &GitUrl) -> Result { +/// // Do your custom parsing here with your GitUrl +/// Ok(Self) +/// } +/// } +/// +/// let test_url = "git@github.com:tjtelan/git-url-parse-rs.git"; +/// let parsed = GitUrl::parse(test_url).expect("URL parse failed"); +/// +/// // Provide your custom type to `GitUrl::provider_info()` +/// let provider_info: MyCustomProvider = parsed.provider_info().unwrap(); +/// let expected = MyCustomProvider; +/// assert_eq!(provider_info, expected) +/// ``` +pub trait GitProvider: Clone + std::fmt::Debug { + /// Trait method called by `GitUrl::provider_info()` + /// + /// Logic for extracting service level information from a `GitUrl` + fn from_git_url(url: &T) -> Result; +} + +/// Represents a generic Git repository provider +/// +/// ## Typical Use Cases +/// +/// - Common service hosting with `owner/repo` patterns (e.g. GitHub, Bitbucket) +/// - Self-hosted repositories (e.g. 
Codeberg, Gitea) +/// +/// Example: +/// +/// ``` +/// use git_url_parse::{GitUrl, GitUrlParseError}; +/// use git_url_parse::types::provider::GenericProvider; +/// +/// let test_url = "git@github.com:tjtelan/git-url-parse-rs.git"; +/// let parsed = GitUrl::parse(test_url).expect("URL parse failed"); +/// +/// let provider_info: GenericProvider = parsed.provider_info().unwrap(); +/// +/// assert_eq!(provider_info.owner(), "tjtelan"); +/// assert_eq!(provider_info.repo(), "git-url-parse-rs"); +/// assert_eq!(provider_info.fullname(), "tjtelan/git-url-parse-rs"); +/// ``` +/// +#[derive(Debug, PartialEq, Eq, Clone, CopyGetters)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[getset(get_copy = "pub")] +pub struct GenericProvider<'a> { + /// Repo owner + owner: &'a str, + /// Repo name + repo: &'a str, +} + +impl<'a> GenericProvider<'a> { + /// Parse the most common form of git url offered by git providers: `owner/repo` with an optional `.git` suffix + /// + /// Only a trailing `.git` is removed, so a repo name that itself contains `.git` + /// (e.g. `user.github.io.git`) is kept whole, and paths without the suffix are accepted. + /// Returns the unparsed remainder of `input` together with the provider info. + fn parse_path(input: &str) -> Result<(&str, GenericProvider), GitUrlParseError> { + // Optional leading slash + let path = input.strip_prefix('/').unwrap_or(input); + + // Owner is everything before the first '/'; a missing '/' leaves the repo empty + let (owner, rest) = path.split_once('/').unwrap_or((path, "")); + + // Ignore trailing slashes when looking for the optional `.git` suffix + let trimmed = rest.trim_end_matches('/'); + let repo = trimmed.strip_suffix(".git").unwrap_or(trimmed); + + if owner.is_empty() || repo.is_empty() { + return Err(GitUrlParseError::ProviderParseFail( + "Path needs at least 2 parts: ex. '/owner/repo'".into(), + )); + } + + // `repo` is a prefix of `rest`, so slicing at its length lands on a char boundary + let remaining = &rest[repo.len()..]; + Ok((remaining, GenericProvider { owner, repo })) + } + + /// Helper method to get the full name of a repo: `{owner}/{repo}` + pub fn fullname(&self) -> String { + format!("{}/{}", self.owner, self.repo) + } +} + +impl<'a> GitProvider<GitUrl<'a>, GitUrlParseError> for GenericProvider<'a> { + fn from_git_url(url: &GitUrl<'a>) -> Result<Self, GitUrlParseError> { + if url.hint() == GitUrlParseHint::Filelike { + return Err(GitUrlParseError::ProviderUnsupported); + } + + let path = url.path(); + Self::parse_path(path).map(|(_, provider)| provider) + } +} + +/// Azure DevOps repository provider +/// ## Supported URL Formats +/// +/// - `https://dev.azure.com/org/project/_git/repo` +/// - `git@ssh.dev.azure.com:v3/org/project/repo` +/// +/// Example: +/// +/// ``` +/// use git_url_parse::{GitUrl, GitUrlParseError}; +/// use 
git_url_parse::types::provider::AzureDevOpsProvider; +/// +/// let test_url = "https://CompanyName@dev.azure.com/CompanyName/ProjectName/_git/RepoName"; +/// let parsed = GitUrl::parse(test_url).expect("URL parse failed"); +/// +/// let provider_info: AzureDevOpsProvider = parsed.provider_info().unwrap(); +/// +/// assert_eq!(provider_info.org(), "CompanyName"); +/// assert_eq!(provider_info.project(), "ProjectName"); +/// assert_eq!(provider_info.repo(), "RepoName"); +/// assert_eq!(provider_info.fullname(), "CompanyName/ProjectName/RepoName"); +/// ``` +/// +#[derive(Debug, PartialEq, Eq, Clone, CopyGetters)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[getset(get_copy = "pub")] +pub struct AzureDevOpsProvider<'a> { + /// Azure Devops organization name + org: &'a str, + /// Azure Devops project name + project: &'a str, + /// Azure Devops repo name + repo: &'a str, +} + +impl<'a> AzureDevOpsProvider<'a> { + /// Helper method to get the full name of a repo: `{org}/{project}/{repo}` + pub fn fullname(&self) -> String { + format!("{}/{}/{}", self.org, self.project, self.repo) + } + + /// Parse the path of a http url for Azure Devops patterns + fn parse_http_path(input: &str) -> Result<(&str, AzureDevOpsProvider), GitUrlParseError> { + // Handle optional leading / + let (input, _) = opt(tag("/")).parse(input)?; + + // Parse org/project/repo + let (input, (org, (project, repo))) = separated_pair( + is_not("/"), + tag("/"), + separated_pair( + is_not("/"), + tag("/"), + preceded(opt(tag("_git/")), is_not("")), + ), + ) + .parse(input)?; + + Ok((input, AzureDevOpsProvider { org, project, repo })) + } + + /// Parse the path of an ssh url for Azure Devops patterns + fn parse_ssh_path(input: &str) -> Result<(&str, AzureDevOpsProvider), GitUrlParseError> { + // Handle optional leading v3/ or other prefix + let (input, _) = opt(take_until("/")).parse(input)?; + let (input, _) = opt(tag("/")).parse(input)?; + + // Parse org/project/repo + let (input, 
(org, (project, repo))) = separated_pair( + is_not("/"), + tag("/"), + separated_pair( + is_not("/"), + tag("/"), + terminated(is_not("."), opt(tag(".git"))), + ), + ) + .parse(input)?; + + Ok((input, AzureDevOpsProvider { org, project, repo })) + } +} + +impl<'a> GitProvider, GitUrlParseError> for AzureDevOpsProvider<'a> { + fn from_git_url(url: &GitUrl<'a>) -> Result { + let path = url.path(); + + let parsed = if url.hint() == GitUrlParseHint::Httplike { + Self::parse_http_path(path) + } else { + Self::parse_ssh_path(path) + }; + + parsed.map(|(_, provider)| provider) + } +} + +/// ## GitLab repository provider +/// +/// ## Supported URL Formats +/// +/// - `https://gitlab.com/owner/repo.git` +/// - `https://gitlab.com/owner/subgroup1/subgroup2/repo.git` +/// - `git@gitlab.com:owner/repo.git` +/// - `git@gitlab.com:owner/subgroup1/subgroup2/repo.git` +/// +/// ## Examples +/// +/// ``` +/// use git_url_parse::GitUrl; +/// use git_url_parse::types::provider::GitLabProvider; +/// +/// fn main() -> Result<(), git_url_parse::GitUrlParseError> { +/// // Top-level repository +/// let url1 = GitUrl::parse("https://gitlab.com/gitlab-org/gitlab.git")?; +/// let provider1 : GitLabProvider = url1.provider_info()?; +/// assert_eq!(provider1.owner(), "gitlab-org"); +/// assert_eq!(provider1.repo(), "gitlab"); +/// assert_eq!(provider1.subgroup(), None); +/// assert_eq!(provider1.fullname(), "gitlab-org/gitlab"); +/// +/// // Repository with subgroups +/// let url2 = GitUrl::parse("https://gitlab.com/owner/group1/group2/project.git")?; +/// let provider2 : GitLabProvider = url2.provider_info()?; +/// assert_eq!(provider2.owner(), "owner"); +/// assert_eq!(provider2.repo(), "project"); +/// assert_eq!(provider2.subgroup(), Some(vec!["group1", "group2"])); +/// assert_eq!(provider2.fullname(), "owner/group1/group2/project"); +/// +/// Ok(()) +/// } +/// ``` +/// +#[derive(Clone, Debug, PartialEq, Eq, Default, CopyGetters, CloneGetters)] +#[cfg_attr(feature = "serde", 
derive(Serialize, Deserialize))] +pub struct GitLabProvider<'a> { + /// Repo owner + #[getset(get_copy = "pub")] + owner: &'a str, + /// Gitlab subgroups + #[getset(get_clone = "pub")] + subgroup: Option>, + /// Repo name + #[getset(get_copy = "pub")] + repo: &'a str, +} + +impl<'a> GitLabProvider<'a> { + /// Helper method to get the full name of a repo: `{owner}/{repo}` or `{owner}/{subgroups}/{repo}` + pub fn fullname(&self) -> String { + if let Some(subgroup) = self.subgroup() { + let subgroup_str = subgroup.join("/"); + + format!("{}/{subgroup_str}/{}", self.owner, self.repo) + } else { + format!("{}/{}", self.owner, self.repo) + } + } + + /// Parse the path of url for GitLab patterns + fn parse_path(input: &str) -> Result<(&str, GitLabProvider), GitUrlParseError> { + // Optional leading slash + let (input, _) = opt(tag("/")).parse(input)?; + + // Remove .git extension if present + let input = input.trim_end_matches(".git"); + + // Split the path + let parts: Vec<&str> = input.split('/').filter(|s| !s.is_empty()).collect(); + + // Ensure we have at least 2 parts (owner and repo) + if parts.len() < 2 { + return Err(GitUrlParseError::ProviderParseFail( + "Path needs at least 2 parts: ex. \'/owner/repo\'".into(), + )); + } + + // Last part is the repo + let repo = parts[parts.len() - 1]; + + // Everything before the last part is the owner/subgroups + let (owner, subgroup) = if parts.len() > 2 { + (parts[0], Some(parts[1..parts.len() - 1].to_vec())) + } else { + (parts[0], None) + }; + + Ok(( + input, + GitLabProvider { + owner, + subgroup, + repo, + }, + )) + } +} + +impl<'a> GitProvider, GitUrlParseError> for GitLabProvider<'a> { + fn from_git_url(url: &GitUrl<'a>) -> Result { + let path = url.path(); + Self::parse_path(path).map(|(_, provider)| provider) + } +} diff --git a/src/types/spec.rs b/src/types/spec.rs new file mode 100644 index 0000000..02a85f2 --- /dev/null +++ b/src/types/spec.rs @@ -0,0 +1,538 @@ +//! # GitUrl url spec parser +//! +//! 
Internal structs with RFC 3968 parsing logic for Git urls +//! + +use getset::CopyGetters; +#[cfg(feature = "log")] +use log::debug; +use nom::Finish; +use nom::branch::alt; +use nom::bytes::complete::{tag, take_while}; +use nom::character::complete::alpha1; +use nom::combinator::{map_opt, peek, recognize, verify}; +use nom::error::context; +use nom::multi::{many0, many1}; +use nom::sequence::{pair, preceded, separated_pair, terminated}; +use nom::{IResult, Parser, combinator::opt}; + +/// Top-level struct for RFC 3986 spec parser +#[derive(Debug, Default, Clone, Copy, CopyGetters)] +#[getset(get_copy = "pub")] +pub(crate) struct UrlSpecParser<'url> { + /// RFC 3986 scheme + pub(crate) scheme: Option<&'url str>, + /// RFC 3986 hier-part + pub(crate) hier_part: UrlHierPart<'url>, +} + +impl<'url> UrlSpecParser<'url> { + /// https://datatracker.ietf.org/doc/html/rfc3986 + /// Based on rfc3986, but does not strictly cover the spec + /// * No support for: + /// * query, fragment, percent-encoding, and much of the edges for path support + /// * many forms of ip representations like ipv6, hexdigits + /// * Added support for: + /// * parsing ssh git urls which use ":" as a delimiter between the authority and path + /// * parsing userinfo into user:token (but its officially deprecated, per #section-3.2.1) + /// * some limited support for windows/linux filepaths + pub(crate) fn parse(input: &'url str) -> IResult<&'url str, Self> { + let (input, scheme) = Self::parse_scheme.parse(input).finish().unwrap_or_default(); + let (input, heir_part) = Self::parse_hier_part(input).finish().unwrap_or_default(); + + let parsed = UrlSpecParser { + scheme, + hier_part: heir_part, + }; + + Ok((input, parsed)) + } + + /// RFC 3986 scheme + fn parse_scheme(input: &'url str) -> IResult<&'url str, Option<&'url str>> { + #[cfg(feature = "log")] + { + debug!("Looking ahead before parsing for scheme"); + } + + let mut check = context( + "scheme validate", + peek(pair( + pair( + alpha1, + 
take_while(|c: char| { + c.is_ascii_alphabetic() + || c.is_ascii_digit() + || c == '+' + || c == '-' + || c == '.' + }), + ), + tag::<&str, &str, nom::error::Error<&str>>("://"), + )), + ); + + // Check if we have scheme 'git:' without the '//' for normalizing to 'git://' + if Self::short_git_scheme_check(input) { + // return early if we are normalizing 'git:' (short git) + if let Ok((input, scheme)) = Self::short_git_scheme_parser().parse(input) { + return Ok((input, scheme)); + } + } + + if check.parse(input).is_err() { + #[cfg(feature = "log")] + { + debug!("Look ahead check for scheme failed"); + } + return Ok((input, None)); + } + + #[cfg(feature = "log")] + { + debug!("Look ahead check passed, parsing for scheme"); + } + + // Must start with alpha character, then alpha/digit/+/-/. + let (input, scheme) = context( + "Scheme parse", + opt(verify( + terminated( + recognize(pair( + alpha1, + take_while(|c: char| { + c.is_ascii_alphabetic() + || c.is_ascii_digit() + || c == '+' + || c == '-' + || c == '.' + }), + )), + // Not part of spec. 
We consume the "://" here to more easily manage scheme to be optional + tag("://"), + ), + |s: &str| !s.is_empty(), + )), + ) + .parse(input)?; + + #[cfg(feature = "log")] + { + debug!("{input:?}"); + debug!("{scheme:?}"); + } + + Ok((input, scheme)) + } + + /// RFC 3986 hier-part + // https://datatracker.ietf.org/doc/html/rfc3986#section-3.2 + // The rfc says parsing the "//" part of the uri belongs to the hier-part parsing + // but we only support common internet protocols, file paths, but not other "baseless" ones + // so it is sensible for this move it with scheme parsing to support git user service urls + fn parse_hier_part(input: &'url str) -> IResult<&'url str, UrlHierPart<'url>> { + #[cfg(feature = "log")] + { + debug!("Parsing for heir-part"); + } + + let (input, authority) = Self::parse_authority(input)?; + + let (input, path) = context( + "Top of path parsers", + verify( + alt(( + //preceded(tag("//"), Self::path_abempty_parser()), + Self::path_abempty_parser(), + Self::path_rootless_parser(), + Self::path_ssh_parser(), + )), + |s: &str| !s.is_empty(), + ), + ) + .parse(input)?; + + let hier_part = UrlHierPart { authority, path }; + + #[cfg(feature = "log")] + { + debug!("{:?}", input); + debug!("{:?}", hier_part); + } + + Ok((input, hier_part)) + } + + /// RFC 3986 authority + fn parse_authority(input: &'url str) -> IResult<&'url str, UrlAuthority<'url>> { + #[cfg(feature = "log")] + { + debug!("Parsing for Authority"); + } + + // Optional: username / token + let (input, userinfo) = Self::parse_userinfo(input)?; + + // Host + #[cfg(feature = "log")] + { + debug!("Looking ahead for windows-style path vs host"); + } + + // peek ahead to check for windows path stuff + let check = context( + "Host check for windows path", + peek(preceded( + take_while(|c| reg_name_uri_chars(c) && c != '\\'), + tag::<&str, &str, nom::error::Error<&str>>(":\\"), + )), + ) + .parse(input); + + if check.is_ok() { + #[cfg(feature = "log")] + { + debug!( + "Host check failed. 
Found potential windows-style path while looking for host" + ); + } + + return Ok((input, UrlAuthority::default())); + } + + #[cfg(feature = "log")] + { + debug!("Parsing for host"); + } + + let (input, host) = context( + "Host parser", + opt(verify( + recognize(take_while(|c: char| reg_name_uri_chars(c))), + |s: &str| { + let has_alphanum = s.chars().any(char::is_alphanumeric); + let starts_with_alphanum = s.chars().next().is_some_and(char::is_alphanumeric); + + has_alphanum && starts_with_alphanum && !s.is_empty() + }, + )), + ) + .parse(input)?; + + #[cfg(feature = "log")] + { + debug!("host found: {host:?}"); + } + + // Optional: port + let (input, port) = Self::parse_port(input)?; + + let authority = UrlAuthority { + userinfo, + host, + port, + }; + + #[cfg(feature = "log")] + { + debug!("{input:?}"); + debug!("{authority:?}"); + } + + Ok((input, authority)) + } + + /// RFC 3986 userinfo + fn parse_userinfo(authority_input: &'url str) -> IResult<&'url str, UrlUserInfo<'url>> { + // Peek for username@ + #[cfg(feature = "log")] + { + debug!("Checking for for Userinfo"); + } + + let mut check = context( + "Userinfo validation", + peek(pair( + take_while(|c: char| unreserved_uri_chars(c) || subdelims_uri_chars(c) || c == ':'), + tag::<&str, &str, nom::error::Error<&str>>("@"), + )), + ); + + if check.parse(authority_input).is_err() { + #[cfg(feature = "log")] + { + debug!("Userinfo check failed"); + } + return Ok((authority_input, UrlUserInfo::default())); + } + + // Userinfo + let (authority_input, userinfo) = context( + "Userinfo parser", + opt(verify( + recognize(take_while(|c: char| { + unreserved_uri_chars(c) || subdelims_uri_chars(c) || c == ':' + })), + |s: &str| !s.is_empty(), + )), + ) + .parse(authority_input)?; + + let (authority_input, _) = if userinfo.is_some() { + #[cfg(feature = "log")] + { + debug!("Userinfo found. Parsing for '@'"); + } + + context("Userinfo '@' parser", tag("@")).parse(authority_input)? 
+ } else { + // No change to input, but let the compiler be happy + (authority_input, authority_input) + }; + + // Break down userinfo into user and token + let (user, token) = if let Some(userinfo) = userinfo { + if userinfo.contains(":") { + #[cfg(feature = "log")] + { + debug!("Continue break down userinfo into user:token"); + } + let (_, (user, token)) = context( + "Userinfo with colon parser", + separated_pair( + verify( + take_while(|c: char| unreserved_uri_chars(c) || subdelims_uri_chars(c)), + |s: &str| !s.is_empty(), + ), + tag(":"), + verify( + take_while(|c: char| unreserved_uri_chars(c) || subdelims_uri_chars(c)), + |s: &str| !s.is_empty(), + ), + ), + ) + .parse(userinfo)?; + (Some(user), Some(token)) + } else { + (Some(userinfo), None) + } + } else { + (None, None) + }; + + let userinfo = UrlUserInfo { user, token }; + + #[cfg(feature = "log")] + { + debug!("{authority_input:?}"); + debug!("{userinfo:?}"); + } + + Ok((authority_input, userinfo)) + } + + /// RFC 3986 port + fn parse_port(authority_input: &'url str) -> IResult<&'url str, Option> { + #[cfg(feature = "log")] + { + debug!("Parsing port"); + } + + // We need to pull the full value of what's in the segment THEN parse for numbers + let (input, port) = context( + "Port parser", + opt(map_opt( + verify( + preceded( + tag(":"), + take_while(|c: char| unreserved_uri_chars(c) || subdelims_uri_chars(c)), + ), + |p_str: &str| !p_str.is_empty(), + ), + |s: &str| s.parse::().ok(), + )), + ) + .parse(authority_input)?; + + #[cfg(feature = "log")] + { + debug!("{authority_input:?}"); + debug!("{port:?}"); + } + + Ok((input, port)) + } + + /// RFC 3986 path-abempty + fn path_abempty_parser( + ) -> impl Parser< + &'url str, + Output = > as Parser< + &'url str, + >>::Output, + Error = nom::error::Error<&'url str>, + >{ + #[cfg(feature = "log")] + { + debug!("parsing abempty path"); + } + + // Starts with '/' or empty + context( + "Path parser (abempty)", + recognize(many1(pair( + tag("/"), + take_while(|c: 
char| pchar_uri_chars(c)), + ))), + ) + } + + /// Not part of RFC 3986 - ssh-based url path + fn path_ssh_parser( + ) -> impl Parser< + &'url str, + Output = > as Parser< + &'url str, + >>::Output, + Error = nom::error::Error<&'url str>, + >{ + #[cfg(feature = "log")] + { + debug!("Parsing ssh path"); + } + + context( + "Path parser (ssh)", + recognize(( + tag(":"), + take_while(|c: char| pchar_uri_chars(c)), + many1(pair(tag("/"), take_while(|c: char| pchar_uri_chars(c)))), + )), + ) + } + + /// RFC 3986 path-rootless + fn path_rootless_parser( + ) -> impl Parser< + &'url str, + Output = > as Parser< + &'url str, + >>::Output, + Error = nom::error::Error<&'url str>, + >{ + #[cfg(feature = "log")] + { + debug!("Parsing rootless path"); + } + + context( + "Path parser (rootless)", + recognize(pair( + take_while(|c: char| pchar_uri_chars(c)), + many0(pair(tag("/"), take_while(|c: char| pchar_uri_chars(c)))), + )), + ) + } + + /// consuming parser for `git:` (short git) as scheme for normalizing + fn short_git_scheme_parser() -> impl Parser< + &'url str, + Output = , + Error = nom::error::Error<&'url str>, + > as Parser<&'url str>>::Output, + Error = nom::error::Error<&'url str>, + > { + #[cfg(feature = "log")] + { + debug!("Parsing short git scheme"); + } + + context( + "short git scheme parse", + opt(terminated( + tag::<&str, &str, nom::error::Error<&str>>("git"), + tag::<&str, &str, nom::error::Error<&str>>(":"), + )), + ) + } + + /// Non-consuming check for `git:` (short git) as scheme for normalizing + fn short_git_scheme_check(input: &'url str) -> bool { + context( + "short git validate", + peek(terminated( + tag::<&str, &str, nom::error::Error<&str>>("git"), + tag::<&str, &str, nom::error::Error<&str>>(":"), + )), + ) + .parse(input) + .is_ok() + } +} + +/// RFC 3986 userinfo +#[derive(Debug, Default, Clone, Copy, CopyGetters)] +#[getset(get_copy = "pub")] +pub(crate) struct UrlUserInfo<'url> { + /// RFC 3986 Userinfo + pub(crate) user: Option<&'url str>, + /// 
Non-spec, deprecated + pub(crate) token: Option<&'url str>, +} + +/// RFC 3986 authority +#[derive(Debug, Default, Clone, Copy, CopyGetters)] +#[getset(get_copy = "pub")] +pub(crate) struct UrlAuthority<'url> { + /// RFC 3986 Username, non-spec token + pub(crate) userinfo: UrlUserInfo<'url>, + /// RFC 3986 Host + pub(crate) host: Option<&'url str>, + /// RFC 3986 Port + pub(crate) port: Option, +} + +/// RFC 3986 hier-part +#[derive(Debug, Default, Clone, Copy, CopyGetters)] +#[getset(get_copy = "pub")] +pub(crate) struct UrlHierPart<'url> { + /// RFC 3986 authority + pub(crate) authority: UrlAuthority<'url>, + /// RFC 3986 relative-part + pub(crate) path: &'url str, +} + +/// RFC 3986 pchar +pub(crate) fn pchar_uri_chars(c: char) -> bool { + // unreserved / pct-encoded (not implemented) / sub-delims / ":" / "@" + unreserved_uri_chars(c) || subdelims_uri_chars(c) || c == ':' || c == '@' +} + +/// RFC 3986 reg-name +pub(crate) fn reg_name_uri_chars(c: char) -> bool { + // *( unreserved / pct-encoded (not implemented) / sub-delims ) + unreserved_uri_chars(c) || subdelims_uri_chars(c) +} + +/// RFC 3986 unreserved +pub(crate) fn unreserved_uri_chars(c: char) -> bool { + c.is_alphanumeric() || c == '-' || c == '.' || c == '_' || c == '~' +} + +/// RFC 3986 sub-delims (mostly) +pub(crate) fn subdelims_uri_chars(c: char) -> bool { + c == '!' 
+ || c == '$' + || c == '&' + || c == '\'' + || c == '(' + || c == ')' + || c == '*' + || c == '+' + || c == ',' + || c == ';' + || c == '=' + || c == '\\' // This is not part of spec, but used for windows paths +} diff --git a/tests/mod.rs b/tests/mod.rs index b693062..db47ff5 100644 --- a/tests/mod.rs +++ b/tests/mod.rs @@ -1,3 +1,3 @@ -mod normalize; mod parse; +mod provider; mod trim_auth; diff --git a/tests/normalize.rs b/tests/normalize.rs deleted file mode 100644 index ea1b174..0000000 --- a/tests/normalize.rs +++ /dev/null @@ -1,185 +0,0 @@ -use git_url_parse::*; - -// Url Normalization -#[test] -fn git() { - let test_url = "git://host.tld/user/project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - assert_eq!(normalized.as_str(), "git://host.tld/user/project-name.git"); -} - -// I'm not even sure if this is a form that should be supported bc I can't find examples of it being used in the wild by another service -//#[should_panic] -#[test] -fn git2() { - let test_url = "git:host.tld/user/project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - assert_eq!(normalized.as_str(), "git://host.tld/user/project-name.git"); -} - -#[test] -fn http() { - let test_url = "http://host.tld/user/project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - assert_eq!(normalized.as_str(), "http://host.tld/user/project-name.git"); -} - -#[test] -fn https() { - let test_url = "https://host.tld/user/project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - assert_eq!( - normalized.as_str(), - "https://host.tld/user/project-name.git" - ); -} - -#[test] -fn ssh_scheme() { - let test_url = "ssh://git@host.tld/user/project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - assert_eq!( - normalized.as_str(), - "ssh://git@host.tld/user/project-name.git" - ); -} - -#[test] -fn 
ssh_no_scheme() { - let test_url = "git@host.tld:user/project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - assert_eq!( - normalized.as_str(), - "ssh://git@host.tld/user/project-name.git" - ); -} - -#[test] -fn ssh_no_scheme_no_user() { - let test_url = "host.tld:user/project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - assert_eq!(normalized.as_str(), "ssh://host.tld/user/project-name.git"); -} - -#[test] -fn unix_file_scheme_abs_path() { - let test_url = "file:///user/project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - assert_eq!(normalized.as_str(), "file:///user/project-name.git"); -} - -#[test] -fn unix_file_no_scheme_abs_path() { - let test_url = "/user/project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - assert_eq!(normalized.as_str(), "file:///user/project-name.git"); -} - -#[test] -fn unix_file_scheme_rel_path() { - let test_url = "file://../user/project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - assert_eq!(normalized.as_str(), "file://../user/project-name.git"); -} - -#[test] -fn unix_file_no_scheme_rel_path() { - let test_url = "../user/project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - assert_eq!(normalized.as_str(), "file://../user/project-name.git"); -} - -#[should_panic] -#[test] -fn win_file_scheme_abs_path() { - let test_url = "file://c:\\user\\project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - // I actually don't know how this should be normalized. 
- assert_eq!(normalized.as_str(), "file://c:\\user\\project-name.git"); -} - -#[should_panic] -#[test] -fn win_file_no_scheme_abs_path() { - let test_url = "c:\\user\\project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - // I actually don't know how this should be normalized. - assert_eq!(normalized.as_str(), "file://c:\\user\\project-name.git"); -} - -#[test] -fn win_file_scheme_rel_path() { - let test_url = "file://..\\user\\project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - // I actually don't know how this should be normalized. - assert_eq!(normalized.as_str(), "file://../user/project-name.git"); -} - -#[test] -fn win_file_no_scheme_rel_path() { - let test_url = "..\\user\\project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - // I actually don't know how this should be normalized. - assert_eq!(normalized.as_str(), "file://../user/project-name.git"); -} -#[test] -fn multi_git_ssh() { - let test_url = "git+ssh://host.tld/user/project-name.git"; - let normalized = normalize_url(test_url).expect("Normalizing url failed"); - - assert_eq!( - normalized.as_str(), - "git+ssh://host.tld/user/project-name.git" - ); -} - -// From https://github.com/tjtelan/git-url-parse-rs/issues/16 -#[test] -fn null_in_input1() { - let test_url = "////////ws///////////*,\u{0}\u{0}^\u{0}\u{0}\u{0}\u{0}@2\u{1}\u{0}\u{1d})\u{0}\u{0}\u{0}:\u{0}\u{0}\u{0}"; - let normalized = normalize_url(test_url); - - assert!(normalized.is_err()); -} - -// From https://github.com/tjtelan/git-url-parse-rs/issues/16 -#[test] -fn null_in_input2() { - let test_url = "?\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{1f}s\u{3}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{5}\u{1}@\u{0}\u{0}\u{4}!e\u{0}\u{0}2\u{1c}^3106://?, GitUrlParseError> for TestProvider { + fn from_git_url(_url: &GitUrl) -> Result { + Ok(Self) + } + } + + let _ = env_logger::try_init(); + let test_url 
= "git@github.com:tjtelan/git-url-parse-rs.git"; + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + debug!("{:#?}", parsed); + + let provider_info: TestProvider = parsed.provider_info().unwrap(); + debug!("{:#?}", provider_info); + + let expected = TestProvider; + assert_eq!(provider_info, expected) +} + +#[test] +fn self_host() { + let _ = env_logger::try_init(); + let test_url = "http://git.example.com:3000/user/repo.git"; + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + debug!("{:#?}", parsed); + + let provider_info: GenericProvider = parsed.provider_info().unwrap(); + debug!("{:#?}", provider_info); + + let owner = "user"; + let repo = "repo"; + let full = format!("{owner}/{repo}"); + + assert_eq!(provider_info.owner(), owner); + assert_eq!(provider_info.repo(), repo); + assert_eq!(provider_info.fullname(), full); +} + +#[test] +fn http_azure_devops() { + let _ = env_logger::try_init(); + let test_url = "https://CompanyName@dev.azure.com/CompanyName/ProjectName/_git/RepoName"; + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + debug!("{:#?}", parsed); + + let provider_info: AzureDevOpsProvider = parsed.provider_info().unwrap(); + debug!("{:#?}", provider_info); + + let org = "CompanyName"; + let project = "ProjectName"; + let repo = "RepoName"; + let full = format!("{org}/{project}/{repo}"); + + assert_eq!(provider_info.org(), org); + assert_eq!(provider_info.project(), project); + assert_eq!(provider_info.repo(), repo); + assert_eq!(provider_info.fullname(), full); +} + +#[test] +fn ssh_azure_devops() { + let _ = env_logger::try_init(); + let test_url = "git@ssh.dev.azure.com:v3/CompanyName/ProjectName/RepoName.git"; + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + debug!("{:#?}", parsed); + + let provider_info: AzureDevOpsProvider = parsed.provider_info().unwrap(); + debug!("{:#?}", provider_info); + + let org = "CompanyName"; + let project = "ProjectName"; + let repo = "RepoName"; + 
let full = format!("{org}/{project}/{repo}"); + + assert_eq!(provider_info.org(), org); + assert_eq!(provider_info.project(), project); + assert_eq!(provider_info.repo(), repo); + assert_eq!(provider_info.fullname(), full); +} + +#[test] +fn http_gitlab() { + let _ = env_logger::try_init(); + let test_url = "https://gitlab.com/gitlab-org/gitlab.git"; + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + debug!("{:#?}", parsed); + + let provider_info: GitLabProvider = parsed.provider_info().unwrap(); + debug!("{:#?}", provider_info); + + let owner = "gitlab-org"; + let subgroup = None; + let repo = "gitlab"; + let full = format!("{owner}/{repo}"); + + assert_eq!(provider_info.owner(), owner); + assert_eq!(provider_info.subgroup(), subgroup); + assert_eq!(provider_info.repo(), repo); + assert_eq!(provider_info.fullname(), full); +} + +#[test] +fn ssh_gitlab() { + let _ = env_logger::try_init(); + let test_url = "git@gitlab.com:gitlab-org/gitlab.git"; + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + debug!("{:#?}", parsed); + + let provider_info: GitLabProvider = parsed.provider_info().unwrap(); + debug!("{:#?}", provider_info); + + let owner = "gitlab-org"; + let subgroup = None; + let repo = "gitlab"; + let full = format!("{owner}/{repo}"); + + assert_eq!(provider_info.owner(), owner); + assert_eq!(provider_info.subgroup(), subgroup); + assert_eq!(provider_info.repo(), repo); + assert_eq!(provider_info.fullname(), full); +} + +#[test] +fn http_gitlab_subgroups() { + let _ = env_logger::try_init(); + let test_url = "https://gitlab.com/gitlab-org/sbom/systems/gitlab-core.git"; + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + debug!("{:#?}", parsed); + + let provider_info: GitLabProvider = parsed.provider_info().unwrap(); + debug!("{:#?}", provider_info); + + let owner = "gitlab-org"; + let subgroup = Some(vec!["sbom", "systems"]); + let repo = "gitlab-core"; + let full = format!("{owner}/{}/{repo}", 
"sbom/systems"); + + assert_eq!(provider_info.owner(), owner); + assert_eq!(provider_info.subgroup(), subgroup); + assert_eq!(provider_info.repo(), repo); + assert_eq!(provider_info.fullname(), full); +} + +#[test] +fn ssh_gitlab_subgroups() { + let _ = env_logger::try_init(); + let test_url = "git@gitlab.com:gitlab-org/sbom/systems/gitlab-core.git"; + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + debug!("{:#?}", parsed); + + let provider_info: GitLabProvider = parsed.provider_info().unwrap(); + debug!("{:#?}", provider_info); + + let owner = "gitlab-org"; + let subgroup = Some(vec!["sbom", "systems"]); + let repo = "gitlab-core"; + let full = format!("{owner}/{}/{repo}", "sbom/systems"); + + assert_eq!(provider_info.owner(), owner); + assert_eq!(provider_info.subgroup(), subgroup); + assert_eq!(provider_info.repo(), repo); + assert_eq!(provider_info.fullname(), full); +} + +#[test] +fn filepath() { + let _ = env_logger::try_init(); + let test_url = "file:///home/user/Documents/"; + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + debug!("{:#?}", parsed); + + let provider_info: Result = parsed.provider_info(); + debug!("{:#?}", provider_info); + + assert!(provider_info.is_err()); + if let Err(e) = provider_info { + assert_eq!(e, GitUrlParseError::ProviderUnsupported) + } +} diff --git a/tests/trim_auth.rs b/tests/trim_auth.rs index 327db3e..5929b23 100644 --- a/tests/trim_auth.rs +++ b/tests/trim_auth.rs @@ -1,101 +1,129 @@ use git_url_parse::*; +use log::debug; #[test] fn ssh_user_ports() { + let _ = env_logger::try_init(); let test_url = "ssh://git@host.tld:9999/user/project-name.git"; - let parsed_and_trimmed = GitUrl::parse(test_url) - .expect("URL parse failed") - .trim_auth(); let expected = "ssh://host.tld:9999/user/project-name.git"; - assert_eq!(format!("{}", parsed_and_trimmed), expected); + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + let trimmed = parsed.trim_auth(); + debug!("{:#?}", parsed); 
+ debug!("{:#?}", trimmed); + + assert_eq!(trimmed.to_string(), expected); } // Specific service support #[test] fn https_user_bitbucket() { + let _ = env_logger::try_init(); let test_url = "https://user@bitbucket.org/user/repo.git"; - let parsed_and_trimmed = GitUrl::parse(test_url) - .expect("URL parse failed") - .trim_auth(); let expected = "https://bitbucket.org/user/repo.git"; - assert_eq!(format!("{}", parsed_and_trimmed), expected); + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + let trimmed = parsed.trim_auth(); + debug!("{:#?}", parsed); + debug!("{:#?}", trimmed); + + assert_eq!(trimmed.to_string(), expected); } #[test] fn ssh_user_bitbucket() { + let _ = env_logger::try_init(); let test_url = "git@bitbucket.org:user/repo.git"; - let parsed_and_trimmed = GitUrl::parse(test_url) - .expect("URL parse failed") - .trim_auth(); let expected = "bitbucket.org:user/repo.git"; - assert_eq!(format!("{}", parsed_and_trimmed), expected); + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + let trimmed = parsed.trim_auth(); + debug!("{:#?}", parsed); + debug!("{:#?}", trimmed); + + assert_eq!(trimmed.to_string(), expected); } #[test] fn https_user_auth_bitbucket() { + let _ = env_logger::try_init(); let test_url = "https://x-token-auth:token@bitbucket.org/owner/name.git/"; - let parsed_and_trimmed = GitUrl::parse(test_url) - .expect("URL parse failed") - .trim_auth(); - let expected = "https://bitbucket.org/owner/name.git"; + let expected = "https://bitbucket.org/owner/name.git/"; - assert_eq!(format!("{}", parsed_and_trimmed), expected); + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + let trimmed = parsed.trim_auth(); + debug!("{:#?}", parsed); + debug!("{:#?}", trimmed); + + assert_eq!(trimmed.to_string(), expected); } #[test] fn https_user_github() { + let _ = env_logger::try_init(); let test_url = "https://user@github.com/user/repo.git/"; - let parsed_and_trimmed = GitUrl::parse(test_url) - .expect("URL 
parse failed") - .trim_auth(); - let expected = "https://github.com/user/repo.git"; + let expected = "https://github.com/user/repo.git/"; + + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + let trimmed = parsed.trim_auth(); + debug!("{:#?}", parsed); + debug!("{:#?}", trimmed); - assert_eq!(format!("{}", parsed_and_trimmed), expected); + assert_eq!(trimmed.to_string(), expected); } #[test] fn ssh_user_github() { + let _ = env_logger::try_init(); let test_url = "git@github.com:user/repo.git"; - let parsed_and_trimmed = GitUrl::parse(test_url) - .expect("URL parse failed") - .trim_auth(); let expected = "github.com:user/repo.git"; - assert_eq!(format!("{}", parsed_and_trimmed), expected); + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + let trimmed = parsed.trim_auth(); + debug!("{:#?}", parsed); + debug!("{:#?}", trimmed); + + assert_eq!(trimmed.to_string(), expected); } #[test] fn https_user_auth_github() { + let _ = env_logger::try_init(); let test_url = "https://token:x-oauth-basic@github.com/owner/name.git/"; - let parsed_and_trimmed = GitUrl::parse(test_url) - .expect("URL parse failed") - .trim_auth(); - let expected = "https://github.com/owner/name.git"; + let expected = "https://github.com/owner/name.git/"; + + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + let trimmed = parsed.trim_auth(); + debug!("{:#?}", parsed); + debug!("{:#?}", trimmed); - assert_eq!(format!("{}", parsed_and_trimmed), expected); + assert_eq!(trimmed.to_string(), expected); } #[test] fn ssh_user_azure_devops() { + let _ = env_logger::try_init(); let test_url = "git@ssh.dev.azure.com:v3/CompanyName/ProjectName/RepoName"; - let parsed_and_trimmed = GitUrl::parse(test_url) - .expect("URL parse failed") - .trim_auth(); let expected = "ssh.dev.azure.com:v3/CompanyName/ProjectName/RepoName"; - assert_eq!(format!("{}", parsed_and_trimmed), expected); + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + let trimmed = 
parsed.trim_auth(); + debug!("{:#?}", parsed); + debug!("{:#?}", trimmed); + + assert_eq!(trimmed.to_string(), expected); } #[test] fn https_user_azure_devops() { + let _ = env_logger::try_init(); let test_url = "https://organization@dev.azure.com/organization/project/_git/repo"; - let parsed_and_trimmed = GitUrl::parse(test_url) - .expect("URL parse failed") - .trim_auth(); let expected = "https://dev.azure.com/organization/project/_git/repo"; - assert_eq!(format!("{}", parsed_and_trimmed), expected); + let parsed = GitUrl::parse(test_url).expect("URL parse failed"); + let trimmed = parsed.trim_auth(); + debug!("{:#?}", parsed); + debug!("{:#?}", trimmed); + + assert_eq!(trimmed.to_string(), expected); }