diff --git a/Dockerfile b/Dockerfile index f52ab021..0bca1014 100644 --- a/Dockerfile +++ b/Dockerfile @@ -733,7 +733,7 @@ RUN rm /license-scan/{clarify,deny}.toml FROM sdk-cargo as sdk-license-scan -ENV SPDXVER="3.19" +ENV SPDXVER="3.26.0" USER builder WORKDIR /home/builder/license-scan diff --git a/configs/cargo-deny/clarify.toml b/configs/cargo-deny/clarify.toml index f57860db..e9361889 100644 --- a/configs/cargo-deny/clarify.toml +++ b/configs/cargo-deny/clarify.toml @@ -1,3 +1,10 @@ +[spdx] +ignore-licenses = [ + # Apache-2.0 is often misclassified as Pixar, which is a significantly rarer license. See: + # https://github.com/jpeddicord/askalono/issues/94 + "Pixar" +] + [clarify.askalono] expression = "Apache-2.0" license-files = [ diff --git a/configs/cargo-make/clarify.toml b/configs/cargo-make/clarify.toml index 7efe34cb..8360ba98 100644 --- a/configs/cargo-make/clarify.toml +++ b/configs/cargo-make/clarify.toml @@ -1,3 +1,10 @@ +[spdx] +ignore-licenses = [ + # Apache-2.0 is often misclassified as Pixar, which is a significantly rarer license. See: + # https://github.com/jpeddicord/askalono/issues/94 + "Pixar" +] + [clarify.bstr] expression = "(MIT OR Apache-2.0) AND Unicode-DFS-2016" license-files = [ diff --git a/hashes/license-scan b/hashes/license-scan index 9a3f4d62..ff9115af 100644 --- a/hashes/license-scan +++ b/hashes/license-scan @@ -1,2 +1,2 @@ -# https://github.com/spdx/license-list-data/archive/v3.19.tar.gz#/license-list-data-3.19.tar.gz -SHA512 (license-list-data-3.19.tar.gz) = 23d90eece2f164a00ad710c84c3f3194bf54830b4c2b5c2739c4bf713c95ab161697850eecb20d1c3dfbdad24aa795a75bf11f9473982824fc9fe885962b7433 +# https://github.com/spdx/license-list-data/archive/v3.26.0.tar.gz#/license-list-data-3.26.0.tar.gz +SHA512 (license-list-data-3.26.0.tar.gz) = 67e618d4642dfe4f366935d9514d6f7941d3711e77c49bc1b340ac572d36015981b8f10c72a9f80f1d83e132ae7e5093c0155e0c0e786e4777ee16d1cb7b885a diff --git a/license-scan/Cargo.lock b/license-scan/Cargo.lock index cc6ef900..8d9ee624 
100644 --- a/license-scan/Cargo.lock +++ b/license-scan/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "adler2" @@ -54,6 +54,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + [[package]] name = "bottlerocket-license-scan" version = "0.1.0" @@ -68,6 +74,7 @@ dependencies = [ "serde", "spdx", "structopt", + "tempfile", "toml", "twox-hash", "walkdir", @@ -75,9 +82,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.11.1" +version = "1.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "786a307d683a5bf92e6fd5fd69a7eb613751668d1d8d67d802846dfe367c62c8" +checksum = "531a9155a481e2ee699d4f98f43c0ca4ff8ee1bfd55c31e9e98fb29d2b176fe0" dependencies = [ "memchr", "serde", @@ -123,9 +130,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.5" +version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31a0499c1dc64f458ad13872de75c0eb7e3fdb0e67964610c914b034fc5956e" +checksum = "a012a0df96dd6d06ba9a1b29d6402d1a5d77c6befd2566afdc26e10603dc93d7" dependencies = [ "jobserver", "libc", @@ -144,7 +151,7 @@ version = "2.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ - "bitflags", + "bitflags 1.3.2", "textwrap", "unicode-width", ] @@ -195,6 +202,22 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "errno" +version = "0.3.10" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + [[package]] name = "flate2" version = "1.0.35" @@ -297,6 +320,12 @@ version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + [[package]] name = "log" version = "0.4.22" @@ -333,6 +362,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + [[package]] name = "paste" version = "1.0.15" @@ -497,6 +532,19 @@ dependencies = [ "serde", ] +[[package]] +name = "rustix" +version = "0.38.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f93dc38ecbab2eb790ff964bb77fa94faf256fd3e73285fd7ba0903b76bedb85" +dependencies = [ + "bitflags 2.6.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "ryu" version = "1.0.18" @@ -523,22 +571,22 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.216" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.216" +version = "1.0.217" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.91", + "syn 2.0.94", ] [[package]] @@ -576,12 +624,10 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "spdx" -version = "0.3.6" +version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e6b6cc773b635ad64a05f00367c6f66d06a8708f7360f67c41d446dacdd0a0f" +checksum = "58b69356da67e2fc1f542c71ea7e654a361a79c938e4424392ecf4fa065d2193" dependencies = [ - "lazy_static", - "regex", "smallvec", ] @@ -628,15 +674,29 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.91" +version = "2.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d53cbcb5a243bd33b7858b1d7f4aca2153490815872d86d955d6ea29f743c035" +checksum = "987bc0be1cdea8b10216bd06e2ca407d40b9543468fafd3ddfb02f36e77f71f3" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" +dependencies = [ + "cfg-if", + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + [[package]] name = "textwrap" version = "0.11.0" @@ -663,7 +723,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.91", + "syn 2.0.94", ] [[package]] @@ -859,9 +919,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.20" +version = "0.6.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" 
+checksum = "39281189af81c07ec09db316b302a3e67bf9bd7cbf6c820b50e35fee9c2fa980" dependencies = [ "memchr", ] @@ -884,7 +944,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.91", + "syn 2.0.94", ] [[package]] diff --git a/license-scan/Cargo.toml b/license-scan/Cargo.toml index d12b2413..e1bfce7a 100644 --- a/license-scan/Cargo.toml +++ b/license-scan/Cargo.toml @@ -17,8 +17,9 @@ ignore = "0.4" lazy_static = "1" semver = { version = "1", features = ["serde"] } serde = { version = "1", features = ["derive"] } -spdx = "0.3" +spdx = "0.10" structopt = { version = "0.3", default-features = false } +tempfile = "3" toml = "0.8" twox-hash = "1" walkdir = "2" diff --git a/license-scan/clarify.toml b/license-scan/clarify.toml index 943fb2bb..d75df29c 100644 --- a/license-scan/clarify.toml +++ b/license-scan/clarify.toml @@ -1,3 +1,10 @@ +[spdx] +ignore-licenses = [ + # Apache-2.0 is often misclassified as Pixar, which is a significantly rarer license. See: + # https://github.com/jpeddicord/askalono/issues/94 + "Pixar" +] + [clarify.askalono] expression = "Apache-2.0" license-files = [ @@ -21,7 +28,6 @@ license-files = [ { path = "COPYING", hash = 0x278afbcf }, { path = "LICENSE-APACHE", hash = 0x24b54f4b }, { path = "LICENSE-MIT", hash = 0x462dee44 }, - { path = "src/unicode/data/LICENSE-UNICODE", hash = 0x70f7339 }, ] [clarify.crossbeam-channel] @@ -58,12 +64,16 @@ license-files = [ { path = "src/unicode_tables/LICENSE-UNICODE", hash = 0xa7f28b93 }, ] -[clarify.unicode-ident] -expression = "(MIT OR Apache-2.0) AND Unicode-DFS-2016" +[clarify.spdx] +expression = "MIT OR Apache-2.0" license-files = [ - { path = "LICENSE-APACHE", hash = 0xb5518783 }, - { path = "LICENSE-MIT", hash = 0x386ca1bc }, - { path = "LICENSE-UNICODE", hash = 0x9698cbbe }, + { path = "LICENSE-MIT", hash = 0xa502ee8a }, + { path = "LICENSE-APACHE", hash = 0x4fccb6b7 }, +] +skip-dirs = [ + # The spdx crate contains the full text of 
referred licenses + "src/text/licenses", + "src/text/exceptions", ] [clarify.zstd-safe] diff --git a/license-scan/src/license_store.rs b/license-scan/src/license_store.rs new file mode 100644 index 00000000..5173bf61 --- /dev/null +++ b/license-scan/src/license_store.rs @@ -0,0 +1,99 @@ +use anyhow::{ensure, Context, Result}; +use serde::Deserialize; +use std::collections::HashSet; +use std::path::Path; +use walkdir::WalkDir; + +#[derive(Debug, Deserialize, Default)] +#[serde(rename_all = "kebab-case")] +pub(crate) struct SPDXOptions { + /// A set of licenses to ignore from the SPDX license data. + #[serde(default)] + pub(crate) ignore_licenses: HashSet, +} + +impl SPDXOptions { + /// Mogrifies SPDX licenses using a set of static rules. + /// + /// This function is implemented to work around quirks in the SPDX data that cause unusual behavior + /// in askalono during license identification. + pub(crate) fn preprocess_licenses( + &self, + input_path: impl AsRef, + output_path: impl AsRef, + ) -> Result<()> { + let input_path = input_path.as_ref(); + let output_path = output_path.as_ref(); + ensure!( + input_path.is_dir(), + "License preprocessing input path is not a directory." + ); + ensure!( + output_path.is_dir(), + "License preprocessing output path is not a directory." + ); + ensure!( + input_path.canonicalize()? != output_path.canonicalize()?, + "License preprocessing must write to a new path" + ); + + let license_iter = WalkDir::new(input_path) + .min_depth(1) + .max_depth(1) + .into_iter(); + + for license_file in license_iter { + let license_file = license_file?; + + if !license_file.file_type().is_file() { + continue; + } + let license_path = license_file.path(); + if license_file + .file_name() + .to_string_lossy() + .ends_with(".json") + && self.should_include_license_file(license_path)? 
+ { + let license_output_path = output_path.join(license_file.file_name()); + std::fs::copy(license_path, &license_output_path).with_context(|| { + format!( + "Failed to copy license from '{}' to '{}'", + license_path.display(), + license_output_path.display() + ) + })?; + } + } + + Ok(()) + } + + fn should_include_license_file(&self, filepath: impl AsRef) -> Result { + let filepath = filepath.as_ref(); + let license_name = filepath + .file_name() + .with_context(|| { + format!( + "License file '{}' seemingly has no filename", + filepath.display() + ) + })? + .to_str() + .with_context(|| { + format!( + "License filename '{}' not valid unicode", + filepath.display() + ) + })? + .strip_suffix(".json") + .with_context(|| { + format!( + "License filename '{}' does not end in '.json'", + filepath.display() + ) + })?; + + Ok(!self.ignore_licenses.contains(license_name)) + } +} diff --git a/license-scan/src/main.rs b/license-scan/src/main.rs index be4b0edb..5403a0fe 100644 --- a/license-scan/src/main.rs +++ b/license-scan/src/main.rs @@ -2,10 +2,13 @@ #![warn(clippy::pedantic)] #![allow(clippy::redundant_closure_for_method_calls)] +mod license_store; + use anyhow::{anyhow, bail, ensure, Context, Result}; use askalono::{ScanStrategy, Store, TextData}; use ignore::types::{Types, TypesBuilder}; use ignore::WalkBuilder; +use license_store::SPDXOptions; use semver::VersionReq; use serde::{Deserialize, Deserializer}; use spdx::Expression; @@ -66,7 +69,13 @@ fn main() -> Result<()> { }; let mut store = Store::new(); - store.load_spdx(&opt.spdx_data, false)?; + + let spdx_proc_dir = tempfile::tempdir()?; + clarify + .spdx + .preprocess_licenses(&opt.spdx_data, &spdx_proc_dir)?; + + store.load_spdx(spdx_proc_dir.path(), false)?; let scanner = ScanStrategy::new(&store) .confidence_threshold(0.93) .shallow_limit(1.0) @@ -147,6 +156,8 @@ fn main() -> Result<()> { struct Clarifications { #[serde(default)] clarify: HashMap, + #[serde(default)] + pub(crate) spdx: SPDXOptions, } /// A 
clarification for a package overrides the auto-detected license string. @@ -195,6 +206,10 @@ struct InnerClarification { /// List of files that should be skipped as they don't contain license information. #[serde(default)] skip_files: Vec, + + /// List of source directories which should not be scanned for license information. + #[serde(default)] + skip_dirs: Vec, } impl InnerClarification { @@ -221,6 +236,7 @@ struct LicenseFile { struct Clarified<'a> { expression: &'a Expression, skip_files: &'a Vec, + skip_dirs: &'a Vec, } impl Clarifications { @@ -247,6 +263,20 @@ impl Clarifications { files.remove(file.as_path()); } + let skipped_dir_files = files + .keys() + .filter(|input_file| { + clarification + .skip_dirs + .iter() + .any(|skipped_dir| input_file.starts_with(skipped_dir)) + }) + .copied() + .collect::>(); + for skipped_file in skipped_dir_files { + files.remove(skipped_file); + } + // convert `clarification.license_files` into a struct we can compare with `files` let clarify_files = clarification .license_files @@ -263,6 +293,7 @@ impl Clarifications { Ok(Some(Clarified { expression: &clarification.expression, skip_files: &clarification.skip_files, + skip_dirs: &clarification.skip_dirs, })) } else { Ok(None) @@ -557,6 +588,7 @@ mod test { Some(Clarified { expression: &spdx::Expression::parse("Apache-2.0").unwrap(), skip_files: &vec![], + skip_dirs: &vec![], }) ); @@ -576,6 +608,7 @@ mod test { Some(Clarified { expression: &spdx::Expression::parse("Apache-2.0").unwrap(), skip_files: &vec![], + skip_dirs: &vec![], }) ); } @@ -601,6 +634,7 @@ mod test { Some(Clarified { expression: &spdx::Expression::parse("MIT").unwrap(), skip_files: &vec![], + skip_dirs: &vec![], }) ); @@ -664,6 +698,7 @@ mod test { Some(Clarified { expression: &spdx::Expression::parse("Apache-2.0 OR BSD-3-Clause").unwrap(), skip_files: &vec![], + skip_dirs: &vec![], }) ); assert_eq!( @@ -680,6 +715,7 @@ mod test { Some(Clarified { expression: &spdx::Expression::parse("Apache-2.0 OR 
MIT").unwrap(), skip_files: &vec![], + skip_dirs: &vec![], }) ); } @@ -705,6 +741,7 @@ mod test { Some(Clarified { expression: &spdx::Expression::parse("BSD-3-Clause").unwrap(), skip_files: &vec![], + skip_dirs: &vec![], }) ); assert_eq!( @@ -721,6 +758,7 @@ mod test { Some(Clarified { expression: &spdx::Expression::parse("BSD-3-Clause AND Apache-2.0").unwrap(), skip_files: &vec![], + skip_dirs: &vec![], }) ); }