Skip to content

Commit 8720d77

Browse files
authored
Merge pull request #11453 from Turbo87/sloc
Add SLoC (Source Lines of Code) metric to versions
2 parents 6c4bab3 + ac21dff commit 8720d77

26 files changed

+1254
-4
lines changed

Cargo.lock

Lines changed: 432 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ crates_io_docs_rs = { path = "crates/crates_io_docs_rs" }
7171
crates_io_env_vars = { path = "crates/crates_io_env_vars" }
7272
crates_io_github = { path = "crates/crates_io_github" }
7373
crates_io_index = { path = "crates/crates_io_index" }
74+
crates_io_linecount = { path = "crates/crates_io_linecount" }
7475
crates_io_markdown = { path = "crates/crates_io_markdown" }
7576
crates_io_og_image = "=0.2.1"
7677
crates_io_pagerduty = { path = "crates/crates_io_pagerduty" }

crates/crates_io_database/src/models/version.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ pub struct Version {
3636
pub documentation: Option<String>,
3737
pub repository: Option<String>,
3838
pub trustpub_data: Option<TrustpubData>,
39+
pub linecounts: Option<serde_json::Value>,
3940
}
4041

4142
impl Version {
@@ -109,6 +110,7 @@ pub struct NewVersion<'a> {
109110
categories: Option<&'a [&'a str]>,
110111
keywords: Option<&'a [&'a str]>,
111112
trustpub_data: Option<&'a TrustpubData>,
113+
linecounts: Option<serde_json::Value>,
112114
}
113115

114116
impl NewVersion<'_> {

crates/crates_io_database/src/schema.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,6 +1089,8 @@ diesel::table! {
10891089
semver_ord -> Nullable<Jsonb>,
10901090
/// JSONB data containing JWT claims from the trusted publisher (e.g., GitHub Actions context like repository, run_id, sha)
10911091
trustpub_data -> Nullable<Jsonb>,
1092+
/// Source Lines of Code statistics for this version, stored as JSON with language breakdown and totals.
1093+
linecounts -> Nullable<Jsonb>,
10921094
}
10931095
}
10941096

crates/crates_io_database_dump/src/dump-db.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,8 @@ categories = "public"
288288
keywords = "public"
289289
# The following column is private for now, until we can guarantee a stable data schema.
290290
trustpub_data = "private"
291+
# The following column is private for now, until we can guarantee a stable data schema.
292+
linecounts = "private"
291293

292294
[versions_published_by.columns]
293295
version_id = "private"
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
[package]
2+
name = "crates_io_linecount"
3+
version = "0.0.0"
4+
description = "Lines of code counting for crates.io using tokei"
5+
license = "MIT OR Apache-2.0"
6+
edition = "2024"
7+
8+
[lints]
9+
workspace = true
10+
11+
[dependencies]
12+
serde = { version = "=1.0.223", features = ["derive"] }
13+
tokei = "=13.0.0-alpha.9"
14+
15+
[dev-dependencies]
16+
claims = "=0.8.0"
17+
insta = { version = "=1.43.2", features = ["json"] }
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
use tokei::LanguageType;
2+
3+
/// Determine if a language should be counted or ignored
4+
pub fn should_ignore_language(lang: LanguageType) -> bool {
5+
matches!(
6+
lang,
7+
// Configuration and data files
8+
LanguageType::Json |
9+
LanguageType::Yaml |
10+
LanguageType::Toml |
11+
LanguageType::Xml |
12+
LanguageType::Ini |
13+
14+
// Documentation
15+
LanguageType::Markdown |
16+
LanguageType::Text |
17+
LanguageType::ReStructuredText |
18+
LanguageType::AsciiDoc |
19+
LanguageType::Org |
20+
21+
// Build system files
22+
LanguageType::Makefile |
23+
LanguageType::CMake |
24+
LanguageType::Dockerfile |
25+
LanguageType::Autoconf |
26+
LanguageType::MsBuild |
27+
LanguageType::Meson |
28+
LanguageType::Scons |
29+
LanguageType::Bazel |
30+
LanguageType::Nix |
31+
32+
// Shell scripts (debatable, but often just build/deploy automation)
33+
LanguageType::Batch |
34+
LanguageType::PowerShell |
35+
36+
// Other non-programming files
37+
LanguageType::Svg |
38+
LanguageType::Hex |
39+
LanguageType::Protobuf |
40+
LanguageType::Thrift
41+
)
42+
}
43+
44+
#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks both sides of the ignore list: a few programming (and
    /// markup/styling) languages that are counted, and a few config/data and
    /// documentation formats that are skipped.
    #[test]
    fn test_should_ignore_language() {
        let counted = [
            LanguageType::Rust,
            LanguageType::JavaScript,
            LanguageType::Html,
            LanguageType::Css,
        ];
        for lang in counted {
            assert!(!should_ignore_language(lang), "{lang:?} should be counted");
        }

        let skipped = [
            LanguageType::Json,
            LanguageType::Yaml,
            LanguageType::Toml,
            LanguageType::Markdown,
        ];
        for lang in skipped {
            assert!(should_ignore_language(lang), "{lang:?} should be ignored");
        }
    }
}
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
mod languages;
2+
mod paths;
3+
4+
use serde::{Deserialize, Serialize};
5+
use std::collections::HashMap;
6+
use std::sync::LazyLock;
7+
use tokei::Config;
8+
9+
pub use crate::paths::PathDetails;
10+
11+
// Re-export LanguageType for use by other crates
12+
pub use tokei::LanguageType;
13+
14+
/// Tokei configuration used for analysis (cached)
15+
static TOKEI_CONFIG: LazyLock<Config> = LazyLock::new(|| Config {
16+
no_ignore: Some(true),
17+
treat_doc_strings_as_comments: Some(true),
18+
..Default::default()
19+
});
20+
21+
/// Statistics for a single programming language
22+
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
23+
pub struct LanguageStats {
24+
/// Number of lines of code (excluding comments and blank lines)
25+
pub code_lines: usize,
26+
/// Number of comment lines
27+
pub comment_lines: usize,
28+
/// Number of files of this language
29+
pub files: usize,
30+
}
31+
32+
/// Complete line count statistics for a crate
33+
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
34+
pub struct LinecountStats {
35+
/// Per-language breakdown of line counts
36+
pub languages: HashMap<LanguageType, LanguageStats>,
37+
/// Total lines of code across all languages
38+
pub total_code_lines: usize,
39+
/// Total comment lines across all languages
40+
pub total_comment_lines: usize,
41+
}
42+
43+
impl LinecountStats {
44+
/// Create a new empty statistics collection
45+
pub fn new() -> Self {
46+
Self::default()
47+
}
48+
49+
/// Add a single file to the statistics
50+
///
51+
/// The caller can use `should_count_path()` to check if a file should be processed
52+
/// before decompressing to avoid unnecessary work.
53+
pub fn add_file(&mut self, language_type: LanguageType, content: &[u8]) {
54+
let file_stats = language_type.parse_from_slice(content, &TOKEI_CONFIG);
55+
56+
// Update language-specific stats
57+
let entry = self.languages.entry(language_type).or_default();
58+
entry.code_lines += file_stats.code;
59+
entry.comment_lines += file_stats.comments;
60+
entry.files += 1;
61+
62+
// Update totals
63+
self.total_code_lines += file_stats.code;
64+
self.total_comment_lines += file_stats.comments;
65+
}
66+
}
67+
68+
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// A freshly created collection serializes to an empty language map and
    /// zeroed totals.
    #[test]
    fn test_empty() {
        let stats = LinecountStats::new();
        insta::assert_json_snapshot!(stats, @r#"
        {
          "languages": {},
          "total_code_lines": 0,
          "total_comment_lines": 0
        }
        "#);
    }

    /// Adding one Rust file updates both the per-language entry and the totals.
    #[test]
    fn test_add_file() {
        // One comment line followed by three lines of code
        let rust_code = b"// This is a comment\nfn main() {\n println!(\"Hello\");\n}";

        let mut stats = LinecountStats::new();
        stats.add_file(LanguageType::Rust, rust_code);

        insta::assert_json_snapshot!(stats, @r#"
        {
          "languages": {
            "Rust": {
              "code_lines": 3,
              "comment_lines": 1,
              "files": 1
            }
          },
          "total_code_lines": 3,
          "total_comment_lines": 1
        }
        "#);
    }

    /// End-to-end flow: classify each path first, then only count the files
    /// that are neither ignored by path nor by language.
    #[test]
    fn test_workflow() {
        let files = [
            ("src/lib.rs", "pub fn hello() {}"),
            ("tests/test.rs", "fn test() {}"), // Should be skipped
            ("README.md", "# Hello"),          // Should be skipped
        ];

        let mut stats = LinecountStats::new();
        for (path, content) in files {
            let details = PathDetails::from_path(Path::new(path));
            if details.should_ignore() {
                continue;
            }

            if let Some(language_type) = details.language_type() {
                stats.add_file(language_type, content.as_bytes());
            }
        }

        insta::assert_json_snapshot!(stats, @r#"
        {
          "languages": {
            "Rust": {
              "code_lines": 1,
              "comment_lines": 0,
              "files": 1
            }
          },
          "total_code_lines": 1,
          "total_comment_lines": 0
        }
        "#);
    }
}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
use crate::languages::should_ignore_language;
2+
use std::path::Path;
3+
use tokei::LanguageType;
4+
5+
#[derive(Debug, Clone, Copy)]
6+
pub struct PathDetails {
7+
is_benchmark: bool,
8+
is_example: bool,
9+
is_hidden: bool,
10+
is_test: bool,
11+
language_type: Option<LanguageType>,
12+
}
13+
14+
impl PathDetails {
15+
pub fn from_path(path: &Path) -> Self {
16+
let path_str = path.to_string_lossy().to_lowercase();
17+
18+
let is_benchmark = path_str.contains("benches/") || path_str.contains("benchmark/");
19+
let is_example = path_str.contains("examples/");
20+
let is_test = path_str.contains("tests/")
21+
|| path_str.contains("test/")
22+
|| path_str.contains("testing/");
23+
24+
let is_hidden = path
25+
.file_name()
26+
.map(|filename| filename.to_string_lossy().starts_with('.'))
27+
.unwrap_or(false);
28+
29+
let language_type = path
30+
.extension()
31+
.and_then(|ext| ext.to_str())
32+
.and_then(LanguageType::from_file_extension);
33+
34+
Self {
35+
is_benchmark,
36+
is_example,
37+
is_hidden,
38+
is_test,
39+
language_type,
40+
}
41+
}
42+
43+
/// Determine if the file should be ignored for line counting purposes
44+
/// because it is a benchmark, example, hidden, or test file.
45+
pub fn should_ignore(&self) -> bool {
46+
self.is_benchmark || self.is_example || self.is_hidden || self.is_test
47+
}
48+
49+
/// Get the actual detected language type, even if it should be ignored.
50+
pub fn actual_language_type(&self) -> Option<LanguageType> {
51+
self.language_type
52+
}
53+
54+
/// Get the detected language type, returning `None` if no language was
55+
/// detected or if the language should be ignored (e.g., data files).
56+
pub fn language_type(&self) -> Option<LanguageType> {
57+
self.language_type.filter(|lt| !should_ignore_language(*lt))
58+
}
59+
}
60+
61+
// Snapshot tests for `PathDetails::from_path`.
#[cfg(test)]
62+
mod tests {
63+
use super::*;
64+
use insta::assert_debug_snapshot;
65+
66+
// Captures the `Debug` output of `PathDetails` for a nested test directory,
// top-level `tests/`, `examples/` and `benches/` directories, and a regular
// source file. The stored snapshot for `tests/integration.rs` expects
// `is_test: true` with a detected language of `Rust`.
// NOTE(review): the stored snapshots are presumably keyed by this test's name
// and record the asserted expression — confirm before renaming or rewording.
#[test]
67+
fn test_should_count_path() {
68+
assert_debug_snapshot!(PathDetails::from_path(Path::new("src/tests/mod.rs")));
69+
assert_debug_snapshot!(PathDetails::from_path(Path::new("tests/integration.rs")));
70+
assert_debug_snapshot!(PathDetails::from_path(Path::new("examples/basic.rs")));
71+
assert_debug_snapshot!(PathDetails::from_path(Path::new("benches/bench.rs")));
72+
assert_debug_snapshot!(PathDetails::from_path(Path::new("src/lib.rs")));
73+
}
74+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
---
2+
source: crates/crates_io_linecount/src/paths.rs
3+
expression: "PathDetails::from_path(Path::new(\"tests/integration.rs\"))"
4+
---
5+
PathDetails {
6+
is_benchmark: false,
7+
is_example: false,
8+
is_hidden: false,
9+
is_test: true,
10+
language_type: Some(
11+
Rust,
12+
),
13+
}

0 commit comments

Comments
 (0)