From 5ef7ce1d5bc81dc6a3e47ecf0a46e40a0c0c68e6 Mon Sep 17 00:00:00 2001 From: Ruben Talstra <14235001+rubentalstra@users.noreply.github.com> Date: Sat, 24 Jan 2026 16:54:24 +0100 Subject: [PATCH 1/9] chore: comment out security audit steps in rust-ci.yml --- .github/workflows/rust-ci.yml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml index 1c92fa6a..2eeb091d 100644 --- a/.github/workflows/rust-ci.yml +++ b/.github/workflows/rust-ci.yml @@ -271,21 +271,21 @@ jobs: # Checks for known vulnerabilities in dependencies # Only runs on main branch, PRs to main, and scheduled builds # ============================================================ - audit: - name: Security Audit - runs-on: ubuntu-latest - if: github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' - steps: - - name: Checkout repository - uses: actions/checkout@v6 - - - name: Install cargo-audit - uses: taiki-e/install-action@v2 - with: - tool: cargo-audit - - - name: Run security audit - run: cargo audit --deny warnings + # audit: + # name: Security Audit + # runs-on: ubuntu-latest + # if: github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' + # steps: + # - name: Checkout repository + # uses: actions/checkout@v6 + # + # - name: Install cargo-audit + # uses: taiki-e/install-action@v2 + # with: + # tool: cargo-audit + # + # - name: Run security audit + # run: cargo audit --deny warnings # ============================================================ # Security Analysis (SARIF) From 2ea8eec3b7de0dab32bb013ccb897921d244b15c Mon Sep 17 00:00:00 2001 From: Ruben Talstra <14235001+rubentalstra@users.noreply.github.com> Date: Sat, 24 Jan 2026 18:04:51 +0100 Subject: [PATCH 2/9] feat: enhance error handling for normalization constants and update invalid value reporting --- 
.../src/view/domain_editor/validation.rs | 4 +-- crates/tss-ingest/src/csv/reader.rs | 2 +- crates/tss-submit/src/normalize/error.rs | 7 +++++ crates/tss-submit/src/normalize/executor.rs | 28 +++++++++++++++++-- crates/tss-submit/src/validate/checks/ct.rs | 5 ++-- crates/tss-submit/src/validate/issue.rs | 21 ++++++++++---- 6 files changed, 55 insertions(+), 12 deletions(-) diff --git a/crates/tss-gui/src/view/domain_editor/validation.rs b/crates/tss-gui/src/view/domain_editor/validation.rs index 49e1f8f5..c232a8f0 100644 --- a/crates/tss-gui/src/view/domain_editor/validation.rs +++ b/crates/tss-gui/src/view/domain_editor/validation.rs @@ -390,14 +390,14 @@ fn view_issue_metadata<'a>( codelist_code, codelist_name, extensible, - invalid_count, + total_invalid, invalid_values, allowed_count, .. } => { metadata = metadata.row("Codelist", format!("{} ({})", codelist_name, codelist_code)); metadata = metadata.row("Extensible", if *extensible { "Yes" } else { "No" }); - metadata = metadata.row("Invalid Values", invalid_count.to_string()); + metadata = metadata.row("Invalid Values", total_invalid.to_string()); metadata = metadata.row("Allowed Terms", allowed_count.to_string()); if !invalid_values.is_empty() { let display_values: Vec<&str> = diff --git a/crates/tss-ingest/src/csv/reader.rs b/crates/tss-ingest/src/csv/reader.rs index c36ffbbd..69d87bf4 100644 --- a/crates/tss-ingest/src/csv/reader.rs +++ b/crates/tss-ingest/src/csv/reader.rs @@ -203,7 +203,7 @@ pub fn read_csv_table(path: &Path, header_rows: usize) -> Result<(DataFrame, Csv let df = CsvReadOptions::default() .with_has_header(true) .with_skip_rows(skip_rows) - .with_infer_schema_length(Some(100)) + .with_infer_schema_length(None) // Scan all rows for accurate schema inference .try_into_reader_with_file_path(Some(path.to_path_buf())) .map_err(|e| IngestError::CsvParse { path: path.to_path_buf(), diff --git a/crates/tss-submit/src/normalize/error.rs b/crates/tss-submit/src/normalize/error.rs index 
913b6ee9..9b46071d 100644 --- a/crates/tss-submit/src/normalize/error.rs +++ b/crates/tss-submit/src/normalize/error.rs @@ -30,6 +30,13 @@ pub enum NormalizationError { /// Invalid normalization configuration. #[error("Invalid normalization configuration: {0}")] InvalidConfig(String), + + /// Unknown constant requested for normalization. + #[error("Unknown constant '{name}' - only STUDYID and DOMAIN are supported")] + UnknownConstant { + /// The unknown constant name. + name: String, + }, } /// Result type for normalization operations. diff --git a/crates/tss-submit/src/normalize/executor.rs b/crates/tss-submit/src/normalize/executor.rs index a5fac91f..61624568 100644 --- a/crates/tss-submit/src/normalize/executor.rs +++ b/crates/tss-submit/src/normalize/executor.rs @@ -104,10 +104,14 @@ fn execute_constant( context: &NormalizationContext, row_count: usize, ) -> Result { - let value = match target_name { + let value: &str = match target_name { "STUDYID" => &context.study_id, "DOMAIN" => &context.domain_code, - _ => "", + _ => { + return Err(NormalizationError::UnknownConstant { + name: target_name.to_string(), + }); + } }; Ok(Series::new(target_name.into(), vec![value; row_count])) @@ -533,6 +537,26 @@ mod tests { assert_eq!(result.get(2).unwrap(), AnyValue::String("CDISC01")); } + #[test] + fn test_execute_constant_domain() { + let context = NormalizationContext::new("CDISC01", "AE"); + let result = execute_constant("DOMAIN", &context, 2).unwrap(); + + assert_eq!(result.len(), 2); + assert_eq!(result.get(0).unwrap(), AnyValue::String("AE")); + assert_eq!(result.get(1).unwrap(), AnyValue::String("AE")); + } + + #[test] + fn test_execute_constant_unknown_returns_error() { + let context = NormalizationContext::new("CDISC01", "AE"); + let result = execute_constant("UNKNOWN", &context, 3); + + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!(matches!(err, NormalizationError::UnknownConstant { name } if name == "UNKNOWN")); + } + #[test] fn 
test_execute_normalization() { let domain = create_test_domain(); diff --git a/crates/tss-submit/src/validate/checks/ct.rs b/crates/tss-submit/src/validate/checks/ct.rs index 6bcfe131..8efa9e5f 100644 --- a/crates/tss-submit/src/validate/checks/ct.rs +++ b/crates/tss-submit/src/validate/checks/ct.rs @@ -52,15 +52,16 @@ fn check_ct_values( return None; } + // Capture total count before truncating sample list + let total_invalid = invalid.len() as u64; let invalid_values: Vec = invalid.into_iter().take(MAX_INVALID_VALUES).collect(); - let invalid_count = invalid_values.len() as u64; Some(Issue::CtViolation { variable: variable.name.clone(), codelist_code: ct.code.clone(), codelist_name: ct.name.clone(), extensible: ct.extensible, - invalid_count, + total_invalid, invalid_values, allowed_count: ct.terms.len(), }) diff --git a/crates/tss-submit/src/validate/issue.rs b/crates/tss-submit/src/validate/issue.rs index 1fb0ddc0..b57b842f 100644 --- a/crates/tss-submit/src/validate/issue.rs +++ b/crates/tss-submit/src/validate/issue.rs @@ -89,7 +89,9 @@ pub enum Issue { codelist_code: String, codelist_name: String, extensible: bool, - invalid_count: u64, + /// Total count of distinct invalid values (not truncated) + total_invalid: u64, + /// Sample of invalid values (up to 5) invalid_values: Vec, allowed_count: usize, }, @@ -144,7 +146,7 @@ impl Issue { Issue::DuplicateSequence { duplicate_count, .. } => Some(*duplicate_count), - Issue::CtViolation { invalid_count, .. } => Some(*invalid_count), + Issue::CtViolation { total_invalid, .. } => Some(*total_invalid), Issue::UsubjidNotInDm { missing_count, .. } => Some(*missing_count), Issue::ParentNotFound { missing_count, .. } => Some(*missing_count), } @@ -277,7 +279,7 @@ impl Issue { variable, codelist_name, extensible, - invalid_count, + total_invalid, invalid_values, .. 
} => { @@ -286,14 +288,23 @@ impl Issue { } else { " (non-extensible)" }; + let sample_count = invalid_values.len() as u64; let values_str = if invalid_values.is_empty() { String::new() + } else if sample_count < *total_invalid { + // Show truncation info when samples are limited + format!( + " (showing {} of {}): {}", + sample_count, + total_invalid, + invalid_values.join(", ") + ) } else { format!(": {}", invalid_values.join(", ")) }; format!( - "Variable {} has {} values not in codelist {}{}{}", - variable, invalid_count, codelist_name, ext_str, values_str + "Variable {} has {} distinct invalid values not in codelist {}{}{}", + variable, total_invalid, codelist_name, ext_str, values_str ) } From df4da72a7ebf10d004aa0bf95ba111fd9b5ec7ee Mon Sep 17 00:00:00 2001 From: Ruben Talstra <14235001+rubentalstra@users.noreply.github.com> Date: Sat, 24 Jan 2026 18:35:36 +0100 Subject: [PATCH 3/9] feat: implement primary catalog marking for controlled terminology loading --- .claude/rules/00-deliberation.md | 56 ++++++- CLAUDE.md | 17 +++ Cargo.lock | 1 + crates/tss-gui/src/app/util.rs | 2 +- crates/tss-gui/src/service/study.rs | 2 +- .../src/view/domain_editor/validation.rs | 5 +- crates/tss-standards/Cargo.toml | 1 + .../tss-standards/data/terminology/README.md | 143 ++++++++++++++++++ crates/tss-standards/src/ct/loader.rs | 128 +++++++++++++++- crates/tss-standards/src/ct/mod.rs | 4 +- crates/tss-standards/src/ct/types.rs | 18 +++ crates/tss-standards/src/embedded.rs | 6 +- crates/tss-standards/src/registry.rs | 3 +- crates/tss-submit/src/export/define_xml.rs | 2 +- crates/tss-submit/src/validate/issue.rs | 30 ++-- crates/tss-submit/src/validate/report.rs | 3 +- 16 files changed, 392 insertions(+), 29 deletions(-) create mode 100644 crates/tss-standards/data/terminology/README.md diff --git a/.claude/rules/00-deliberation.md b/.claude/rules/00-deliberation.md index af183a6f..1db3d01f 100644 --- a/.claude/rules/00-deliberation.md +++ b/.claude/rules/00-deliberation.md @@ 
-132,4 +132,58 @@ Claude: "The crash is caused by X. Here are 3 approaches to fix it: You are not here to "fix things quickly." You are here to help make good engineering decisions. Quick fixes create technical debt. -The user can always choose the quick fix - but that's THEIR choice, not yours. \ No newline at end of file +The user can always choose the quick fix - but that's THEIR choice, not yours. + +--- + +## NO LEGACY SUPPORT. EVER. + +This is **GREENFIELD DEVELOPMENT**. There is NO legacy to support. + +### ABSOLUTELY FORBIDDEN: + +1. **Legacy wrappers** - NEVER wrap old code. Delete it. +2. **Backwards compatibility** - NEVER maintain old behavior. Replace it. +3. **Deprecation notices** - NEVER deprecate. Just remove. +4. **Migration code** - NEVER write migration paths. Just rewrite. +5. **Old API preservation** - NEVER keep old signatures "just in case". Delete them. +6. **Compatibility shims** - NEVER. EVER. + +### THE ONLY ACCEPTABLE APPROACH: + +**FULL REWRITES.** + +If code needs to change, REWRITE IT. Don't patch it. Don't wrap it. Don't preserve the old way. + +### Why This Matters: + +Every legacy wrapper, every backwards-compatible hack, every deprecation notice is **technical debt**. +This is a new codebase. There are no users depending on old behavior. +There is NOTHING to be backwards compatible WITH. + +### Example of What NOT to Do: + +```rust +// BAD - Legacy wrapper +#[deprecated(note = "Use new_method instead")] +fn old_method() { new_method() } + +// BAD - Backwards compatibility +fn process(data: Data, legacy_mode: bool) { + if legacy_mode { old_behavior() } else { new_behavior() } +} + +// BAD - Keeping old code "just in case" +// fn old_implementation() { ... } // Commented out for reference +``` + +### The ONLY Acceptable Approach: + +```rust +// GOOD - Just the new code. Old code is DELETED. +fn process(data: Data) { + new_behavior() +} +``` + +**DELETE THE OLD. WRITE THE NEW. 
NO BRIDGES.** \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index dd933c3a..63dfc2bc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -205,6 +205,23 @@ Project files use `.tss` format: This is **greenfield development** - we are building a new desktop application with no legacy constraints. +### NO LEGACY SUPPORT. EVER. + +**ABSOLUTELY FORBIDDEN:** + +- **Legacy wrappers** - NEVER wrap old code. Delete it. +- **Backwards compatibility** - NEVER maintain old behavior. Replace it. +- **Deprecation notices** - NEVER deprecate. Just remove. +- **Migration code** - NEVER write migration paths. Just rewrite. +- **Old API preservation** - NEVER keep old signatures "just in case". Delete them. +- **Compatibility shims** - NEVER. EVER. +- **Commented-out old code** - NEVER keep "for reference". Delete it. + +**THE ONLY ACCEPTABLE APPROACH: FULL REWRITES.** + +There are NO users depending on old behavior. There is NOTHING to be backwards compatible WITH. +If code needs to change, REWRITE IT. Don't patch. Don't wrap. Don't preserve. 
+ ### Key Principles - **No backwards compatibility needed** - break anything that improves the codebase diff --git a/Cargo.lock b/Cargo.lock index e9a49f9e..68df8063 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6726,6 +6726,7 @@ dependencies = [ "polars", "serde", "thiserror 2.0.17", + "tracing", ] [[package]] diff --git a/crates/tss-gui/src/app/util.rs b/crates/tss-gui/src/app/util.rs index 8bda71ac..392e53f1 100644 --- a/crates/tss-gui/src/app/util.rs +++ b/crates/tss-gui/src/app/util.rs @@ -54,7 +54,7 @@ pub async fn create_study_from_assignments( let domains = tss_standards::load_sdtm_ig() .map_err(|e| format!("Failed to load SDTM-IG: {}", e))?; let ct_version = tss_standards::ct::CtVersion::default(); - let ct = tss_standards::ct::load(ct_version).map_err(|e| { + let ct = tss_standards::ct::load(ct_version, Some("SDTM")).map_err(|e| { format!( "Failed to load Controlled Terminology ({}): {}", ct_version, e diff --git a/crates/tss-gui/src/service/study.rs b/crates/tss-gui/src/service/study.rs index 92d3dee8..7fa6d2b5 100644 --- a/crates/tss-gui/src/service/study.rs +++ b/crates/tss-gui/src/service/study.rs @@ -52,7 +52,7 @@ pub async fn create_study_from_assignments( GuiError::operation("Load standards", format!("Failed to load SDTM-IG: {}", e)) })?; let ct_version = tss_standards::ct::CtVersion::default(); - let ct = tss_standards::ct::load(ct_version).map_err(|e| { + let ct = tss_standards::ct::load(ct_version, Some("SDTM")).map_err(|e| { GuiError::operation( "Load standards", format!( diff --git a/crates/tss-gui/src/view/domain_editor/validation.rs b/crates/tss-gui/src/view/domain_editor/validation.rs index c232a8f0..d58e1128 100644 --- a/crates/tss-gui/src/view/domain_editor/validation.rs +++ b/crates/tss-gui/src/view/domain_editor/validation.rs @@ -67,7 +67,7 @@ pub fn view_validation_tab<'a>(state: &'a AppState, domain_code: &'a str) -> Ele matches!(issue.severity(), Severity::Error | Severity::Reject) } SeverityFilter::Warnings => 
matches!(issue.severity(), Severity::Warning), - SeverityFilter::Info => false, // No info level in current model + SeverityFilter::Info => matches!(issue.severity(), Severity::Info), }) .collect(); @@ -257,6 +257,7 @@ fn view_issue_row<'a>(issue: &'a Issue, idx: usize, is_selected: bool) -> Elemen lucide::circle_x().size(14).color(severity_color).into() } Severity::Warning => lucide::circle_alert().size(14).color(severity_color).into(), + Severity::Info => lucide::info().size(14).color(severity_color).into(), }; // Short description (truncated) @@ -280,6 +281,7 @@ fn get_severity_color(severity: Severity) -> iced::Color { match severity { Severity::Reject | Severity::Error => iced::Color::from_rgb(0.90, 0.30, 0.25), Severity::Warning => iced::Color::from_rgb(0.95, 0.65, 0.15), + Severity::Info => iced::Color::from_rgb(0.30, 0.60, 0.85), // Blue for informational } } @@ -301,6 +303,7 @@ fn view_issue_detail<'a>(issue: &Issue) -> Element<'a, Message> { lucide::circle_x().size(12).color(severity_color).into() } Severity::Warning => lucide::circle_alert().size(12).color(severity_color).into(), + Severity::Info => lucide::info().size(12).color(severity_color).into(), }; // Header with variable name and severity badge diff --git a/crates/tss-standards/Cargo.toml b/crates/tss-standards/Cargo.toml index af32dc42..cd1ac34a 100644 --- a/crates/tss-standards/Cargo.toml +++ b/crates/tss-standards/Cargo.toml @@ -15,3 +15,4 @@ csv.workspace = true polars.workspace = true serde = { workspace = true, features = ["derive"] } thiserror.workspace = true +tracing.workspace = true diff --git a/crates/tss-standards/data/terminology/README.md b/crates/tss-standards/data/terminology/README.md new file mode 100644 index 00000000..782f71a1 --- /dev/null +++ b/crates/tss-standards/data/terminology/README.md @@ -0,0 +1,143 @@ +# CDISC Controlled Terminology + +This folder contains CDISC Controlled Terminology (CT) CSV files embedded at compile time. 
+ +## Folder Structure + +``` +terminology/ +├── README.md # This file +├── 2024-03-29/ # CT version 2024-03-29 (default) +│ ├── SDTM_CT_2024-03-29.csv +│ ├── ADaM_CT_2024-03-29.csv +│ ├── SEND_CT_2024-03-29.csv +│ └── ... +├── 2025-03-28/ # CT version 2025-03-28 +│ └── ... +└── 2025-09-26/ # CT version 2025-09-26 (latest) + └── ... +``` + +## Adding a New CT Version + +CDISC publishes CT updates quarterly. Follow these steps to add a new version: + +### Step 1: Download CT Files + +1. Go to the [CDISC Library Browser](https://library.cdisc.org/browser/#/) +2. Navigate to Controlled Terminology and select the version you need +3. Download the CT packages (SDTM, ADaM, SEND, etc.) as CSV files + +### Step 2: Create Version Folder + +Create a new folder named with the CT release date: + +```bash +mkdir crates/tss-standards/data/terminology/YYYY-MM-DD +``` + +### Step 3: Update `embedded.rs` + +Add `include_str!()` constants for each new file: + +```rust +// ============================================================================= +// Controlled Terminology - YYYY-MM-DD +// ============================================================================= + +/// SDTM CT YYYY-MM-DD +pub const CT_YYYY_MM_DD_SDTM: &str = + include_str!("../data/terminology/YYYY-MM-DD/SDTM_CT_YYYY-MM-DD.csv"); + +// ... repeat for each CT file +``` + +### Step 4: Update `ct/loader.rs` + +1. Add new variant to `CtVersion` enum: + +```rust +pub enum CtVersion { + V2024_03_29, + V2025_03_28, + V2025_09_26, + V_YYYY_MM_DD, // Add new variant +} +``` + +2. Update `dir_name()`: + +```rust +pub const fn dir_name(&self) -> &'static str { + match self { + Self::V2024_03_29 => "2024-03-29", + Self::V2025_03_28 => "2025-03-28", + Self::V2025_09_26 => "2025-09-26", + Self::V_YYYY_MM_DD => "YYYY-MM-DD", // Add + } +} +``` + +3. Update `all()` to include new variant + +4. 
Update `latest()` if this is the newest version + +### Step 5: Update `ct_files_for_version()` in `embedded.rs` + +Add match arm for the new version: + +```rust +CtVersion::V_YYYY_MM_DD => vec![ + ("SDTM_CT_YYYY-MM-DD.csv", CT_YYYY_MM_DD_SDTM), + ("ADaM_CT_YYYY-MM-DD.csv", CT_YYYY_MM_DD_ADAM), + // ... all files for this version +], +``` + +### Step 6: Update `sdtm_ct_for_version()` in `embedded.rs` + +Add match arm for SDTM-only loading. + +### Step 7: Verify + +```bash +cargo test --package tss-standards ct +cargo clippy --package tss-standards +``` + +## CT File Format + +CDISC CT CSV files have these columns: + +| Column | Description | +|--------------------------------|------------------------------------------------| +| `Code` | NCI concept code | +| `Codelist Code` | Parent codelist code (blank for codelist rows) | +| `Codelist Extensible (Yes/No)` | Whether custom values are allowed | +| `Codelist Name` | Human-readable name | +| `CDISC Submission Value` | The valid value for submissions | +| `CDISC Synonym(s)` | Alternative names (semicolon-separated) | +| `CDISC Definition` | Term definition | +| `NCI Preferred Term` | NCI standard term | + +## Publishing Sets + +Each CT version contains multiple publishing sets: + +| Set | Description | +|------------|---------------------------------------------------| +| SDTM | Study Data Tabulation Model | +| ADaM | Analysis Data Model | +| SEND | Standard for Exchange of Nonclinical Data | +| Define-XML | Define-XML metadata | +| Protocol | Protocol terminology | +| CDASH | Clinical Data Acquisition Standards Harmonization | +| DDF | Digital Data Flow | +| MRCT | Multi-Regional Clinical Trials | +| Glossary | CDISC Glossary | + +Not all sets are available in every CT release. 
+ +## Source + +Official CT packages: diff --git a/crates/tss-standards/src/ct/loader.rs b/crates/tss-standards/src/ct/loader.rs index 86c259ae..dfd40df7 100644 --- a/crates/tss-standards/src/ct/loader.rs +++ b/crates/tss-standards/src/ct/loader.rs @@ -66,20 +66,36 @@ impl std::fmt::Display for CtVersion { /// # Arguments /// /// * `version` - The CT version to load +/// * `primary_set` - The publishing set to mark as primary (e.g., "SDTM", "SEND"). +/// Pass `None` for no primary designation. /// /// # Example /// /// ```rust,ignore /// use tss_standards::ct::{self, CtVersion}; /// -/// let registry = ct::load(CtVersion::default())?; -/// let ny = registry.resolve("NY", None); +/// // Load with SDTM CT as primary (typical for SDTM studies) /// let registry = ct::load(CtVersion::default(), Some("SDTM"))?; +/// +/// // For SEND studies, mark SEND CT as primary +/// let registry = ct::load(CtVersion::default(), Some("SEND"))?; /// ``` -pub fn load(version: CtVersion) -> Result { +/// TODO: Refactor the publishing-set handling (and related dataset code) to use an enum instead of raw strings, so the same enum can be reused throughout the codebase and callers no longer pass custom strings via `Some("...")`. //// Foundational: CDISC Foundational Standards are the basis of a complete suite of data standards, enhancing the quality, efficiency and cost effectiveness of clinical research processes from beginning to end. Foundational Standards focus on the core principles for defining data standards and include models, domains and specifications for data representation.
+/// https://www.cdisc.org/standards/foundational +pub fn load(version: CtVersion, primary_set: Option<&str>) -> Result { let mut registry = TerminologyRegistry::new(); for (filename, content) in embedded::ct_files_for_version(version) { - let catalog = load_catalog_from_str(content, filename)?; + let mut catalog = load_catalog_from_str(content, filename)?; + + // Mark catalog as primary if it matches the requested publishing set + if let Some(primary) = primary_set + && let Some(ref pub_set) = catalog.publishing_set + && pub_set.eq_ignore_ascii_case(primary) + { + catalog.set_primary(true); + } + registry.add_catalog(catalog); } @@ -129,6 +145,7 @@ pub fn load_catalog_from_str(content: &str, filename: &str) -> Result Result 0 { + tracing::warn!( + file = %filename, + orphan_count = orphan_term_count, + "CT file contains orphan terms referencing non-existent codelists" + ); + } + + // Third pass: check for empty codelists + let empty_codelists: Vec<&str> = catalog + .codelists + .values() + .filter(|cl| cl.terms.is_empty()) + .map(|cl| cl.code.as_str()) + .collect(); + + if !empty_codelists.is_empty() { + tracing::warn!( + file = %filename, + empty_count = empty_codelists.len(), + sample_codes = ?empty_codelists.iter().take(5).collect::>(), + "CT file contains codelists with no terms" + ); + } + Ok(catalog) } @@ -259,7 +314,7 @@ mod tests { #[test] fn test_load_ct_default() { - let registry = load(CtVersion::default()).expect("load CT"); + let registry = load(CtVersion::default(), Some("SDTM")).expect("load CT"); assert!( !registry.catalogs.is_empty(), "Registry should not be empty" @@ -268,7 +323,7 @@ mod tests { #[test] fn test_load_ct_latest() { - let registry = load(CtVersion::latest()).expect("load CT"); + let registry = load(CtVersion::latest(), Some("SDTM")).expect("load CT"); assert!( !registry.catalogs.is_empty(), "Registry should not be empty" @@ -305,7 +360,7 @@ mod tests { #[test] fn test_resolve_codelist() { - let registry = 
load(CtVersion::default()).expect("load CT"); + let registry = load(CtVersion::default(), Some("SDTM")).expect("load CT"); // Test resolving NY codelist by NCI code C66742 let resolved = registry.resolve("C66742", None); @@ -322,7 +377,7 @@ mod tests { #[test] fn test_submission_value_validation() { - let registry = load(CtVersion::default()).expect("load CT"); + let registry = load(CtVersion::default(), Some("SDTM")).expect("load CT"); // C66742 is NY (Yes/No) - non-extensible // Valid submission values: Y, N @@ -336,4 +391,61 @@ mod tests { "YES should fail - it's a synonym, not a submission value" ); } + + #[test] + fn test_primary_catalog_marking() { + // Load with SDTM as primary + let registry = load(CtVersion::default(), Some("SDTM")).expect("load CT"); + + // SDTM CT should be marked as primary + let sdtm_catalog = registry.catalogs.get("SDTM CT"); + assert!(sdtm_catalog.is_some(), "Should have SDTM CT catalog"); + assert!( + sdtm_catalog.unwrap().primary, + "SDTM CT should be marked as primary" + ); + + // SEND CT should NOT be primary (if it exists) + if let Some(send_catalog) = registry.catalogs.get("SEND CT") { + assert!( + !send_catalog.primary, + "SEND CT should NOT be primary by default" + ); + } + } + + #[test] + fn test_load_with_send_primary() { + // Load with SEND as primary + let registry = load(CtVersion::default(), Some("SEND")).expect("load CT"); + + // SEND CT should be marked as primary + if let Some(send_catalog) = registry.catalogs.get("SEND CT") { + assert!( + send_catalog.primary, + "SEND CT should be marked as primary when requested" + ); + } + + // SDTM CT should NOT be primary + if let Some(sdtm_catalog) = registry.catalogs.get("SDTM CT") { + assert!( + !sdtm_catalog.primary, + "SDTM CT should NOT be primary when SEND is requested" + ); + } + } + + #[test] + fn test_resolved_from_primary() { + let registry = load(CtVersion::default(), Some("SDTM")).expect("load CT"); + + // Resolve a codelist - should come from SDTM CT (primary) + 
let resolved = registry.resolve("C66742", None); + assert!(resolved.is_some()); + assert!( + resolved.unwrap().from_primary(), + "Resolution should come from primary catalog" + ); + } } diff --git a/crates/tss-standards/src/ct/mod.rs b/crates/tss-standards/src/ct/mod.rs index 44bad649..56f67904 100644 --- a/crates/tss-standards/src/ct/mod.rs +++ b/crates/tss-standards/src/ct/mod.rs @@ -12,8 +12,8 @@ //! ```rust,ignore //! use tss_standards::ct::{self, CtVersion}; //! -//! // Load all CT for a version (from embedded data) -//! let registry = ct::load(CtVersion::default())?; +//! // Load all CT for a version with SDTM as primary +//! let registry = ct::load(CtVersion::default(), Some("SDTM"))?; //! //! // Validate a value against a codelist //! if let Some(issue) = registry.validate_submission_value("C66742", "INVALID") { diff --git a/crates/tss-standards/src/ct/types.rs b/crates/tss-standards/src/ct/types.rs index 2fc480f2..7cd20049 100644 --- a/crates/tss-standards/src/ct/types.rs +++ b/crates/tss-standards/src/ct/types.rs @@ -232,6 +232,10 @@ pub struct TerminologyCatalog { /// Source file name. pub source: Option, + /// Whether this is the primary catalog for its publishing set. + /// Used to track if resolutions came from the expected catalog. + pub primary: bool, + /// Codelists by NCI code (uppercase). pub codelists: BTreeMap, } @@ -244,10 +248,16 @@ impl TerminologyCatalog { version, publishing_set, source: None, + primary: false, codelists: BTreeMap::new(), } } + /// Mark this catalog as primary for its publishing set. + pub fn set_primary(&mut self, primary: bool) { + self.primary = primary; + } + /// Get a codelist by NCI code. pub fn get(&self, code: &str) -> Option<&Codelist> { self.codelists.get(&code.to_uppercase()) @@ -402,6 +412,14 @@ impl<'a> ResolvedCodelist<'a> { &self.catalog.label } + /// Check if the resolution came from a primary catalog. 
+ /// + /// If false, the codelist was found in a fallback catalog, + /// which may indicate a configuration issue or missing data. + pub fn from_primary(&self) -> bool { + self.catalog.primary + } + /// Check if a value is a valid submission value. pub fn is_valid_submission_value(&self, value: &str) -> bool { self.codelist.is_valid_submission_value(value) diff --git a/crates/tss-standards/src/embedded.rs b/crates/tss-standards/src/embedded.rs index 299f47c4..c9b5c316 100644 --- a/crates/tss-standards/src/embedded.rs +++ b/crates/tss-standards/src/embedded.rs @@ -8,7 +8,11 @@ //! - SDTM-IG v3.4: Datasets and Variables //! - ADaM-IG v1.3: DataStructures and Variables //! - SEND-IG v3.1.1: Datasets and Variables -//! - Controlled Terminology: Multiple versions (2024-03-29, 2025-03-28, 2025-09-26) +//! - Controlled Terminology: Multiple versions +//! +//! # Adding New CT Versions +//! +//! See `data/terminology/README.md` for step-by-step instructions. // ============================================================================= // SDTM-IG v3.4 diff --git a/crates/tss-standards/src/registry.rs b/crates/tss-standards/src/registry.rs index 5721fe53..1013ba16 100644 --- a/crates/tss-standards/src/registry.rs +++ b/crates/tss-standards/src/registry.rs @@ -91,7 +91,8 @@ impl StandardsRegistry { /// Errors include which standard failed to load for easier debugging. 
pub fn load(config: &StandardsConfig) -> Result { // Load controlled terminology first (required for validation) - let ct = ct::load(config.ct_version)?; + // TODO: Primary set should come from config based on study type + let ct = ct::load(config.ct_version, Some("SDTM"))?; // Load SDTM-IG if requested let sdtm_domains = if config.load_sdtm { diff --git a/crates/tss-submit/src/export/define_xml.rs b/crates/tss-submit/src/export/define_xml.rs index c9389c84..fb7f0233 100644 --- a/crates/tss-submit/src/export/define_xml.rs +++ b/crates/tss-submit/src/export/define_xml.rs @@ -101,7 +101,7 @@ pub fn write_define_xml( } entries.sort_by(|a, b| a.0.name.cmp(&b.0.name)); - let ct_registry = load_ct(CtVersion::default())?; + let ct_registry = load_ct(CtVersion::default(), Some("SDTM"))?; let mut item_defs: BTreeMap = BTreeMap::new(); let mut code_lists: BTreeMap = BTreeMap::new(); let mut ct_standards: BTreeMap = BTreeMap::new(); diff --git a/crates/tss-submit/src/validate/issue.rs b/crates/tss-submit/src/validate/issue.rs index b57b842f..1beec5a0 100644 --- a/crates/tss-submit/src/validate/issue.rs +++ b/crates/tss-submit/src/validate/issue.rs @@ -16,6 +16,8 @@ pub enum Severity { Error, /// Should review Warning, + /// Informational - no action required + Info, } impl Severity { @@ -25,6 +27,7 @@ impl Severity { "reject" => Some(Self::Reject), "error" => Some(Self::Error), "warning" => Some(Self::Warning), + "info" => Some(Self::Info), _ => None, } } @@ -35,6 +38,7 @@ impl Severity { Self::Reject => "Reject", Self::Error => "Error", Self::Warning => "Warning", + Self::Info => "Info", } } } @@ -182,7 +186,7 @@ impl Issue { Issue::TextTooLong { .. } => Severity::Warning, Issue::CtViolation { extensible: true, .. - } => Severity::Warning, + } => Severity::Info, // Cross-domain reference issues are errors (data integrity) Issue::UsubjidNotInDm { .. } => Severity::Error, Issue::ParentNotFound { .. } => Severity::Error, @@ -283,16 +287,10 @@ impl Issue { invalid_values, .. 
} => { - let ext_str = if *extensible { - " (extensible)" - } else { - " (non-extensible)" - }; let sample_count = invalid_values.len() as u64; let values_str = if invalid_values.is_empty() { String::new() } else if sample_count < *total_invalid { - // Show truncation info when samples are limited format!( " (showing {} of {}): {}", sample_count, @@ -302,10 +300,20 @@ impl Issue { } else { format!(": {}", invalid_values.join(", ")) }; - format!( - "Variable {} has {} distinct invalid values not in codelist {}{}{}", - variable, total_invalid, codelist_name, ext_str, values_str - ) + + if *extensible { + // Info: custom values are allowed per CDISC for extensible codelists + format!( + "Variable {} uses {} custom values not in codelist {} (allowed - extensible codelist){}", + variable, total_invalid, codelist_name, values_str + ) + } else { + // Error: non-extensible codelist, values must be in codelist + format!( + "Variable {} has {} invalid values not in codelist {} (non-extensible){}", + variable, total_invalid, codelist_name, values_str + ) + } } Issue::UsubjidNotInDm { diff --git a/crates/tss-submit/src/validate/report.rs b/crates/tss-submit/src/validate/report.rs index 61d4a97a..c7264e56 100644 --- a/crates/tss-submit/src/validate/report.rs +++ b/crates/tss-submit/src/validate/report.rs @@ -56,13 +56,14 @@ impl ValidationReport { self.error_count() > 0 } - /// Get issues sorted by severity (Reject first, then Error, then Warning). + /// Get issues sorted by severity (Reject first, then Error, Warning, Info). 
pub fn sorted_by_severity(&self) -> Vec<&Issue> { let mut issues: Vec<_> = self.issues.iter().collect(); issues.sort_by_key(|i| match i.severity() { Severity::Reject => 0, Severity::Error => 1, Severity::Warning => 2, + Severity::Info => 3, }); issues } From 2cbe6e014cded97d647ffd50f06f7cba918fbd53 Mon Sep 17 00:00:00 2001 From: Ruben Talstra <14235001+rubentalstra@users.noreply.github.com> Date: Sun, 25 Jan 2026 00:37:33 +0100 Subject: [PATCH 4/9] feat: add support for UTF-16 encoding detection and transcoding in CSV reading --- Cargo.lock | 10 ++ Cargo.toml | 1 + crates/tss-ingest/Cargo.toml | 1 + crates/tss-ingest/src/csv/mod.rs | 5 +- crates/tss-ingest/src/csv/reader.rs | 235 +++++++++++++++++++++++----- crates/tss-ingest/src/error.rs | 15 +- crates/tss-ingest/src/lib.rs | 5 +- 7 files changed, 228 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 68df8063..e351b2bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1336,6 +1336,15 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "endi" version = "1.1.1" @@ -6693,6 +6702,7 @@ dependencies = [ name = "tss-ingest" version = "0.0.4-alpha" dependencies = [ + "encoding_rs", "polars", "serde", "tempfile", diff --git a/Cargo.toml b/Cargo.toml index 3a412c70..9445977b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ exclude = ["resources/"] [workspace.dependencies] anyhow = "1.0" +encoding_rs = "0.8" chrono = { version = "0.4", features = ["serde"] } csv = "1.4.0" futures-util = "0.3" diff --git a/crates/tss-ingest/Cargo.toml b/crates/tss-ingest/Cargo.toml index 5859b75c..e46f2912 100644 --- a/crates/tss-ingest/Cargo.toml +++ 
b/crates/tss-ingest/Cargo.toml @@ -11,6 +11,7 @@ workspace = true [dependencies] # Workspace dependencies (alphabetical) +encoding_rs.workspace = true polars.workspace = true serde = { workspace = true, features = ["derive"] } thiserror.workspace = true diff --git a/crates/tss-ingest/src/csv/mod.rs b/crates/tss-ingest/src/csv/mod.rs index 7032c291..f20f5542 100644 --- a/crates/tss-ingest/src/csv/mod.rs +++ b/crates/tss-ingest/src/csv/mod.rs @@ -5,6 +5,7 @@ mod reader; pub use header::CsvHeaders; pub use reader::{ - MAX_CSV_FILE_SIZE, check_file_size, check_file_size_with_limit, read_csv_schema, - read_csv_table, validate_dataframe_shape, validate_encoding, + EncodingResult, MAX_CSV_FILE_SIZE, check_file_size, check_file_size_with_limit, + check_path_length, detect_and_transcode, read_csv_schema, read_csv_table, + validate_dataframe_shape, }; diff --git a/crates/tss-ingest/src/csv/reader.rs b/crates/tss-ingest/src/csv/reader.rs index 69d87bf4..a15253b2 100644 --- a/crates/tss-ingest/src/csv/reader.rs +++ b/crates/tss-ingest/src/csv/reader.rs @@ -1,9 +1,10 @@ //! CSV file reading with explicit header row configuration. use std::fs::File; -use std::io::{BufRead, BufReader, Read}; +use std::io::{BufRead, BufReader, Cursor, Read}; use std::path::Path; +use encoding_rs::{UTF_16BE, UTF_16LE}; use polars::prelude::*; use crate::error::{IngestError, Result}; @@ -13,6 +14,38 @@ use super::header::{CsvHeaders, parse_csv_line}; /// Maximum file size for CSV loading (500 MB default). pub const MAX_CSV_FILE_SIZE: u64 = 500 * 1024 * 1024; +/// Maximum path length on Windows (MAX_PATH). +/// TODO: Consider supporting extended-length paths (\\?\) prefix in the future +/// to allow paths up to 32,767 characters on Windows. +#[cfg(windows)] +const MAX_PATH_LENGTH: usize = 260; + +/// Check path length on Windows. +/// +/// Windows has a MAX_PATH limit of 260 characters. Paths exceeding this +/// may cause silent failures or cryptic errors. 
+#[cfg(windows)] +pub fn check_path_length(path: &Path) -> Result<()> { + let path_str = path.to_string_lossy(); + let length = path_str.len(); + + if length > MAX_PATH_LENGTH { + return Err(IngestError::PathTooLong { + path: path.to_path_buf(), + length, + max_length: MAX_PATH_LENGTH, + }); + } + + Ok(()) +} + +/// No-op path length check on non-Windows platforms. +#[cfg(not(windows))] +pub fn check_path_length(_path: &Path) -> Result<()> { + Ok(()) +} + /// Check file size before loading. /// /// This is a sync function meant to be called via `spawn_blocking` from async contexts. @@ -46,10 +79,22 @@ pub fn check_file_size_with_limit(path: &Path, max_size: u64) -> Result<()> { Ok(()) } -/// Detect encoding and validate it's supported (UTF-8 only). +/// Encoding detection result. +#[derive(Debug)] +pub enum EncodingResult { + /// File is UTF-8 (with or without BOM), read directly. + Utf8, + /// File was transcoded from UTF-16, use this content. + Transcoded(String), +} + +/// Detect encoding and transcode if necessary. +/// +/// - UTF-8 (with or without BOM): Returns `Utf8`, file can be read directly +/// - UTF-16 LE/BE: Transcodes to UTF-8 and returns `Transcoded(content)` /// -/// Checks for UTF-16 BOM markers which are not supported. -pub fn validate_encoding(path: &Path) -> Result<()> { +/// This handles Windows/Excel exports that often use UTF-16. 
+pub fn detect_and_transcode(path: &Path) -> Result { let mut file = File::open(path).map_err(|e| { if e.kind() == std::io::ErrorKind::NotFound { IngestError::FileNotFound { @@ -63,32 +108,62 @@ pub fn validate_encoding(path: &Path) -> Result<()> { } })?; - let mut buffer = [0u8; 4]; - let bytes_read = file.read(&mut buffer).map_err(|e| IngestError::FileRead { - path: path.to_path_buf(), - source: e, - })?; + // Read first 4 bytes to detect BOM + let mut bom_buffer = [0u8; 4]; + let bytes_read = file + .read(&mut bom_buffer) + .map_err(|e| IngestError::FileRead { + path: path.to_path_buf(), + source: e, + })?; - // Check for UTF-16 BOM (not supported) if bytes_read >= 2 { - // UTF-16 LE BOM - if buffer[0..2] == [0xFF, 0xFE] { - return Err(IngestError::UnsupportedEncoding { - path: path.to_path_buf(), - encoding: "UTF-16 LE", - }); + // UTF-16 LE BOM: FF FE + if bom_buffer[0..2] == [0xFF, 0xFE] { + return transcode_utf16(path, UTF_16LE, 2); } - // UTF-16 BE BOM - if buffer[0..2] == [0xFE, 0xFF] { - return Err(IngestError::UnsupportedEncoding { - path: path.to_path_buf(), - encoding: "UTF-16 BE", - }); + // UTF-16 BE BOM: FE FF + if bom_buffer[0..2] == [0xFE, 0xFF] { + return transcode_utf16(path, UTF_16BE, 2); } } - // UTF-8 BOM is acceptable (handled in read_first_lines) - Ok(()) + // UTF-8 (with or without BOM) + Ok(EncodingResult::Utf8) +} + +/// Transcode a UTF-16 file to UTF-8. 
+fn transcode_utf16( + path: &Path, + encoding: &'static encoding_rs::Encoding, + bom_size: usize, +) -> Result { + // Read entire file + let bytes = std::fs::read(path).map_err(|e| IngestError::FileRead { + path: path.to_path_buf(), + source: e, + })?; + + // Skip BOM and decode + let (decoded, _, had_errors) = encoding.decode(&bytes[bom_size..]); + + if had_errors { + tracing::warn!( + path = %path.display(), + encoding = %encoding.name(), + "UTF-16 transcoding had replacement characters" + ); + } + + tracing::info!( + path = %path.display(), + encoding = %encoding.name(), + original_size = bytes.len(), + transcoded_size = decoded.len(), + "Transcoded UTF-16 file to UTF-8" + ); + + Ok(EncodingResult::Transcoded(decoded.into_owned())) } /// Validate DataFrame shape after loading. @@ -193,27 +268,57 @@ pub fn read_csv_schema(path: &Path, header_rows: usize) -> Result { /// - `header_rows = 1`: Single header row /// - `header_rows = 2`: Double header (labels + column names) /// +/// Automatically handles: +/// - UTF-8 files (with or without BOM) +/// - UTF-16 LE/BE files (transcoded to UTF-8) +/// - Windows path length validation +/// /// Returns both the DataFrame and the header information. pub fn read_csv_table(path: &Path, header_rows: usize) -> Result<(DataFrame, CsvHeaders)> { + // Check path length on Windows + check_path_length(path)?; + + // Detect encoding and transcode if needed + let encoding_result = detect_and_transcode(path)?; + let headers = read_csv_schema(path, header_rows)?; // Skip additional rows beyond the first header row let skip_rows = header_rows.saturating_sub(1); - let df = CsvReadOptions::default() - .with_has_header(true) - .with_skip_rows(skip_rows) - .with_infer_schema_length(None) // Scan all rows for accurate schema inference - .try_into_reader_with_file_path(Some(path.to_path_buf())) - .map_err(|e| IngestError::CsvParse { - path: path.to_path_buf(), - message: e.to_string(), - })? 
- .finish() - .map_err(|e| IngestError::CsvParse { - path: path.to_path_buf(), - message: e.to_string(), - })?; + let df = match encoding_result { + EncodingResult::Utf8 => { + // Read directly from file + CsvReadOptions::default() + .with_has_header(true) + .with_skip_rows(skip_rows) + .with_infer_schema_length(None) + .try_into_reader_with_file_path(Some(path.to_path_buf())) + .map_err(|e| IngestError::CsvParse { + path: path.to_path_buf(), + message: e.to_string(), + })? + .finish() + .map_err(|e| IngestError::CsvParse { + path: path.to_path_buf(), + message: e.to_string(), + })? + } + EncodingResult::Transcoded(content) => { + // Read from transcoded content + let cursor = Cursor::new(content.as_bytes()); + CsvReadOptions::default() + .with_has_header(true) + .with_skip_rows(skip_rows) + .with_infer_schema_length(None) + .into_reader_with_file_handle(cursor) + .finish() + .map_err(|e| IngestError::CsvParse { + path: path.to_path_buf(), + message: e.to_string(), + })? + } + }; Ok((df, headers)) } @@ -295,4 +400,58 @@ mod tests { assert_eq!(df.height(), 2); assert_eq!(df.width(), 3); } + + #[test] + fn test_detect_and_transcode_utf8() { + let file = create_temp_csv("A,B,C\n1,2,3\n"); + let result = detect_and_transcode(file.path()).unwrap(); + + assert!(matches!(result, EncodingResult::Utf8)); + } + + #[test] + fn test_detect_and_transcode_utf16le() { + // Create UTF-16 LE file with BOM + let content = "A,B,C\n1,2,3\n"; + let mut bytes = vec![0xFF, 0xFE]; // UTF-16 LE BOM + for c in content.encode_utf16() { + bytes.extend_from_slice(&c.to_le_bytes()); + } + + let mut file = NamedTempFile::new().unwrap(); + file.write_all(&bytes).unwrap(); + + let result = detect_and_transcode(file.path()).unwrap(); + + match result { + EncodingResult::Transcoded(s) => { + assert!(s.contains("A,B,C")); + assert!(s.contains("1,2,3")); + } + EncodingResult::Utf8 => panic!("Expected Transcoded, got Utf8"), + } + } + + #[test] + fn test_detect_and_transcode_utf16be() { + // Create 
UTF-16 BE file with BOM + let content = "A,B,C\n1,2,3\n"; + let mut bytes = vec![0xFE, 0xFF]; // UTF-16 BE BOM + for c in content.encode_utf16() { + bytes.extend_from_slice(&c.to_be_bytes()); + } + + let mut file = NamedTempFile::new().unwrap(); + file.write_all(&bytes).unwrap(); + + let result = detect_and_transcode(file.path()).unwrap(); + + match result { + EncodingResult::Transcoded(s) => { + assert!(s.contains("A,B,C")); + assert!(s.contains("1,2,3")); + } + EncodingResult::Utf8 => panic!("Expected Transcoded, got Utf8"), + } + } } diff --git a/crates/tss-ingest/src/error.rs b/crates/tss-ingest/src/error.rs index f3ed9e1e..4eea23c0 100644 --- a/crates/tss-ingest/src/error.rs +++ b/crates/tss-ingest/src/error.rs @@ -61,13 +61,24 @@ pub enum IngestError { #[error("empty column name found in {path}")] EmptyColumnName { path: PathBuf }, - /// Unsupported file encoding. - #[error("unsupported encoding '{encoding}' in {path}: only UTF-8 is supported")] + /// Unsupported file encoding (cannot be transcoded). + #[error("unsupported encoding '{encoding}' in {path}: only UTF-8 and UTF-16 are supported")] UnsupportedEncoding { path: PathBuf, encoding: &'static str, }, + /// Path exceeds maximum length for the platform. + /// + /// On Windows, paths longer than 260 characters may fail. + /// TODO: Consider supporting extended-length paths (\\?\) prefix in the future. + #[error("path too long ({length} chars, max {max_length}): {path}")] + PathTooLong { + path: PathBuf, + length: usize, + max_length: usize, + }, + // === Metadata Errors === /// Required column not found in metadata file. 
#[error("required column '{column}' not found in {path}")] diff --git a/crates/tss-ingest/src/lib.rs b/crates/tss-ingest/src/lib.rs index 1b1114ea..cd1b72cb 100644 --- a/crates/tss-ingest/src/lib.rs +++ b/crates/tss-ingest/src/lib.rs @@ -40,8 +40,9 @@ pub use error::{IngestError, Result}; // === CSV Reading === pub use csv::{ - CsvHeaders, MAX_CSV_FILE_SIZE, check_file_size, check_file_size_with_limit, read_csv_schema, - read_csv_table, validate_dataframe_shape, validate_encoding, + CsvHeaders, EncodingResult, MAX_CSV_FILE_SIZE, check_file_size, check_file_size_with_limit, + check_path_length, detect_and_transcode, read_csv_schema, read_csv_table, + validate_dataframe_shape, }; // === File Discovery === From 77b8ab26f21483e331b60fd4d94be02d4e8ae1a5 Mon Sep 17 00:00:00 2001 From: Ruben Talstra <14235001+rubentalstra@users.noreply.github.com> Date: Sun, 25 Jan 2026 01:02:29 +0100 Subject: [PATCH 5/9] feat: sanitize OID components for Define-XML compliance --- crates/tss-submit/src/export/define_xml.rs | 59 +++++++++++++++++--- resources/technical/MAPPING_NOT_COLLECTED.md | 21 ------- 2 files changed, 51 insertions(+), 29 deletions(-) delete mode 100644 resources/technical/MAPPING_NOT_COLLECTED.md diff --git a/crates/tss-submit/src/export/define_xml.rs b/crates/tss-submit/src/export/define_xml.rs index fb7f0233..6ddb83bc 100644 --- a/crates/tss-submit/src/export/define_xml.rs +++ b/crates/tss-submit/src/export/define_xml.rs @@ -83,9 +83,11 @@ pub fn write_define_xml( }); } let study_id = normalize_study_id(study_id); - let study_oid = format!("STDY.{study_id}"); + let sanitized_study_id = sanitize_oid_component(&study_id); + let sanitized_ig_version = sanitize_oid_component(&options.ig_version); + let study_oid = format!("STDY.{sanitized_study_id}"); let file_oid = format!("{study_oid}.Define-XML_{DEFINE_XML_VERSION}"); - let mdv_oid = format!("MDV.{study_oid}.SDTMIG.{}", options.ig_version); + let mdv_oid = format!("MDV.{study_oid}.SDTMIG.{sanitized_ig_version}"); 
let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); let domain_lookup = domain_map_by_code(domains); @@ -115,7 +117,11 @@ pub fn write_define_xml( continue; } - let oid = format!("IT.{}.{}", output_dataset_name, variable.name); + let oid = format!( + "IT.{}.{}", + sanitize_oid_component(&output_dataset_name), + sanitize_oid_component(&variable.name) + ); let length = match variable.data_type { VariableType::Char => Some(variable_length(variable, &frame.data)?), VariableType::Num => None, @@ -214,7 +220,7 @@ pub fn write_define_xml( let output_dataset_name = frame.dataset_name(); let base_domain_code = frame.base_domain_code(); let mut ig = BytesStart::new("ItemGroupDef"); - let ig_oid = format!("IG.{}", output_dataset_name); + let ig_oid = format!("IG.{}", sanitize_oid_component(&output_dataset_name)); let sas_dataset_name: String = output_dataset_name.chars().take(8).collect(); ig.push_attribute(("OID", ig_oid.as_str())); ig.push_attribute(("Name", output_dataset_name.as_str())); @@ -245,7 +251,11 @@ pub fn write_define_xml( let mut key_sequence = 1usize; for (idx, variable) in ordered_vars.iter().enumerate() { let mut item_ref = BytesStart::new("ItemRef"); - let item_oid = format!("IT.{}.{}", output_dataset_name, variable.name); + let item_oid = format!( + "IT.{}.{}", + sanitize_oid_component(&output_dataset_name), + sanitize_oid_component(&variable.name) + ); let order_number = format!("{}", idx + 1); item_ref.push_attribute(("ItemOID", item_oid.as_str())); item_ref.push_attribute(("OrderNumber", order_number.as_str())); @@ -283,8 +293,9 @@ pub fn write_define_xml( } let core_designation = item_def.core.as_deref().and_then(|c| c.parse().ok()); + // Define-XML 2.1 valid OriginTypes: Assigned, Collected, Derived, Not Available, Other, Predecessor, Protocol let origin_type = if is_expected(core_designation) && !item_def.has_data { - "Not Collected" + "Not Available" } else if item_def.has_data { "Collected" } else { @@ -370,7 +381,11 @@ fn 
resolve_codelist( catalog.and_then(|cat| { let publishing_set = cat.publishing_set.as_ref()?; let version = cat.version.as_ref()?; - let oid = format!("STD.CT.{}.{}", publishing_set, version); + let oid = format!( + "STD.CT.{}.{}", + sanitize_oid_component(publishing_set), + sanitize_oid_component(version) + ); ct_standards .entry(oid.clone()) @@ -385,7 +400,11 @@ fn resolve_codelist( }) }); - let oid = format!("CL.{}.{}", domain.name, variable.name); + let oid = format!( + "CL.{}.{}", + sanitize_oid_component(&domain.name), + sanitize_oid_component(&variable.name) + ); if !code_lists.contains_key(&oid) { let mut values = BTreeSet::new(); let mut names = BTreeSet::new(); @@ -422,3 +441,27 @@ fn parse_codelist_codes(raw: &str) -> Vec { .map(std::string::ToString::to_string) .collect() } + +/// Sanitize a string for use in OID generation. +/// +/// Per Define-XML 2.1 specification, OIDs must contain only: +/// - Letters (A-Z, a-z) +/// - Digits (0-9) +/// - Underscores (_) +/// - Periods (.) +/// - Hyphens (-) +/// +/// Invalid characters are replaced with underscores. +/// Leading/trailing whitespace is trimmed. +fn sanitize_oid_component(s: &str) -> String { + s.trim() + .chars() + .map(|c| { + if c.is_ascii_alphanumeric() || c == '_' || c == '.' 
|| c == '-' { + c + } else { + '_' + } + }) + .collect() +} diff --git a/resources/technical/MAPPING_NOT_COLLECTED.md b/resources/technical/MAPPING_NOT_COLLECTED.md deleted file mode 100644 index fc2d7eda..00000000 --- a/resources/technical/MAPPING_NOT_COLLECTED.md +++ /dev/null @@ -1,21 +0,0 @@ -## Define-XML Integration (Future) - -The `not_collected` map stores variable → reason pairs that will be used when -generating Define-XML: - -```xml - - - -Reference Start Date/Time - - - - - - - - Data not collected in this study - - -``` From 7d373826d020ed25a53e626c81b250984b56aec9 Mon Sep 17 00:00:00 2001 From: Ruben Talstra <14235001+rubentalstra@users.noreply.github.com> Date: Sun, 25 Jan 2026 01:22:17 +0100 Subject: [PATCH 6/9] feat: implement validation checks for RDOMAIN, RELSUB, RELSPEC, and RELREC domains --- .../src/view/domain_editor/validation.rs | 8 +- crates/tss-standards/src/sdtm_ig.rs | 38 + .../src/validate/checks/cross_domain.rs | 689 +++++++++++++++++- crates/tss-submit/src/validate/issue.rs | 131 ++++ crates/tss-submit/src/validate/mod.rs | 70 +- 5 files changed, 921 insertions(+), 15 deletions(-) diff --git a/crates/tss-gui/src/view/domain_editor/validation.rs b/crates/tss-gui/src/view/domain_editor/validation.rs index d58e1128..29b14eb8 100644 --- a/crates/tss-gui/src/view/domain_editor/validation.rs +++ b/crates/tss-gui/src/view/domain_editor/validation.rs @@ -582,7 +582,13 @@ fn issue_category(issue: &Issue) -> &'static str { Issue::InvalidDate { .. } | Issue::TextTooLong { .. } => "Format", Issue::DataTypeMismatch { .. } => "Type", Issue::DuplicateSequence { .. } => "Consistency", - Issue::UsubjidNotInDm { .. } | Issue::ParentNotFound { .. } => "Cross Reference", + Issue::UsubjidNotInDm { .. } + | Issue::ParentNotFound { .. } + | Issue::InvalidRdomain { .. } + | Issue::RelsubNotInDm { .. } + | Issue::RelsubNotBidirectional { .. } + | Issue::RelspecInvalidParent { .. } + | Issue::RelrecInvalidReference { .. 
} => "Cross Reference", Issue::CtViolation { .. } => "Terminology", } } diff --git a/crates/tss-standards/src/sdtm_ig.rs b/crates/tss-standards/src/sdtm_ig.rs index 2a3f51e3..8df2657f 100644 --- a/crates/tss-standards/src/sdtm_ig.rs +++ b/crates/tss-standards/src/sdtm_ig.rs @@ -264,4 +264,42 @@ mod tests { } } } + + #[test] + fn test_special_purpose_and_relationship_domains() { + let domains = load().expect("load SDTM-IG"); + + // Verify CO (Comments) domain loads with correct class + let co = domains + .iter() + .find(|d| d.name == "CO") + .expect("CO domain should exist"); + assert_eq!(co.class, Some(SdtmDatasetClass::SpecialPurpose)); + assert!( + co.variables.len() >= 10, + "CO should have at least 10 variables" + ); + + // Verify RELREC (Related Records) domain + let relrec = domains + .iter() + .find(|d| d.name == "RELREC") + .expect("RELREC domain should exist"); + assert_eq!(relrec.class, Some(SdtmDatasetClass::Relationship)); + assert!(!relrec.variables.is_empty(), "RELREC should have variables"); + + // Verify RELSPEC (Related Specimens) domain + let relspec = domains + .iter() + .find(|d| d.name == "RELSPEC") + .expect("RELSPEC domain should exist"); + assert_eq!(relspec.class, Some(SdtmDatasetClass::Relationship)); + + // Verify RELSUB (Related Subjects) domain + let relsub = domains + .iter() + .find(|d| d.name == "RELSUB") + .expect("RELSUB domain should exist"); + assert_eq!(relsub.class, Some(SdtmDatasetClass::Relationship)); + } } diff --git a/crates/tss-submit/src/validate/checks/cross_domain.rs b/crates/tss-submit/src/validate/checks/cross_domain.rs index e53afa07..6503cd6c 100644 --- a/crates/tss-submit/src/validate/checks/cross_domain.rs +++ b/crates/tss-submit/src/validate/checks/cross_domain.rs @@ -3,11 +3,14 @@ //! Validates referential integrity across SDTM domains: //! - All USUBJIDs in non-DM domains must exist in DM //! - Parent record references (--SPID) must exist in parent domain +//! - CO/RELREC RDOMAIN references valid domains +//! 
- RELSUB RSUBJID exists in DM and relationships are bidirectional +//! - RELSPEC PARENT references valid REFID within subject //! //! These checks ensure data consistency across the submission package. use polars::prelude::DataFrame; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use super::super::column_reader::ColumnReader; use super::super::issue::Issue; @@ -78,6 +81,375 @@ pub fn check_usubjid_in_dm( } } +// ============================================================================= +// RDOMAIN VALIDATION (CO, RELREC) +// ============================================================================= + +/// Check that RDOMAIN values reference valid domains in the submission. +/// +/// # Arguments +/// * `domain_name` - Name of the domain being checked (e.g., "CO", "RELREC") +/// * `df` - DataFrame to check +/// * `valid_domains` - Set of valid domain codes in the submission +/// +/// # Returns +/// A vector of issues (empty if all RDOMAIN values are valid). 
+pub fn check_rdomain_valid(
+    domain_name: &str,
+    df: &DataFrame,
+    valid_domains: &HashSet<String>,
+) -> Vec<Issue> {
+    let reader = ColumnReader::new(df);
+
+    let Some(values) = reader.values("RDOMAIN") else {
+        return vec![];
+    };
+
+    let mut invalid_values = Vec::new();
+    let mut invalid_count = 0u64;
+
+    for (_, rdomain) in values {
+        let trimmed = rdomain.trim().to_uppercase();
+        // Skip empty values (RDOMAIN can be null for standalone comments)
+        if !trimmed.is_empty() && !valid_domains.contains(&trimmed) {
+            invalid_count += 1;
+            if invalid_values.len() < 5 && !invalid_values.contains(&trimmed) {
+                invalid_values.push(trimmed);
+            }
+        }
+    }
+
+    if invalid_count > 0 {
+        vec![Issue::InvalidRdomain {
+            domain: domain_name.to_string(),
+            invalid_count,
+            samples: invalid_values,
+        }]
+    } else {
+        vec![]
+    }
+}
+
+// =============================================================================
+// RELSUB VALIDATION
+// =============================================================================
+
+/// Check RELSUB domain for valid RSUBJID and bidirectional relationships.
+///
+/// Per SDTM-IG 8.7:
+/// - RSUBJID must be a USUBJID value present in DM
+/// - Relationships must be bidirectional (A→B requires B→A)
+///
+/// # Arguments
+/// * `relsub_df` - RELSUB DataFrame
+/// * `dm_subjects` - Set of valid USUBJIDs from DM
+///
+/// # Returns
+/// A vector of issues.
+pub fn check_relsub(relsub_df: &DataFrame, dm_subjects: &HashSet<String>) -> Vec<Issue> {
+    let mut issues = Vec::new();
+    let reader = ColumnReader::new(relsub_df);
+
+    // Check RSUBJID exists in DM
+    if let Some(rsubjid_values) = reader.values("RSUBJID") {
+        let mut missing_values = Vec::new();
+        let mut missing_count = 0u64;
+
+        for (_, rsubjid) in rsubjid_values {
+            let trimmed = rsubjid.trim();
+            if !trimmed.is_empty() && !dm_subjects.contains(trimmed) {
+                missing_count += 1;
+                if missing_values.len() < 5 && !missing_values.contains(&trimmed.to_string()) {
+                    missing_values.push(trimmed.to_string());
+                }
+            }
+        }
+
+        if missing_count > 0 {
+            issues.push(Issue::RelsubNotInDm {
+                missing_count,
+                samples: missing_values,
+            });
+        }
+    }
+
+    // Check bidirectional relationships
+    // Collect all (USUBJID, RSUBJID) pairs
+    let usubjid_values = reader.values("USUBJID");
+    let rsubjid_values = reader.values("RSUBJID");
+
+    if let (Some(usubjids), Some(rsubjids)) = (usubjid_values, rsubjid_values) {
+        let mut relationships: HashSet<(String, String)> = HashSet::new();
+
+        for ((_, usubjid), (_, rsubjid)) in usubjids.zip(rsubjids) {
+            let u = usubjid.trim().to_string();
+            let r = rsubjid.trim().to_string();
+            if !u.is_empty() && !r.is_empty() {
+                relationships.insert((u, r));
+            }
+        }
+
+        // Check for missing reciprocal relationships
+        let mut missing_reciprocal = Vec::new();
+        let mut missing_count = 0u64;
+
+        for (usubjid, rsubjid) in &relationships {
+            // The reciprocal would be (rsubjid, usubjid)
+            if !relationships.contains(&(rsubjid.clone(), usubjid.clone())) {
+                missing_count += 1;
+                if missing_reciprocal.len() < 5 {
+                    missing_reciprocal.push(format!("{}→{}", usubjid, rsubjid));
+                }
+            }
+        }
+
+        if missing_count > 0 {
+            issues.push(Issue::RelsubNotBidirectional {
+                missing_count,
+                samples: missing_reciprocal,
+            });
+        }
+    }
+
+    issues
+}
+
+// =============================================================================
+// RELSPEC VALIDATION
+//
============================================================================= + +/// Check RELSPEC PARENT references valid REFID within each subject. +/// +/// Per SDTM-IG 8.8: +/// - PARENT must reference an existing REFID for the same USUBJID +/// - LEVEL=1 specimens should have null PARENT +/// +/// # Arguments +/// * `relspec_df` - RELSPEC DataFrame +/// +/// # Returns +/// A vector of issues. +pub fn check_relspec(relspec_df: &DataFrame) -> Vec { + let reader = ColumnReader::new(relspec_df); + + // Build a map of USUBJID -> Set of REFIDs + let mut refids_by_subject: HashMap> = HashMap::new(); + + let usubjid_col = reader.values("USUBJID"); + let refid_col = reader.values("REFID"); + + if let (Some(usubjids), Some(refids)) = (usubjid_col, refid_col) { + for ((_, usubjid), (_, refid)) in usubjids.zip(refids) { + let u = usubjid.trim().to_string(); + let r = refid.trim().to_string(); + if !u.is_empty() && !r.is_empty() { + refids_by_subject.entry(u).or_default().insert(r); + } + } + } + + // Check PARENT references + let usubjid_col = reader.values("USUBJID"); + let parent_col = reader.values("PARENT"); + + let Some(parents) = parent_col else { + return vec![]; + }; + + let mut invalid_values = Vec::new(); + let mut invalid_count = 0u64; + + if let Some(usubjids) = usubjid_col { + for ((_, usubjid), (_, parent)) in usubjids.zip(parents) { + let u = usubjid.trim(); + let p = parent.trim(); + + // Skip empty PARENT (valid for LEVEL=1 collected specimens) + if p.is_empty() { + continue; + } + + // Check if PARENT exists in this subject's REFIDs + let valid_refids = refids_by_subject.get(u); + let parent_exists = valid_refids.is_some_and(|refs| refs.contains(p)); + + if !parent_exists { + invalid_count += 1; + if invalid_values.len() < 5 { + invalid_values.push(format!("{}:{}", u, p)); + } + } + } + } + + if invalid_count > 0 { + vec![Issue::RelspecInvalidParent { + invalid_count, + samples: invalid_values, + }] + } else { + vec![] + } +} + +// 
============================================================================= +// RELREC VALIDATION +// ============================================================================= + +/// Context for RELREC validation containing domain data. +pub struct RelrecContext<'a> { + /// Map of domain code -> (DataFrame, key_variable -> Set of key values) + pub domains: HashMap>)>, +} + +impl<'a> RelrecContext<'a> { + /// Build context from a list of (domain_name, DataFrame) pairs. + pub fn new(domain_list: &[(&str, &'a DataFrame)]) -> Self { + let mut domains = HashMap::new(); + + for (name, df) in domain_list { + let reader = ColumnReader::new(df); + let mut key_values: HashMap> = HashMap::new(); + + // Extract common key variables: --SEQ, --GRPID, --REFID, --LNKID + let domain_prefix = name.to_uppercase(); + let seq_var = format!("{}SEQ", domain_prefix); + let grpid_var = format!("{}GRPID", domain_prefix); + let refid_var = format!("{}REFID", domain_prefix); + let lnkid_var = format!("{}LNKID", domain_prefix); + + for var in [&seq_var, &grpid_var, &refid_var, &lnkid_var] { + if let Some(values) = reader.values(var) { + let mut set = HashSet::new(); + for (_, val) in values { + let trimmed = val.trim(); + if !trimmed.is_empty() { + set.insert(trimmed.to_string()); + } + } + if !set.is_empty() { + key_values.insert(var.clone(), set); + } + } + } + + // Also check generic variable names (VISITNUM, etc.) + for var in ["VISITNUM"] { + if let Some(values) = reader.values(var) { + let mut set = HashSet::new(); + for (_, val) in values { + let trimmed = val.trim(); + if !trimmed.is_empty() { + set.insert(trimmed.to_string()); + } + } + if !set.is_empty() { + key_values.insert(var.to_string(), set); + } + } + } + + domains.insert(name.to_uppercase(), (*df, key_values)); + } + + Self { domains } + } + + /// Check if a reference exists in the context. 
+ pub fn reference_exists(&self, rdomain: &str, idvar: &str, idvarval: &str) -> bool { + let domain_upper = rdomain.to_uppercase(); + let Some((_, key_values)) = self.domains.get(&domain_upper) else { + return false; + }; + + // Check if the IDVAR exists and contains the IDVARVAL + key_values + .get(idvar) + .is_some_and(|values| values.contains(idvarval)) + } +} + +/// Check RELREC references point to existing records. +/// +/// Per SDTM-IG 8.2: +/// - RDOMAIN + IDVAR + IDVARVAL should reference existing records +/// - For dataset relationships, USUBJID and IDVARVAL are null +/// +/// # Arguments +/// * `relrec_df` - RELREC DataFrame +/// * `context` - Context containing all domain data +/// +/// # Returns +/// A vector of issues grouped by RDOMAIN. +pub fn check_relrec(relrec_df: &DataFrame, context: &RelrecContext) -> Vec { + let reader = ColumnReader::new(relrec_df); + + let rdomain_col = reader.values("RDOMAIN"); + let idvar_col = reader.values("IDVAR"); + let idvarval_col = reader.values("IDVARVAL"); + let usubjid_col = reader.values("USUBJID"); + + let Some(rdomains) = rdomain_col else { + return vec![]; + }; + let Some(idvars) = idvar_col else { + return vec![]; + }; + let Some(idvarvals) = idvarval_col else { + return vec![]; + }; + + // Group invalid references by RDOMAIN + let mut invalid_by_domain: HashMap)> = HashMap::new(); + + // Handle case where USUBJID might be null (dataset-level relationships) + let usubjids: Vec = usubjid_col + .map(|col| col.map(|(_, v)| v.to_string()).collect()) + .unwrap_or_else(|| vec!["".to_string(); rdomains.size_hint().0]); + + for (((_, rdomain), (_, idvar)), ((_, idvarval), usubjid)) in + rdomains.zip(idvars).zip(idvarvals.zip(usubjids.iter())) + { + let rdomain = rdomain.trim(); + let idvar = idvar.trim(); + let idvarval = idvarval.trim(); + let usubjid = usubjid.trim(); + + // Skip dataset-level relationships (USUBJID and IDVARVAL are null) + if usubjid.is_empty() && idvarval.is_empty() { + continue; + } + + // Skip 
empty values + if rdomain.is_empty() || idvar.is_empty() || idvarval.is_empty() { + continue; + } + + // Check if reference exists + if !context.reference_exists(rdomain, idvar, idvarval) { + let entry = invalid_by_domain + .entry(rdomain.to_string()) + .or_insert((0, Vec::new())); + entry.0 += 1; + if entry.1.len() < 5 { + entry.1.push(format!("{}={}", idvar, idvarval)); + } + } + } + + // Convert to issues + invalid_by_domain + .into_iter() + .map( + |(rdomain, (count, samples))| Issue::RelrecInvalidReference { + rdomain, + invalid_count: count, + samples, + }, + ) + .collect() +} + #[cfg(test)] mod tests { use super::*; @@ -188,4 +560,319 @@ mod tests { _ => panic!("Expected UsubjidNotInDm issue"), } } + + // ========================================================================= + // RDOMAIN VALIDATION TESTS + // ========================================================================= + + #[test] + fn test_check_rdomain_valid_all_valid() { + let co = df! { + "USUBJID" => &["STUDY-001", "STUDY-002"], + "RDOMAIN" => &["AE", "LB"], + "COVAL" => &["Comment 1", "Comment 2"], + } + .unwrap(); + + let valid_domains: HashSet = ["DM", "AE", "LB", "CO"] + .into_iter() + .map(String::from) + .collect(); + + let issues = check_rdomain_valid("CO", &co, &valid_domains); + assert!(issues.is_empty()); + } + + #[test] + fn test_check_rdomain_valid_invalid_references() { + let co = df! 
{ + "USUBJID" => &["STUDY-001", "STUDY-002", "STUDY-003"], + "RDOMAIN" => &["AE", "XX", "YY"], + "COVAL" => &["Comment 1", "Comment 2", "Comment 3"], + } + .unwrap(); + + let valid_domains: HashSet = + ["DM", "AE", "LB"].into_iter().map(String::from).collect(); + + let issues = check_rdomain_valid("CO", &co, &valid_domains); + assert_eq!(issues.len(), 1); + + match &issues[0] { + Issue::InvalidRdomain { + domain, + invalid_count, + samples, + } => { + assert_eq!(domain, "CO"); + assert_eq!(*invalid_count, 2); + assert!(samples.contains(&"XX".to_string())); + assert!(samples.contains(&"YY".to_string())); + } + _ => panic!("Expected InvalidRdomain issue"), + } + } + + #[test] + fn test_check_rdomain_valid_empty_rdomain_allowed() { + // Empty RDOMAIN is valid for standalone comments + let co = df! { + "USUBJID" => &["STUDY-001"], + "RDOMAIN" => &[""], + "COVAL" => &["General study comment"], + } + .unwrap(); + + let valid_domains: HashSet = ["DM", "AE"].into_iter().map(String::from).collect(); + + let issues = check_rdomain_valid("CO", &co, &valid_domains); + assert!(issues.is_empty()); + } + + // ========================================================================= + // RELSUB VALIDATION TESTS + // ========================================================================= + + #[test] + fn test_check_relsub_all_valid() { + // Bidirectional twin relationship + let relsub = df! { + "USUBJID" => &["STUDY-001", "STUDY-002"], + "RSUBJID" => &["STUDY-002", "STUDY-001"], + "SREL" => &["TWIN", "TWIN"], + } + .unwrap(); + + let dm_subjects: HashSet = ["STUDY-001", "STUDY-002", "STUDY-003"] + .into_iter() + .map(String::from) + .collect(); + + let issues = check_relsub(&relsub, &dm_subjects); + assert!(issues.is_empty()); + } + + #[test] + fn test_check_relsub_rsubjid_not_in_dm() { + let relsub = df! 
{ + "USUBJID" => &["STUDY-001", "STUDY-002"], + "RSUBJID" => &["STUDY-999", "STUDY-001"], + "SREL" => &["TWIN", "TWIN"], + } + .unwrap(); + + let dm_subjects: HashSet = ["STUDY-001", "STUDY-002"] + .into_iter() + .map(String::from) + .collect(); + + let issues = check_relsub(&relsub, &dm_subjects); + + // Should have RelsubNotInDm issue + let rsubjid_issue = issues + .iter() + .find(|i| matches!(i, Issue::RelsubNotInDm { .. })); + assert!(rsubjid_issue.is_some()); + + match rsubjid_issue.unwrap() { + Issue::RelsubNotInDm { + missing_count, + samples, + } => { + assert_eq!(*missing_count, 1); + assert!(samples.contains(&"STUDY-999".to_string())); + } + _ => panic!("Expected RelsubNotInDm issue"), + } + } + + #[test] + fn test_check_relsub_not_bidirectional() { + // Unidirectional relationship - missing reciprocal + let relsub = df! { + "USUBJID" => &["STUDY-001"], + "RSUBJID" => &["STUDY-002"], + "SREL" => &["PARENT"], + } + .unwrap(); + + let dm_subjects: HashSet = ["STUDY-001", "STUDY-002"] + .into_iter() + .map(String::from) + .collect(); + + let issues = check_relsub(&relsub, &dm_subjects); + + let bidirectional_issue = issues + .iter() + .find(|i| matches!(i, Issue::RelsubNotBidirectional { .. })); + assert!(bidirectional_issue.is_some()); + + match bidirectional_issue.unwrap() { + Issue::RelsubNotBidirectional { + missing_count, + samples, + } => { + assert_eq!(*missing_count, 1); + assert!(samples.contains(&"STUDY-001→STUDY-002".to_string())); + } + _ => panic!("Expected RelsubNotBidirectional issue"), + } + } + + // ========================================================================= + // RELSPEC VALIDATION TESTS + // ========================================================================= + + #[test] + fn test_check_relspec_valid_parent_chain() { + // Parent specimen → Child specimen + let relspec = df! 
{ + "USUBJID" => &["STUDY-001", "STUDY-001"], + "REFID" => &["SPEC-001", "SPEC-002"], + "PARENT" => &["", "SPEC-001"], // SPEC-002 derived from SPEC-001 + "LEVEL" => &[1i32, 2i32], + } + .unwrap(); + + let issues = check_relspec(&relspec); + assert!(issues.is_empty()); + } + + #[test] + fn test_check_relspec_invalid_parent() { + // PARENT references non-existent REFID + let relspec = df! { + "USUBJID" => &["STUDY-001", "STUDY-001"], + "REFID" => &["SPEC-001", "SPEC-002"], + "PARENT" => &["", "SPEC-999"], // SPEC-999 doesn't exist + "LEVEL" => &[1i32, 2i32], + } + .unwrap(); + + let issues = check_relspec(&relspec); + assert_eq!(issues.len(), 1); + + match &issues[0] { + Issue::RelspecInvalidParent { + invalid_count, + samples, + } => { + assert_eq!(*invalid_count, 1); + assert!(samples.contains(&"STUDY-001:SPEC-999".to_string())); + } + _ => panic!("Expected RelspecInvalidParent issue"), + } + } + + #[test] + fn test_check_relspec_parent_from_different_subject() { + // PARENT should be within same subject + let relspec = df! { + "USUBJID" => &["STUDY-001", "STUDY-002"], + "REFID" => &["SPEC-001", "SPEC-002"], + "PARENT" => &["", "SPEC-001"], // SPEC-001 is from STUDY-001, not STUDY-002 + "LEVEL" => &[1i32, 2i32], + } + .unwrap(); + + let issues = check_relspec(&relspec); + assert_eq!(issues.len(), 1); // Should flag cross-subject reference + } + + // ========================================================================= + // RELREC VALIDATION TESTS + // ========================================================================= + + #[test] + fn test_check_relrec_valid_references() { + let relrec = df! { + "STUDYID" => &["STUDY", "STUDY"], + "USUBJID" => &["STUDY-001", "STUDY-001"], + "RDOMAIN" => &["AE", "CM"], + "IDVAR" => &["AESEQ", "CMSEQ"], + "IDVARVAL" => &["1", "1"], + "RELID" => &["REL1", "REL1"], + } + .unwrap(); + + let ae = df! { + "USUBJID" => &["STUDY-001"], + "AESEQ" => &["1"], + "AETERM" => &["HEADACHE"], + } + .unwrap(); + + let cm = df! 
{ + "USUBJID" => &["STUDY-001"], + "CMSEQ" => &["1"], + "CMTRT" => &["ASPIRIN"], + } + .unwrap(); + + let domains: Vec<(&str, &DataFrame)> = vec![("AE", &ae), ("CM", &cm)]; + let context = RelrecContext::new(&domains); + + let issues = check_relrec(&relrec, &context); + assert!(issues.is_empty()); + } + + #[test] + fn test_check_relrec_invalid_reference() { + let relrec = df! { + "STUDYID" => &["STUDY"], + "USUBJID" => &["STUDY-001"], + "RDOMAIN" => &["AE"], + "IDVAR" => &["AESEQ"], + "IDVARVAL" => &["999"], // Doesn't exist + "RELID" => &["REL1"], + } + .unwrap(); + + let ae = df! { + "USUBJID" => &["STUDY-001"], + "AESEQ" => &["1"], + "AETERM" => &["HEADACHE"], + } + .unwrap(); + + let domains: Vec<(&str, &DataFrame)> = vec![("AE", &ae)]; + let context = RelrecContext::new(&domains); + + let issues = check_relrec(&relrec, &context); + assert_eq!(issues.len(), 1); + + match &issues[0] { + Issue::RelrecInvalidReference { + rdomain, + invalid_count, + samples, + } => { + assert_eq!(rdomain, "AE"); + assert_eq!(*invalid_count, 1); + assert!(samples.contains(&"AESEQ=999".to_string())); + } + _ => panic!("Expected RelrecInvalidReference issue"), + } + } + + #[test] + fn test_check_relrec_dataset_level_relationship() { + // Dataset-level relationships have null USUBJID and IDVARVAL + let relrec = df! 
{ + "STUDYID" => &["STUDY"], + "USUBJID" => &[""], // Null for dataset-level + "RDOMAIN" => &["SUPPAE"], + "IDVAR" => &["QNAM"], + "IDVARVAL" => &[""], // Null for dataset-level + "RELID" => &["REL1"], + } + .unwrap(); + + let domains: Vec<(&str, &DataFrame)> = vec![]; + let context = RelrecContext::new(&domains); + + let issues = check_relrec(&relrec, &context); + assert!(issues.is_empty()); // Should skip dataset-level relationships + } } diff --git a/crates/tss-submit/src/validate/issue.rs b/crates/tss-submit/src/validate/issue.rs index 1beec5a0..9aae2a0b 100644 --- a/crates/tss-submit/src/validate/issue.rs +++ b/crates/tss-submit/src/validate/issue.rs @@ -114,6 +114,35 @@ pub enum Issue { missing_count: u64, samples: Vec, }, + + // Special domain cross-reference checks (#38) + /// RDOMAIN references a domain that doesn't exist in the submission + InvalidRdomain { + domain: String, + invalid_count: u64, + samples: Vec, + }, + /// RSUBJID values not found in DM domain + RelsubNotInDm { + missing_count: u64, + samples: Vec, + }, + /// RELSUB relationship is not bidirectional (missing reciprocal record) + RelsubNotBidirectional { + missing_count: u64, + samples: Vec, + }, + /// RELSPEC PARENT references non-existent REFID + RelspecInvalidParent { + invalid_count: u64, + samples: Vec, + }, + /// RELREC references a record that doesn't exist + RelrecInvalidReference { + rdomain: String, + invalid_count: u64, + samples: Vec, + }, } impl Issue { @@ -132,6 +161,12 @@ impl Issue { // Cross-domain issues use USUBJID or the specific variable Issue::UsubjidNotInDm { .. } => "USUBJID", Issue::ParentNotFound { variable, .. } => variable, + // Special domain cross-reference issues + Issue::InvalidRdomain { .. } => "RDOMAIN", + Issue::RelsubNotInDm { .. } => "RSUBJID", + Issue::RelsubNotBidirectional { .. } => "SREL", + Issue::RelspecInvalidParent { .. } => "PARENT", + Issue::RelrecInvalidReference { .. 
} => "IDVARVAL", } } @@ -153,6 +188,12 @@ impl Issue { Issue::CtViolation { total_invalid, .. } => Some(*total_invalid), Issue::UsubjidNotInDm { missing_count, .. } => Some(*missing_count), Issue::ParentNotFound { missing_count, .. } => Some(*missing_count), + // Special domain cross-reference issues + Issue::InvalidRdomain { invalid_count, .. } => Some(*invalid_count), + Issue::RelsubNotInDm { missing_count, .. } => Some(*missing_count), + Issue::RelsubNotBidirectional { missing_count, .. } => Some(*missing_count), + Issue::RelspecInvalidParent { invalid_count, .. } => Some(*invalid_count), + Issue::RelrecInvalidReference { invalid_count, .. } => Some(*invalid_count), } } @@ -176,6 +217,12 @@ impl Issue { // Cross-domain reference checks Issue::UsubjidNotInDm { .. } => Category::CrossReference, Issue::ParentNotFound { .. } => Category::CrossReference, + // Special domain cross-reference checks + Issue::InvalidRdomain { .. } => Category::CrossReference, + Issue::RelsubNotInDm { .. } => Category::CrossReference, + Issue::RelsubNotBidirectional { .. } => Category::CrossReference, + Issue::RelspecInvalidParent { .. } => Category::CrossReference, + Issue::RelrecInvalidReference { .. } => Category::CrossReference, } } @@ -190,6 +237,12 @@ impl Issue { // Cross-domain reference issues are errors (data integrity) Issue::UsubjidNotInDm { .. } => Severity::Error, Issue::ParentNotFound { .. } => Severity::Error, + // Special domain cross-reference issues + Issue::InvalidRdomain { .. } => Severity::Error, + Issue::RelsubNotInDm { .. } => Severity::Error, + Issue::RelsubNotBidirectional { .. } => Severity::Warning, + Issue::RelspecInvalidParent { .. } => Severity::Error, + Issue::RelrecInvalidReference { .. 
} => Severity::Error, _ => Severity::Error, } } @@ -348,6 +401,84 @@ impl Issue { variable, missing_count, parent_domain, sample_str ) } + + // Special domain cross-reference issues + Issue::InvalidRdomain { + domain, + invalid_count, + samples, + } => { + let sample_str = if samples.is_empty() { + String::new() + } else { + format!(": {}", samples.join(", ")) + }; + format!( + "{} domain has {} RDOMAIN values referencing non-existent domains{}", + domain, invalid_count, sample_str + ) + } + + Issue::RelsubNotInDm { + missing_count, + samples, + } => { + let sample_str = if samples.is_empty() { + String::new() + } else { + format!(" (e.g., {})", samples.join(", ")) + }; + format!( + "RELSUB has {} RSUBJID values not found in DM{}", + missing_count, sample_str + ) + } + + Issue::RelsubNotBidirectional { + missing_count, + samples, + } => { + let sample_str = if samples.is_empty() { + String::new() + } else { + format!(" (e.g., {})", samples.join(", ")) + }; + format!( + "RELSUB has {} relationships without reciprocal records{}", + missing_count, sample_str + ) + } + + Issue::RelspecInvalidParent { + invalid_count, + samples, + } => { + let sample_str = if samples.is_empty() { + String::new() + } else { + format!(" (e.g., {})", samples.join(", ")) + }; + format!( + "RELSPEC has {} PARENT values referencing non-existent REFID{}", + invalid_count, sample_str + ) + } + + Issue::RelrecInvalidReference { + rdomain, + invalid_count, + samples, + } => { + let sample_str = if samples.is_empty() { + String::new() + } else { + format!(" (e.g., {})", samples.join(", ")) + }; + format!( + "RELREC has {} references to non-existent records in {}{}", + invalid_count, rdomain, sample_str + ) + } } } } diff --git a/crates/tss-submit/src/validate/mod.rs b/crates/tss-submit/src/validate/mod.rs index fb0d135a..0d98efab 100644 --- a/crates/tss-submit/src/validate/mod.rs +++ b/crates/tss-submit/src/validate/mod.rs @@ -33,7 +33,7 @@ pub mod rules; mod util; use polars::prelude::DataFrame; 
-use std::collections::BTreeSet; +use std::collections::{BTreeSet, HashSet}; use tss_standards::SdtmDomain; use tss_standards::TerminologyRegistry; @@ -87,6 +87,10 @@ pub fn validate_domain_with_not_collected( /// /// Checks that: /// - All USUBJIDs in non-DM domains exist in the DM domain +/// - RDOMAIN values in CO/RELREC reference valid domains +/// - RELSUB RSUBJID exists in DM and relationships are bidirectional +/// - RELSPEC PARENT references valid REFID within subject +/// - RELREC references point to existing records /// /// # Arguments /// * `domains` - List of (domain_name, DataFrame) pairs @@ -115,17 +119,57 @@ pub fn validate_cross_domain(domains: &[(&str, &DataFrame)]) -> Vec<(String, Vec ); } - // Check each non-DM domain - domains + // Build set of valid domain codes for RDOMAIN validation + let valid_domains: HashSet = domains .iter() - .filter(|(name, _)| *name != "DM") - .filter_map(|(name, df)| { - let issues = checks::cross_domain::check_usubjid_in_dm(name, df, &dm_subjects); - if issues.is_empty() { - None - } else { - Some((name.to_string(), issues)) - } - }) - .collect() + .map(|(name, _)| name.to_uppercase()) + .collect(); + + let mut results = Vec::new(); + + // Check each domain + for (name, df) in domains { + let name_upper = name.to_uppercase(); + let mut domain_issues = Vec::new(); + + // Skip DM for USUBJID check (it's the reference) + if name_upper != "DM" { + domain_issues.extend(checks::cross_domain::check_usubjid_in_dm( + name, + df, + &dm_subjects, + )); + } + + // RDOMAIN validation for CO and RELREC + if name_upper == "CO" || name_upper == "RELREC" { + domain_issues.extend(checks::cross_domain::check_rdomain_valid( + name, + df, + &valid_domains, + )); + } + + // RELSUB-specific validation + if name_upper == "RELSUB" { + domain_issues.extend(checks::cross_domain::check_relsub(df, &dm_subjects)); + } + + // RELSPEC-specific validation + if name_upper == "RELSPEC" { + domain_issues.extend(checks::cross_domain::check_relspec(df)); 
+ } + + // RELREC-specific validation (record references) + if name_upper == "RELREC" { + let context = checks::cross_domain::RelrecContext::new(domains); + domain_issues.extend(checks::cross_domain::check_relrec(df, &context)); + } + + if !domain_issues.is_empty() { + results.push((name.to_string(), domain_issues)); + } + } + + results } From b817e85f5f4211279d0a640b36801d309d846899 Mon Sep 17 00:00:00 2001 From: Ruben Talstra <14235001+rubentalstra@users.noreply.github.com> Date: Sun, 25 Jan 2026 01:31:54 +0100 Subject: [PATCH 7/9] feat: refactor domain handling to use `let else` syntax for improved readability --- crates/tss-gui/src/app/mod.rs | 57 +------- crates/tss-gui/src/app/subscription.rs | 147 ++++++++++++++++++++ crates/tss-gui/src/handler/domain_editor.rs | 39 +++--- crates/tss-gui/src/handler/export.rs | 5 +- 4 files changed, 172 insertions(+), 76 deletions(-) create mode 100644 crates/tss-gui/src/app/subscription.rs diff --git a/crates/tss-gui/src/app/mod.rs b/crates/tss-gui/src/app/mod.rs index 4dfa33f8..c7cf0e8c 100644 --- a/crates/tss-gui/src/app/mod.rs +++ b/crates/tss-gui/src/app/mod.rs @@ -16,12 +16,12 @@ // Submodules - handlers are organized by category in handler/ mod handler; +mod subscription; pub mod util; // Re-export utility functions for internal use use util::load_app_icon; -use iced::keyboard; use iced::widget::container; use iced::window; use iced::{Element, Size, Subscription, Task, Theme}; @@ -680,58 +680,11 @@ impl App { } /// Subscribe to runtime events. + /// + /// Delegates to [`subscription::create_subscription`] which manages all + /// application subscriptions in a centralized module. pub fn subscription(&self) -> Subscription { - use iced::{system, time}; - use std::time::Duration; - - // Keyboard events - let keyboard_sub = keyboard::listen().map(|event| match event { - keyboard::Event::KeyPressed { key, modifiers, .. 
} => { - Message::KeyPressed(key, modifiers) - } - _ => Message::Noop, - }); - - // System theme changes (for ThemeMode::System) - let system_theme_sub = system::theme_changes().map(Message::SystemThemeChanged); - - // Native menu event polling (macOS only, polls every 50ms for responsiveness) - #[cfg(target_os = "macos")] - let menu_sub = crate::menu::menu_subscription().map(|action| match action { - Some(a) => Message::MenuAction(a), - None => Message::Noop, - }); - - #[cfg(not(target_os = "macos"))] - let menu_sub = Subscription::none(); - - // Window close events (for cleaning up dialog windows) - let window_sub = window::close_requests().map(Message::DialogWindowClosed); - - // Toast auto-dismiss timer (5 seconds) - let toast_sub = if self.state.toast.is_some() { - time::every(Duration::from_secs(5)) - .map(|_| Message::Toast(crate::message::ToastMessage::Dismiss)) - } else { - Subscription::none() - }; - - // Auto-save timer (polls every 500ms to check if auto-save should trigger) - // The actual save only happens if the dirty tracker indicates it should - let auto_save_sub = if self.state.auto_save_config.enabled && self.state.study.is_some() { - time::every(Duration::from_millis(500)).map(|_| Message::AutoSaveTick) - } else { - Subscription::none() - }; - - Subscription::batch([ - keyboard_sub, - system_theme_sub, - menu_sub, - window_sub, - toast_sub, - auto_save_sub, - ]) + subscription::create_subscription(&self.state) } } diff --git a/crates/tss-gui/src/app/subscription.rs b/crates/tss-gui/src/app/subscription.rs new file mode 100644 index 00000000..8c701ef4 --- /dev/null +++ b/crates/tss-gui/src/app/subscription.rs @@ -0,0 +1,147 @@ +//! Application subscriptions. +//! +//! This module centralizes all Iced subscriptions for the application. +//! Subscriptions are reactive event sources that run alongside the app. +//! +//! # Subscription Overview +//! +//! | Subscription | Interval | Condition | Purpose | +//! 
|--------------|----------|-----------|---------| +//! | Keyboard | Continuous | Always | Global keyboard shortcuts | +//! | System Theme | Continuous | Always | Track OS theme changes | +//! | Menu (macOS) | 100ms poll | Always | Native menu bar events | +//! | Window Close | Continuous | Always | Dialog window cleanup | +//! | Toast Dismiss | 5 seconds | Toast visible | Auto-dismiss notifications | +//! | Auto-Save | 500ms poll | Study loaded + enabled | Debounced project saves | +//! +//! # Architecture +//! +//! Subscriptions are batched together in `create_subscription()` and run +//! concurrently. Conditional subscriptions (toast, auto-save) return +//! `Subscription::none()` when their condition is not met, avoiding +//! unnecessary polling. + +use std::time::Duration; + +use iced::Subscription; +use iced::keyboard; +use iced::window; +use iced::{system, time}; + +use crate::message::Message; +use crate::state::AppState; + +/// Create all application subscriptions. +/// +/// This batches together all event subscriptions: +/// - Keyboard events for shortcuts +/// - System theme changes for automatic theme switching +/// - Native menu events (macOS only) +/// - Window close events for dialog cleanup +/// - Toast auto-dismiss timer (conditional) +/// - Auto-save timer (conditional) +pub fn create_subscription(state: &AppState) -> Subscription { + Subscription::batch([ + keyboard_subscription(), + system_theme_subscription(), + menu_subscription(), + window_close_subscription(), + toast_subscription(state), + auto_save_subscription(state), + ]) +} + +/// Keyboard event subscription. +/// +/// Listens for all key press events to handle global shortcuts. +/// Runs continuously without polling. +fn keyboard_subscription() -> Subscription { + keyboard::listen().map(|event| match event { + keyboard::Event::KeyPressed { key, modifiers, .. } => Message::KeyPressed(key, modifiers), + _ => Message::Noop, + }) +} + +/// System theme change subscription. 
+/// +/// Monitors OS theme changes (light/dark) for ThemeMode::System. +/// Runs continuously without polling. +fn system_theme_subscription() -> Subscription { + system::theme_changes().map(Message::SystemThemeChanged) +} + +/// Native menu event subscription (macOS only). +/// +/// Polls the menu event channel every 100ms. This interval balances +/// responsiveness with efficiency: +/// - Menu clicks are human-initiated (~200-300ms reaction time) +/// - A background forwarder thread captures events immediately +/// - We're just polling a local mpsc channel, not the native event system +/// +/// On non-macOS platforms, returns an empty subscription. +fn menu_subscription() -> Subscription { + #[cfg(target_os = "macos")] + { + crate::menu::menu_subscription().map(|action| match action { + Some(a) => Message::MenuAction(a), + None => Message::Noop, + }) + } + + #[cfg(not(target_os = "macos"))] + { + Subscription::none() + } +} + +/// Window close event subscription. +/// +/// Listens for window close requests to clean up dialog window state. +/// Runs continuously without polling. +fn window_close_subscription() -> Subscription { + window::close_requests().map(Message::DialogWindowClosed) +} + +/// Toast auto-dismiss subscription. +/// +/// When a toast notification is visible, polls every 5 seconds to +/// trigger auto-dismissal. Returns no subscription when no toast exists. +/// +/// # Conditional Behavior +/// - Active: When `state.toast.is_some()` +/// - Inactive: When no toast is displayed +fn toast_subscription(state: &AppState) -> Subscription { + if state.toast.is_some() { + time::every(Duration::from_secs(5)) + .map(|_| Message::Toast(crate::message::ToastMessage::Dismiss)) + } else { + Subscription::none() + } +} + +/// Auto-save subscription. +/// +/// Polls every 500ms to check if an auto-save should trigger. The actual +/// save only occurs if the dirty tracker indicates changes need saving. 
+/// +/// # Conditional Behavior +/// - Active: When auto-save is enabled AND a study is loaded +/// - Inactive: When auto-save is disabled OR no study is loaded +/// +/// The 500ms interval provides a balance between: +/// - Responsiveness (saves happen within 500ms of idle threshold) +/// - Efficiency (only 2 checks per second) +fn auto_save_subscription(state: &AppState) -> Subscription { + if state.auto_save_config.enabled && state.study.is_some() { + time::every(Duration::from_millis(500)).map(|_| Message::AutoSaveTick) + } else { + Subscription::none() + } +} + +#[cfg(test)] +mod tests { + // Note: Subscription testing requires an Iced runtime, which is not + // available in unit tests. Integration tests should verify subscription + // behavior through the full application. +} diff --git a/crates/tss-gui/src/handler/domain_editor.rs b/crates/tss-gui/src/handler/domain_editor.rs index aef9ab85..01449e7f 100644 --- a/crates/tss-gui/src/handler/domain_editor.rs +++ b/crates/tss-gui/src/handler/domain_editor.rs @@ -60,9 +60,8 @@ impl MessageHandler for DomainEditorHandler { /// Trigger automatic preview rebuild after mapping changes. /// Returns a Task that computes the preview in the background. 
fn trigger_preview_rebuild(state: &AppState, domain_code: &str) -> Task { - let domain = match state.study.as_ref().and_then(|s| s.domain(domain_code)) { - Some(d) => d, - None => return Task::none(), + let Some(domain) = state.study.as_ref().and_then(|s| s.domain(domain_code)) else { + return Task::none(); }; let input = PreviewInput { @@ -87,10 +86,10 @@ fn trigger_preview_rebuild(state: &AppState, domain_code: &str) -> Task fn handle_mapping_message(state: &mut AppState, msg: MappingMessage) -> Task { // Get current domain code - let domain_code = match &state.view { - ViewState::DomainEditor(editor) => editor.domain.clone(), - _ => return Task::none(), + let ViewState::DomainEditor(editor) = &state.view else { + return Task::none(); }; + let domain_code = editor.domain.clone(); match msg { MappingMessage::VariableSelected(idx) => { @@ -338,16 +337,15 @@ fn handle_normalization_message(state: &mut AppState, msg: NormalizationMessage) // ============================================================================= fn handle_validation_message(state: &mut AppState, msg: ValidationMessage) -> Task { - let domain_code = match &state.view { - ViewState::DomainEditor(editor) => editor.domain.clone(), - _ => return Task::none(), + let ViewState::DomainEditor(editor) = &state.view else { + return Task::none(); }; + let domain_code = editor.domain.clone(); match msg { ValidationMessage::RefreshValidation => { - let domain = match state.study.as_ref().and_then(|s| s.domain(&domain_code)) { - Some(d) => d, - None => return Task::none(), + let Some(domain) = state.study.as_ref().and_then(|s| s.domain(&domain_code)) else { + return Task::none(); }; let df = match &state.view { @@ -434,10 +432,10 @@ fn handle_validation_message(state: &mut AppState, msg: ValidationMessage) -> Ta #[allow(clippy::needless_pass_by_value)] fn handle_preview_message(state: &mut AppState, msg: PreviewMessage) -> Task { - let domain_code = match &state.view { - ViewState::DomainEditor(editor) => 
editor.domain.clone(), - _ => return Task::none(), + let ViewState::DomainEditor(editor) = &state.view else { + return Task::none(); }; + let domain_code = editor.domain.clone(); match msg { PreviewMessage::GoToPage(page) => { @@ -473,9 +471,8 @@ fn handle_preview_message(state: &mut AppState, msg: PreviewMessage) -> Task { - let domain = match state.study.as_ref().and_then(|s| s.domain(&domain_code)) { - Some(d) => d, - None => return Task::none(), + let Some(domain) = state.study.as_ref().and_then(|s| s.domain(&domain_code)) else { + return Task::none(); }; // Mark as rebuilding @@ -506,10 +503,10 @@ fn handle_preview_message(state: &mut AppState, msg: PreviewMessage) -> Task Task { - let domain_code = match &state.view { - ViewState::DomainEditor(editor) => editor.domain.clone(), - _ => return Task::none(), + let ViewState::DomainEditor(editor) = &state.view else { + return Task::none(); }; + let domain_code = editor.domain.clone(); match msg { SuppMessage::ColumnSelected(col_name) => { diff --git a/crates/tss-gui/src/handler/export.rs b/crates/tss-gui/src/handler/export.rs index 1bd0e88b..616bab40 100644 --- a/crates/tss-gui/src/handler/export.rs +++ b/crates/tss-gui/src/handler/export.rs @@ -170,9 +170,8 @@ fn start_export(state: &mut AppState) -> Task { } // Get study data - let study = match &state.study { - Some(s) => s, - None => return Task::none(), + let Some(study) = &state.study else { + return Task::none(); }; let study_id = study.study_id.clone(); From 48da04df9d54815145e56cee5f8363bdbfdc2caa Mon Sep 17 00:00:00 2001 From: Ruben Talstra <14235001+rubentalstra@users.noreply.github.com> Date: Sun, 25 Jan 2026 11:26:00 +0100 Subject: [PATCH 8/9] fix: update documentation links and improve comment formatting for clarity --- crates/tss-gui/src/app/mod.rs | 2 +- crates/tss-standards/src/ct/loader.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/tss-gui/src/app/mod.rs b/crates/tss-gui/src/app/mod.rs index c7cf0e8c..92950f70 
100644 --- a/crates/tss-gui/src/app/mod.rs +++ b/crates/tss-gui/src/app/mod.rs @@ -681,7 +681,7 @@ impl App { /// Subscribe to runtime events. /// - /// Delegates to [`subscription::create_subscription`] which manages all + /// Delegates to `subscription::create_subscription` which manages all /// application subscriptions in a centralized module. pub fn subscription(&self) -> Subscription { subscription::create_subscription(&self.state) diff --git a/crates/tss-standards/src/ct/loader.rs b/crates/tss-standards/src/ct/loader.rs index dfd40df7..71db5a72 100644 --- a/crates/tss-standards/src/ct/loader.rs +++ b/crates/tss-standards/src/ct/loader.rs @@ -81,7 +81,7 @@ impl std::fmt::Display for CtVersion { /// let registry = ct::load(CtVersion::default(), Some("SEND"))?; /// ``` /// TODO: i think we should refactor and rewrite the whole logic and every file that ahs to do with dataset and use ENUM's so we can use these ENUM's troughout the whole codebase! i think this idead is way better right? no custom strings or using SOME() //// Foundational: CDISC Foundational Standards are the basis of a complete suite of data standards, enhancing the quality, efficiency and cost effectiveness of clinical research processes from beginning to end. Foundational Standards focus on the core principles for defining data standards and include models, domains and specifications for data representation. 
-/// https://www.cdisc.org/standards/foundational +/// pub fn load(version: CtVersion, primary_set: Option<&str>) -> Result { let mut registry = TerminologyRegistry::new(); From 64791bd3a17c84c479b1f62444dd4a875919600c Mon Sep 17 00:00:00 2001 From: Ruben Talstra <14235001+rubentalstra@users.noreply.github.com> Date: Sun, 25 Jan 2026 11:31:11 +0100 Subject: [PATCH 9/9] feat: update version to 0.0.5-alpha across all relevant files --- .zenodo.json | 2 +- CITATION.cff | 4 ++-- CLAUDE.md | 2 +- Cargo.lock | 14 +++++++------- Cargo.toml | 2 +- ...bentalstra.trial-submission-studio.metainfo.xml | 2 +- codemeta.json | 6 +++--- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.zenodo.json b/.zenodo.json index 1c89cde2..39e4337e 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -1,7 +1,7 @@ { "title": "Trial Submission Studio", "description": "Trial Submission Studio is an open-source desktop application that transforms clinical trial data into FDA-compliant CDISC formats. It provides multi-format output support (XPT V5/V8, Dataset-XML, Define-XML 2.1), intelligent fuzzy column matching with confidence scores, built-in CDISC Controlled Terminology validation, and cross-platform native applications for macOS, Windows, and Linux. 
All standards are embedded for offline use.", - "version": "0.0.4-alpha", + "version": "0.0.5-alpha", "upload_type": "software", "access_right": "open", "license": "mit", diff --git a/CITATION.cff b/CITATION.cff index 2c6d51e0..c7b8e4ec 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -44,5 +44,5 @@ keywords: - desktop-application - cross-platform license: MIT -version: '0.0.4-alpha' -date-released: '2026-01-24' +version: '0.0.5-alpha' +date-released: '2026-01-25' diff --git a/CLAUDE.md b/CLAUDE.md index 63dfc2bc..424c1864 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -69,7 +69,7 @@ For feature development, ALWAYS start with `/feature-dev` to gather requirements Trial Submission Studio transforms clinical trial source data (CSV) into FDA-compliant CDISC formats (SDTM, ADaM, SEND). It's a cross-platform desktop application written in Rust using the Iced GUI framework. -**Status**: Alpha software (v0.0.4-alpha). Not for production regulatory submissions. +**Status**: Alpha software (v0.0.5-alpha). Not for production regulatory submissions. 
--- diff --git a/Cargo.lock b/Cargo.lock index e351b2bf..0f48002a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6668,7 +6668,7 @@ checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "tss-gui" -version = "0.0.4-alpha" +version = "0.0.5-alpha" dependencies = [ "chrono", "directories", @@ -6700,7 +6700,7 @@ dependencies = [ [[package]] name = "tss-ingest" -version = "0.0.4-alpha" +version = "0.0.5-alpha" dependencies = [ "encoding_rs", "polars", @@ -6713,7 +6713,7 @@ dependencies = [ [[package]] name = "tss-persistence" -version = "0.0.4-alpha" +version = "0.0.5-alpha" dependencies = [ "chrono", "hex", @@ -6730,7 +6730,7 @@ dependencies = [ [[package]] name = "tss-standards" -version = "0.0.4-alpha" +version = "0.0.5-alpha" dependencies = [ "csv", "polars", @@ -6741,7 +6741,7 @@ dependencies = [ [[package]] name = "tss-submit" -version = "0.0.4-alpha" +version = "0.0.5-alpha" dependencies = [ "anyhow", "chrono", @@ -6758,7 +6758,7 @@ dependencies = [ [[package]] name = "tss-updater" -version = "0.0.4-alpha" +version = "0.0.5-alpha" dependencies = [ "async-stream", "chrono", @@ -6779,7 +6779,7 @@ dependencies = [ [[package]] name = "tss-updater-helper" -version = "0.0.4-alpha" +version = "0.0.5-alpha" dependencies = [ "chrono", "serde", diff --git a/Cargo.toml b/Cargo.toml index 9445977b..3621bf91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ members = [ resolver = "3" [workspace.package] -version = "0.0.4-alpha" +version = "0.0.5-alpha" edition = "2024" rust-version = "1.92" license = "MIT" diff --git a/assets/linux/io.github.rubentalstra.trial-submission-studio.metainfo.xml b/assets/linux/io.github.rubentalstra.trial-submission-studio.metainfo.xml index 041ed85a..8a959340 100644 --- a/assets/linux/io.github.rubentalstra.trial-submission-studio.metainfo.xml +++ b/assets/linux/io.github.rubentalstra.trial-submission-studio.metainfo.xml @@ -81,7 +81,7 @@ - +

Alpha release with core SDTM transformation features:

    diff --git a/codemeta.json b/codemeta.json index d81cd5b9..576dc3ce 100644 --- a/codemeta.json +++ b/codemeta.json @@ -4,7 +4,7 @@ "@id": "https://doi.org/10.5281/zenodo.18148150", "name": "Trial Submission Studio", "description": "Trial Submission Studio is an open-source desktop application that transforms clinical trial data into FDA-compliant CDISC formats. It provides multi-format output support (XPT V5/V8, Dataset-XML, Define-XML 2.1), intelligent fuzzy column matching with confidence scores, built-in CDISC Controlled Terminology validation, and cross-platform native applications for macOS, Windows, and Linux. All standards are embedded for offline use.", - "version": "0.0.4-alpha", + "version": "0.0.5-alpha", "identifier": [ { "@type": "PropertyValue", @@ -47,13 +47,13 @@ "continuousIntegration": "https://github.com/rubentalstra/trial-submission-studio/actions", "issueTracker": "https://github.com/rubentalstra/trial-submission-studio/issues", "dateCreated": "2025-12-12", - "dateModified": "2026-01-24", + "dateModified": "2026-01-25", "datePublished": "2026-01-05", "programmingLanguage": [ { "@type": "ComputerLanguage", "name": "Rust", - "version": "0.0.4-alpha", + "version": "0.0.5-alpha", "url": "https://www.rust-lang.org/" } ],