Alpha release with core SDTM transformation features:
diff --git a/codemeta.json b/codemeta.json
index d81cd5b9..576dc3ce 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -4,7 +4,7 @@
"@id": "https://doi.org/10.5281/zenodo.18148150",
"name": "Trial Submission Studio",
"description": "Trial Submission Studio is an open-source desktop application that transforms clinical trial data into FDA-compliant CDISC formats. It provides multi-format output support (XPT V5/V8, Dataset-XML, Define-XML 2.1), intelligent fuzzy column matching with confidence scores, built-in CDISC Controlled Terminology validation, and cross-platform native applications for macOS, Windows, and Linux. All standards are embedded for offline use.",
- "version": "0.0.4-alpha",
+ "version": "0.0.5-alpha",
"identifier": [
{
"@type": "PropertyValue",
@@ -47,13 +47,13 @@
"continuousIntegration": "https://github.com/rubentalstra/trial-submission-studio/actions",
"issueTracker": "https://github.com/rubentalstra/trial-submission-studio/issues",
"dateCreated": "2025-12-12",
- "dateModified": "2026-01-24",
+ "dateModified": "2026-01-25",
"datePublished": "2026-01-05",
"programmingLanguage": [
{
"@type": "ComputerLanguage",
"name": "Rust",
- "version": "0.0.4-alpha",
+ "version": "0.0.5-alpha",
"url": "https://www.rust-lang.org/"
}
],
diff --git a/crates/tss-gui/src/app/mod.rs b/crates/tss-gui/src/app/mod.rs
index 4dfa33f8..92950f70 100644
--- a/crates/tss-gui/src/app/mod.rs
+++ b/crates/tss-gui/src/app/mod.rs
@@ -16,12 +16,12 @@
// Submodules - handlers are organized by category in handler/
mod handler;
+mod subscription;
pub mod util;
// Re-export utility functions for internal use
use util::load_app_icon;
-use iced::keyboard;
use iced::widget::container;
use iced::window;
use iced::{Element, Size, Subscription, Task, Theme};
@@ -680,58 +680,11 @@ impl App {
}
/// Subscribe to runtime events.
+ ///
+ /// Delegates to `subscription::create_subscription` which manages all
+ /// application subscriptions in a centralized module.
pub fn subscription(&self) -> Subscription {
- use iced::{system, time};
- use std::time::Duration;
-
- // Keyboard events
- let keyboard_sub = keyboard::listen().map(|event| match event {
- keyboard::Event::KeyPressed { key, modifiers, .. } => {
- Message::KeyPressed(key, modifiers)
- }
- _ => Message::Noop,
- });
-
- // System theme changes (for ThemeMode::System)
- let system_theme_sub = system::theme_changes().map(Message::SystemThemeChanged);
-
- // Native menu event polling (macOS only, polls every 50ms for responsiveness)
- #[cfg(target_os = "macos")]
- let menu_sub = crate::menu::menu_subscription().map(|action| match action {
- Some(a) => Message::MenuAction(a),
- None => Message::Noop,
- });
-
- #[cfg(not(target_os = "macos"))]
- let menu_sub = Subscription::none();
-
- // Window close events (for cleaning up dialog windows)
- let window_sub = window::close_requests().map(Message::DialogWindowClosed);
-
- // Toast auto-dismiss timer (5 seconds)
- let toast_sub = if self.state.toast.is_some() {
- time::every(Duration::from_secs(5))
- .map(|_| Message::Toast(crate::message::ToastMessage::Dismiss))
- } else {
- Subscription::none()
- };
-
- // Auto-save timer (polls every 500ms to check if auto-save should trigger)
- // The actual save only happens if the dirty tracker indicates it should
- let auto_save_sub = if self.state.auto_save_config.enabled && self.state.study.is_some() {
- time::every(Duration::from_millis(500)).map(|_| Message::AutoSaveTick)
- } else {
- Subscription::none()
- };
-
- Subscription::batch([
- keyboard_sub,
- system_theme_sub,
- menu_sub,
- window_sub,
- toast_sub,
- auto_save_sub,
- ])
+ subscription::create_subscription(&self.state)
}
}
diff --git a/crates/tss-gui/src/app/subscription.rs b/crates/tss-gui/src/app/subscription.rs
new file mode 100644
index 00000000..8c701ef4
--- /dev/null
+++ b/crates/tss-gui/src/app/subscription.rs
@@ -0,0 +1,147 @@
+//! Application subscriptions.
+//!
+//! This module centralizes all Iced subscriptions for the application.
+//! Subscriptions are reactive event sources that run alongside the app.
+//!
+//! # Subscription Overview
+//!
+//! | Subscription | Interval | Condition | Purpose |
+//! |--------------|----------|-----------|---------|
+//! | Keyboard | Continuous | Always | Global keyboard shortcuts |
+//! | System Theme | Continuous | Always | Track OS theme changes |
+//! | Menu (macOS) | 100ms poll | Always | Native menu bar events |
+//! | Window Close | Continuous | Always | Dialog window cleanup |
+//! | Toast Dismiss | 5 seconds | Toast visible | Auto-dismiss notifications |
+//! | Auto-Save | 500ms poll | Study loaded + enabled | Debounced project saves |
+//!
+//! # Architecture
+//!
+//! Subscriptions are batched together in `create_subscription()` and run
+//! concurrently. Conditional subscriptions (toast, auto-save) return
+//! `Subscription::none()` when their condition is not met, avoiding
+//! unnecessary polling.
+
+use std::time::Duration;
+
+use iced::Subscription;
+use iced::keyboard;
+use iced::window;
+use iced::{system, time};
+
+use crate::message::Message;
+use crate::state::AppState;
+
+/// Create all application subscriptions.
+///
+/// This batches together all event subscriptions:
+/// - Keyboard events for shortcuts
+/// - System theme changes for automatic theme switching
+/// - Native menu events (macOS only)
+/// - Window close events for dialog cleanup
+/// - Toast auto-dismiss timer (conditional)
+/// - Auto-save timer (conditional)
+pub fn create_subscription(state: &AppState) -> Subscription {
+ Subscription::batch([
+ keyboard_subscription(),
+ system_theme_subscription(),
+ menu_subscription(),
+ window_close_subscription(),
+ toast_subscription(state),
+ auto_save_subscription(state),
+ ])
+}
+
+/// Keyboard event subscription.
+///
+/// Listens for all key press events to handle global shortcuts.
+/// Runs continuously without polling.
+fn keyboard_subscription() -> Subscription {
+ keyboard::listen().map(|event| match event {
+ keyboard::Event::KeyPressed { key, modifiers, .. } => Message::KeyPressed(key, modifiers),
+ _ => Message::Noop,
+ })
+}
+
+/// System theme change subscription.
+///
+/// Monitors OS theme changes (light/dark) for ThemeMode::System.
+/// Runs continuously without polling.
+fn system_theme_subscription() -> Subscription {
+ system::theme_changes().map(Message::SystemThemeChanged)
+}
+
+/// Native menu event subscription (macOS only).
+///
+/// Polls the menu event channel every 100ms. This interval balances
+/// responsiveness with efficiency:
+/// - Menu clicks are human-initiated (~200-300ms reaction time)
+/// - A background forwarder thread captures events immediately
+/// - We're just polling a local mpsc channel, not the native event system
+///
+/// On non-macOS platforms, returns an empty subscription.
+fn menu_subscription() -> Subscription {
+ #[cfg(target_os = "macos")]
+ {
+ crate::menu::menu_subscription().map(|action| match action {
+ Some(a) => Message::MenuAction(a),
+ None => Message::Noop,
+ })
+ }
+
+ #[cfg(not(target_os = "macos"))]
+ {
+ Subscription::none()
+ }
+}
+
+/// Window close event subscription.
+///
+/// Listens for window close requests to clean up dialog window state.
+/// Runs continuously without polling.
+fn window_close_subscription() -> Subscription {
+ window::close_requests().map(Message::DialogWindowClosed)
+}
+
+/// Toast auto-dismiss subscription.
+///
+/// When a toast notification is visible, polls every 5 seconds to
+/// trigger auto-dismissal. Returns no subscription when no toast exists.
+///
+/// # Conditional Behavior
+/// - Active: When `state.toast.is_some()`
+/// - Inactive: When no toast is displayed
+fn toast_subscription(state: &AppState) -> Subscription {
+ if state.toast.is_some() {
+ time::every(Duration::from_secs(5))
+ .map(|_| Message::Toast(crate::message::ToastMessage::Dismiss))
+ } else {
+ Subscription::none()
+ }
+}
+
+/// Auto-save subscription.
+///
+/// Polls every 500ms to check if an auto-save should trigger. The actual
+/// save only occurs if the dirty tracker indicates changes need saving.
+///
+/// # Conditional Behavior
+/// - Active: When auto-save is enabled AND a study is loaded
+/// - Inactive: When auto-save is disabled OR no study is loaded
+///
+/// The 500ms interval provides a balance between:
+/// - Responsiveness (saves happen within 500ms of idle threshold)
+/// - Efficiency (only 2 checks per second)
+fn auto_save_subscription(state: &AppState) -> Subscription {
+ if state.auto_save_config.enabled && state.study.is_some() {
+ time::every(Duration::from_millis(500)).map(|_| Message::AutoSaveTick)
+ } else {
+ Subscription::none()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ // Note: Subscription testing requires an Iced runtime, which is not
+ // available in unit tests. Integration tests should verify subscription
+ // behavior through the full application.
+}
diff --git a/crates/tss-gui/src/app/util.rs b/crates/tss-gui/src/app/util.rs
index 8bda71ac..392e53f1 100644
--- a/crates/tss-gui/src/app/util.rs
+++ b/crates/tss-gui/src/app/util.rs
@@ -54,7 +54,7 @@ pub async fn create_study_from_assignments(
let domains = tss_standards::load_sdtm_ig()
.map_err(|e| format!("Failed to load SDTM-IG: {}", e))?;
let ct_version = tss_standards::ct::CtVersion::default();
- let ct = tss_standards::ct::load(ct_version).map_err(|e| {
+ let ct = tss_standards::ct::load(ct_version, Some("SDTM")).map_err(|e| {
format!(
"Failed to load Controlled Terminology ({}): {}",
ct_version, e
diff --git a/crates/tss-gui/src/handler/domain_editor.rs b/crates/tss-gui/src/handler/domain_editor.rs
index aef9ab85..01449e7f 100644
--- a/crates/tss-gui/src/handler/domain_editor.rs
+++ b/crates/tss-gui/src/handler/domain_editor.rs
@@ -60,9 +60,8 @@ impl MessageHandler for DomainEditorHandler {
/// Trigger automatic preview rebuild after mapping changes.
/// Returns a Task that computes the preview in the background.
fn trigger_preview_rebuild(state: &AppState, domain_code: &str) -> Task {
- let domain = match state.study.as_ref().and_then(|s| s.domain(domain_code)) {
- Some(d) => d,
- None => return Task::none(),
+ let Some(domain) = state.study.as_ref().and_then(|s| s.domain(domain_code)) else {
+ return Task::none();
};
let input = PreviewInput {
@@ -87,10 +86,10 @@ fn trigger_preview_rebuild(state: &AppState, domain_code: &str) -> Task
fn handle_mapping_message(state: &mut AppState, msg: MappingMessage) -> Task {
// Get current domain code
- let domain_code = match &state.view {
- ViewState::DomainEditor(editor) => editor.domain.clone(),
- _ => return Task::none(),
+ let ViewState::DomainEditor(editor) = &state.view else {
+ return Task::none();
};
+ let domain_code = editor.domain.clone();
match msg {
MappingMessage::VariableSelected(idx) => {
@@ -338,16 +337,15 @@ fn handle_normalization_message(state: &mut AppState, msg: NormalizationMessage)
// =============================================================================
fn handle_validation_message(state: &mut AppState, msg: ValidationMessage) -> Task {
- let domain_code = match &state.view {
- ViewState::DomainEditor(editor) => editor.domain.clone(),
- _ => return Task::none(),
+ let ViewState::DomainEditor(editor) = &state.view else {
+ return Task::none();
};
+ let domain_code = editor.domain.clone();
match msg {
ValidationMessage::RefreshValidation => {
- let domain = match state.study.as_ref().and_then(|s| s.domain(&domain_code)) {
- Some(d) => d,
- None => return Task::none(),
+ let Some(domain) = state.study.as_ref().and_then(|s| s.domain(&domain_code)) else {
+ return Task::none();
};
let df = match &state.view {
@@ -434,10 +432,10 @@ fn handle_validation_message(state: &mut AppState, msg: ValidationMessage) -> Ta
#[allow(clippy::needless_pass_by_value)]
fn handle_preview_message(state: &mut AppState, msg: PreviewMessage) -> Task {
- let domain_code = match &state.view {
- ViewState::DomainEditor(editor) => editor.domain.clone(),
- _ => return Task::none(),
+ let ViewState::DomainEditor(editor) = &state.view else {
+ return Task::none();
};
+ let domain_code = editor.domain.clone();
match msg {
PreviewMessage::GoToPage(page) => {
@@ -473,9 +471,8 @@ fn handle_preview_message(state: &mut AppState, msg: PreviewMessage) -> Task {
- let domain = match state.study.as_ref().and_then(|s| s.domain(&domain_code)) {
- Some(d) => d,
- None => return Task::none(),
+ let Some(domain) = state.study.as_ref().and_then(|s| s.domain(&domain_code)) else {
+ return Task::none();
};
// Mark as rebuilding
@@ -506,10 +503,10 @@ fn handle_preview_message(state: &mut AppState, msg: PreviewMessage) -> Task Task {
- let domain_code = match &state.view {
- ViewState::DomainEditor(editor) => editor.domain.clone(),
- _ => return Task::none(),
+ let ViewState::DomainEditor(editor) = &state.view else {
+ return Task::none();
};
+ let domain_code = editor.domain.clone();
match msg {
SuppMessage::ColumnSelected(col_name) => {
diff --git a/crates/tss-gui/src/handler/export.rs b/crates/tss-gui/src/handler/export.rs
index 1bd0e88b..616bab40 100644
--- a/crates/tss-gui/src/handler/export.rs
+++ b/crates/tss-gui/src/handler/export.rs
@@ -170,9 +170,8 @@ fn start_export(state: &mut AppState) -> Task {
}
// Get study data
- let study = match &state.study {
- Some(s) => s,
- None => return Task::none(),
+ let Some(study) = &state.study else {
+ return Task::none();
};
let study_id = study.study_id.clone();
diff --git a/crates/tss-gui/src/service/study.rs b/crates/tss-gui/src/service/study.rs
index 92d3dee8..7fa6d2b5 100644
--- a/crates/tss-gui/src/service/study.rs
+++ b/crates/tss-gui/src/service/study.rs
@@ -52,7 +52,7 @@ pub async fn create_study_from_assignments(
GuiError::operation("Load standards", format!("Failed to load SDTM-IG: {}", e))
})?;
let ct_version = tss_standards::ct::CtVersion::default();
- let ct = tss_standards::ct::load(ct_version).map_err(|e| {
+ let ct = tss_standards::ct::load(ct_version, Some("SDTM")).map_err(|e| {
GuiError::operation(
"Load standards",
format!(
diff --git a/crates/tss-gui/src/view/domain_editor/validation.rs b/crates/tss-gui/src/view/domain_editor/validation.rs
index 49e1f8f5..29b14eb8 100644
--- a/crates/tss-gui/src/view/domain_editor/validation.rs
+++ b/crates/tss-gui/src/view/domain_editor/validation.rs
@@ -67,7 +67,7 @@ pub fn view_validation_tab<'a>(state: &'a AppState, domain_code: &'a str) -> Ele
matches!(issue.severity(), Severity::Error | Severity::Reject)
}
SeverityFilter::Warnings => matches!(issue.severity(), Severity::Warning),
- SeverityFilter::Info => false, // No info level in current model
+ SeverityFilter::Info => matches!(issue.severity(), Severity::Info),
})
.collect();
@@ -257,6 +257,7 @@ fn view_issue_row<'a>(issue: &'a Issue, idx: usize, is_selected: bool) -> Elemen
lucide::circle_x().size(14).color(severity_color).into()
}
Severity::Warning => lucide::circle_alert().size(14).color(severity_color).into(),
+ Severity::Info => lucide::info().size(14).color(severity_color).into(),
};
// Short description (truncated)
@@ -280,6 +281,7 @@ fn get_severity_color(severity: Severity) -> iced::Color {
match severity {
Severity::Reject | Severity::Error => iced::Color::from_rgb(0.90, 0.30, 0.25),
Severity::Warning => iced::Color::from_rgb(0.95, 0.65, 0.15),
+ Severity::Info => iced::Color::from_rgb(0.30, 0.60, 0.85), // Blue for informational
}
}
@@ -301,6 +303,7 @@ fn view_issue_detail<'a>(issue: &Issue) -> Element<'a, Message> {
lucide::circle_x().size(12).color(severity_color).into()
}
Severity::Warning => lucide::circle_alert().size(12).color(severity_color).into(),
+ Severity::Info => lucide::info().size(12).color(severity_color).into(),
};
// Header with variable name and severity badge
@@ -390,14 +393,14 @@ fn view_issue_metadata<'a>(
codelist_code,
codelist_name,
extensible,
- invalid_count,
+ total_invalid,
invalid_values,
allowed_count,
..
} => {
metadata = metadata.row("Codelist", format!("{} ({})", codelist_name, codelist_code));
metadata = metadata.row("Extensible", if *extensible { "Yes" } else { "No" });
- metadata = metadata.row("Invalid Values", invalid_count.to_string());
+ metadata = metadata.row("Invalid Values", total_invalid.to_string());
metadata = metadata.row("Allowed Terms", allowed_count.to_string());
if !invalid_values.is_empty() {
let display_values: Vec<&str> =
@@ -579,7 +582,13 @@ fn issue_category(issue: &Issue) -> &'static str {
Issue::InvalidDate { .. } | Issue::TextTooLong { .. } => "Format",
Issue::DataTypeMismatch { .. } => "Type",
Issue::DuplicateSequence { .. } => "Consistency",
- Issue::UsubjidNotInDm { .. } | Issue::ParentNotFound { .. } => "Cross Reference",
+ Issue::UsubjidNotInDm { .. }
+ | Issue::ParentNotFound { .. }
+ | Issue::InvalidRdomain { .. }
+ | Issue::RelsubNotInDm { .. }
+ | Issue::RelsubNotBidirectional { .. }
+ | Issue::RelspecInvalidParent { .. }
+ | Issue::RelrecInvalidReference { .. } => "Cross Reference",
Issue::CtViolation { .. } => "Terminology",
}
}
diff --git a/crates/tss-ingest/Cargo.toml b/crates/tss-ingest/Cargo.toml
index 5859b75c..e46f2912 100644
--- a/crates/tss-ingest/Cargo.toml
+++ b/crates/tss-ingest/Cargo.toml
@@ -11,6 +11,7 @@ workspace = true
[dependencies]
# Workspace dependencies (alphabetical)
+encoding_rs.workspace = true
polars.workspace = true
serde = { workspace = true, features = ["derive"] }
thiserror.workspace = true
diff --git a/crates/tss-ingest/src/csv/mod.rs b/crates/tss-ingest/src/csv/mod.rs
index 7032c291..f20f5542 100644
--- a/crates/tss-ingest/src/csv/mod.rs
+++ b/crates/tss-ingest/src/csv/mod.rs
@@ -5,6 +5,7 @@ mod reader;
pub use header::CsvHeaders;
pub use reader::{
- MAX_CSV_FILE_SIZE, check_file_size, check_file_size_with_limit, read_csv_schema,
- read_csv_table, validate_dataframe_shape, validate_encoding,
+ EncodingResult, MAX_CSV_FILE_SIZE, check_file_size, check_file_size_with_limit,
+ check_path_length, detect_and_transcode, read_csv_schema, read_csv_table,
+ validate_dataframe_shape,
};
diff --git a/crates/tss-ingest/src/csv/reader.rs b/crates/tss-ingest/src/csv/reader.rs
index c36ffbbd..a15253b2 100644
--- a/crates/tss-ingest/src/csv/reader.rs
+++ b/crates/tss-ingest/src/csv/reader.rs
@@ -1,9 +1,10 @@
//! CSV file reading with explicit header row configuration.
use std::fs::File;
-use std::io::{BufRead, BufReader, Read};
+use std::io::{BufRead, BufReader, Cursor, Read};
use std::path::Path;
+use encoding_rs::{UTF_16BE, UTF_16LE};
use polars::prelude::*;
use crate::error::{IngestError, Result};
@@ -13,6 +14,38 @@ use super::header::{CsvHeaders, parse_csv_line};
/// Maximum file size for CSV loading (500 MB default).
pub const MAX_CSV_FILE_SIZE: u64 = 500 * 1024 * 1024;
+/// Maximum path length on Windows (MAX_PATH).
+/// TODO: Consider supporting extended-length paths (\\?\) prefix in the future
+/// to allow paths up to 32,767 characters on Windows.
+#[cfg(windows)]
+const MAX_PATH_LENGTH: usize = 260;
+
+/// Check path length on Windows.
+///
+/// Windows has a MAX_PATH limit of 260 characters. Paths exceeding this
+/// may cause silent failures or cryptic errors.
+#[cfg(windows)]
+pub fn check_path_length(path: &Path) -> Result<()> {
+ let path_str = path.to_string_lossy();
+ let length = path_str.len();
+
+ if length > MAX_PATH_LENGTH {
+ return Err(IngestError::PathTooLong {
+ path: path.to_path_buf(),
+ length,
+ max_length: MAX_PATH_LENGTH,
+ });
+ }
+
+ Ok(())
+}
+
+/// No-op path length check on non-Windows platforms.
+#[cfg(not(windows))]
+pub fn check_path_length(_path: &Path) -> Result<()> {
+ Ok(())
+}
+
/// Check file size before loading.
///
/// This is a sync function meant to be called via `spawn_blocking` from async contexts.
@@ -46,10 +79,22 @@ pub fn check_file_size_with_limit(path: &Path, max_size: u64) -> Result<()> {
Ok(())
}
-/// Detect encoding and validate it's supported (UTF-8 only).
+/// Encoding detection result.
+#[derive(Debug)]
+pub enum EncodingResult {
+ /// File is UTF-8 (with or without BOM), read directly.
+ Utf8,
+ /// File was transcoded from UTF-16, use this content.
+ Transcoded(String),
+}
+
+/// Detect encoding and transcode if necessary.
+///
+/// - UTF-8 (with or without BOM): Returns `Utf8`, file can be read directly
+/// - UTF-16 LE/BE: Transcodes to UTF-8 and returns `Transcoded(content)`
///
-/// Checks for UTF-16 BOM markers which are not supported.
-pub fn validate_encoding(path: &Path) -> Result<()> {
+/// This handles Windows/Excel exports that often use UTF-16.
+pub fn detect_and_transcode(path: &Path) -> Result {
let mut file = File::open(path).map_err(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
IngestError::FileNotFound {
@@ -63,32 +108,62 @@ pub fn validate_encoding(path: &Path) -> Result<()> {
}
})?;
- let mut buffer = [0u8; 4];
- let bytes_read = file.read(&mut buffer).map_err(|e| IngestError::FileRead {
- path: path.to_path_buf(),
- source: e,
- })?;
+ // Read first 4 bytes to detect BOM
+ let mut bom_buffer = [0u8; 4];
+ let bytes_read = file
+ .read(&mut bom_buffer)
+ .map_err(|e| IngestError::FileRead {
+ path: path.to_path_buf(),
+ source: e,
+ })?;
- // Check for UTF-16 BOM (not supported)
if bytes_read >= 2 {
- // UTF-16 LE BOM
- if buffer[0..2] == [0xFF, 0xFE] {
- return Err(IngestError::UnsupportedEncoding {
- path: path.to_path_buf(),
- encoding: "UTF-16 LE",
- });
+ // UTF-16 LE BOM: FF FE
+ if bom_buffer[0..2] == [0xFF, 0xFE] {
+ return transcode_utf16(path, UTF_16LE, 2);
}
- // UTF-16 BE BOM
- if buffer[0..2] == [0xFE, 0xFF] {
- return Err(IngestError::UnsupportedEncoding {
- path: path.to_path_buf(),
- encoding: "UTF-16 BE",
- });
+ // UTF-16 BE BOM: FE FF
+ if bom_buffer[0..2] == [0xFE, 0xFF] {
+ return transcode_utf16(path, UTF_16BE, 2);
}
}
- // UTF-8 BOM is acceptable (handled in read_first_lines)
- Ok(())
+ // UTF-8 (with or without BOM)
+ Ok(EncodingResult::Utf8)
+}
+
+/// Transcode a UTF-16 file to UTF-8.
+fn transcode_utf16(
+ path: &Path,
+ encoding: &'static encoding_rs::Encoding,
+ bom_size: usize,
+) -> Result {
+ // Read entire file
+ let bytes = std::fs::read(path).map_err(|e| IngestError::FileRead {
+ path: path.to_path_buf(),
+ source: e,
+ })?;
+
+ // Skip BOM and decode
+ let (decoded, _, had_errors) = encoding.decode(&bytes[bom_size..]);
+
+ if had_errors {
+ tracing::warn!(
+ path = %path.display(),
+ encoding = %encoding.name(),
+ "UTF-16 transcoding had replacement characters"
+ );
+ }
+
+ tracing::info!(
+ path = %path.display(),
+ encoding = %encoding.name(),
+ original_size = bytes.len(),
+ transcoded_size = decoded.len(),
+ "Transcoded UTF-16 file to UTF-8"
+ );
+
+ Ok(EncodingResult::Transcoded(decoded.into_owned()))
}
/// Validate DataFrame shape after loading.
@@ -193,27 +268,57 @@ pub fn read_csv_schema(path: &Path, header_rows: usize) -> Result {
/// - `header_rows = 1`: Single header row
/// - `header_rows = 2`: Double header (labels + column names)
///
+/// Automatically handles:
+/// - UTF-8 files (with or without BOM)
+/// - UTF-16 LE/BE files (transcoded to UTF-8)
+/// - Windows path length validation
+///
/// Returns both the DataFrame and the header information.
pub fn read_csv_table(path: &Path, header_rows: usize) -> Result<(DataFrame, CsvHeaders)> {
+ // Check path length on Windows
+ check_path_length(path)?;
+
+ // Detect encoding and transcode if needed
+ let encoding_result = detect_and_transcode(path)?;
+
let headers = read_csv_schema(path, header_rows)?;
// Skip additional rows beyond the first header row
let skip_rows = header_rows.saturating_sub(1);
- let df = CsvReadOptions::default()
- .with_has_header(true)
- .with_skip_rows(skip_rows)
- .with_infer_schema_length(Some(100))
- .try_into_reader_with_file_path(Some(path.to_path_buf()))
- .map_err(|e| IngestError::CsvParse {
- path: path.to_path_buf(),
- message: e.to_string(),
- })?
- .finish()
- .map_err(|e| IngestError::CsvParse {
- path: path.to_path_buf(),
- message: e.to_string(),
- })?;
+ let df = match encoding_result {
+ EncodingResult::Utf8 => {
+ // Read directly from file
+ CsvReadOptions::default()
+ .with_has_header(true)
+ .with_skip_rows(skip_rows)
+ .with_infer_schema_length(None)
+ .try_into_reader_with_file_path(Some(path.to_path_buf()))
+ .map_err(|e| IngestError::CsvParse {
+ path: path.to_path_buf(),
+ message: e.to_string(),
+ })?
+ .finish()
+ .map_err(|e| IngestError::CsvParse {
+ path: path.to_path_buf(),
+ message: e.to_string(),
+ })?
+ }
+ EncodingResult::Transcoded(content) => {
+ // Read from transcoded content
+ let cursor = Cursor::new(content.as_bytes());
+ CsvReadOptions::default()
+ .with_has_header(true)
+ .with_skip_rows(skip_rows)
+ .with_infer_schema_length(None)
+ .into_reader_with_file_handle(cursor)
+ .finish()
+ .map_err(|e| IngestError::CsvParse {
+ path: path.to_path_buf(),
+ message: e.to_string(),
+ })?
+ }
+ };
Ok((df, headers))
}
@@ -295,4 +400,58 @@ mod tests {
assert_eq!(df.height(), 2);
assert_eq!(df.width(), 3);
}
+
+ #[test]
+ fn test_detect_and_transcode_utf8() {
+ let file = create_temp_csv("A,B,C\n1,2,3\n");
+ let result = detect_and_transcode(file.path()).unwrap();
+
+ assert!(matches!(result, EncodingResult::Utf8));
+ }
+
+ #[test]
+ fn test_detect_and_transcode_utf16le() {
+ // Create UTF-16 LE file with BOM
+ let content = "A,B,C\n1,2,3\n";
+ let mut bytes = vec![0xFF, 0xFE]; // UTF-16 LE BOM
+ for c in content.encode_utf16() {
+ bytes.extend_from_slice(&c.to_le_bytes());
+ }
+
+ let mut file = NamedTempFile::new().unwrap();
+ file.write_all(&bytes).unwrap();
+
+ let result = detect_and_transcode(file.path()).unwrap();
+
+ match result {
+ EncodingResult::Transcoded(s) => {
+ assert!(s.contains("A,B,C"));
+ assert!(s.contains("1,2,3"));
+ }
+ EncodingResult::Utf8 => panic!("Expected Transcoded, got Utf8"),
+ }
+ }
+
+ #[test]
+ fn test_detect_and_transcode_utf16be() {
+ // Create UTF-16 BE file with BOM
+ let content = "A,B,C\n1,2,3\n";
+ let mut bytes = vec![0xFE, 0xFF]; // UTF-16 BE BOM
+ for c in content.encode_utf16() {
+ bytes.extend_from_slice(&c.to_be_bytes());
+ }
+
+ let mut file = NamedTempFile::new().unwrap();
+ file.write_all(&bytes).unwrap();
+
+ let result = detect_and_transcode(file.path()).unwrap();
+
+ match result {
+ EncodingResult::Transcoded(s) => {
+ assert!(s.contains("A,B,C"));
+ assert!(s.contains("1,2,3"));
+ }
+ EncodingResult::Utf8 => panic!("Expected Transcoded, got Utf8"),
+ }
+ }
}
diff --git a/crates/tss-ingest/src/error.rs b/crates/tss-ingest/src/error.rs
index f3ed9e1e..4eea23c0 100644
--- a/crates/tss-ingest/src/error.rs
+++ b/crates/tss-ingest/src/error.rs
@@ -61,13 +61,24 @@ pub enum IngestError {
#[error("empty column name found in {path}")]
EmptyColumnName { path: PathBuf },
- /// Unsupported file encoding.
- #[error("unsupported encoding '{encoding}' in {path}: only UTF-8 is supported")]
+ /// Unsupported file encoding (cannot be transcoded).
+ #[error("unsupported encoding '{encoding}' in {path}: only UTF-8 and UTF-16 are supported")]
UnsupportedEncoding {
path: PathBuf,
encoding: &'static str,
},
+ /// Path exceeds maximum length for the platform.
+ ///
+ /// On Windows, paths longer than 260 characters may fail.
+ /// TODO: Consider supporting extended-length paths (\\?\) prefix in the future.
+ #[error("path too long ({length} chars, max {max_length}): {path}")]
+ PathTooLong {
+ path: PathBuf,
+ length: usize,
+ max_length: usize,
+ },
+
// === Metadata Errors ===
/// Required column not found in metadata file.
#[error("required column '{column}' not found in {path}")]
diff --git a/crates/tss-ingest/src/lib.rs b/crates/tss-ingest/src/lib.rs
index 1b1114ea..cd1b72cb 100644
--- a/crates/tss-ingest/src/lib.rs
+++ b/crates/tss-ingest/src/lib.rs
@@ -40,8 +40,9 @@ pub use error::{IngestError, Result};
// === CSV Reading ===
pub use csv::{
- CsvHeaders, MAX_CSV_FILE_SIZE, check_file_size, check_file_size_with_limit, read_csv_schema,
- read_csv_table, validate_dataframe_shape, validate_encoding,
+ CsvHeaders, EncodingResult, MAX_CSV_FILE_SIZE, check_file_size, check_file_size_with_limit,
+ check_path_length, detect_and_transcode, read_csv_schema, read_csv_table,
+ validate_dataframe_shape,
};
// === File Discovery ===
diff --git a/crates/tss-standards/Cargo.toml b/crates/tss-standards/Cargo.toml
index af32dc42..cd1ac34a 100644
--- a/crates/tss-standards/Cargo.toml
+++ b/crates/tss-standards/Cargo.toml
@@ -15,3 +15,4 @@ csv.workspace = true
polars.workspace = true
serde = { workspace = true, features = ["derive"] }
thiserror.workspace = true
+tracing.workspace = true
diff --git a/crates/tss-standards/data/terminology/README.md b/crates/tss-standards/data/terminology/README.md
new file mode 100644
index 00000000..782f71a1
--- /dev/null
+++ b/crates/tss-standards/data/terminology/README.md
@@ -0,0 +1,143 @@
+# CDISC Controlled Terminology
+
+This folder contains CDISC Controlled Terminology (CT) CSV files embedded at compile time.
+
+## Folder Structure
+
+```
+terminology/
+├── README.md # This file
+├── 2024-03-29/ # CT version 2024-03-29 (default)
+│ ├── SDTM_CT_2024-03-29.csv
+│ ├── ADaM_CT_2024-03-29.csv
+│ ├── SEND_CT_2024-03-29.csv
+│ └── ...
+├── 2025-03-28/ # CT version 2025-03-28
+│ └── ...
+└── 2025-09-26/ # CT version 2025-09-26 (latest)
+ └── ...
+```
+
+## Adding a New CT Version
+
+CDISC publishes CT updates quarterly. Follow these steps to add a new version:
+
+### Step 1: Download CT Files
+
+1. Go to the [CDISC Library Browser](https://library.cdisc.org/browser/#/)
+2. Navigate to Controlled Terminology and select the version you need
+3. Download the CT packages (SDTM, ADaM, SEND, etc.) as CSV files
+
+### Step 2: Create Version Folder
+
+Create a new folder named with the CT release date:
+
+```bash
+mkdir crates/tss-standards/data/terminology/YYYY-MM-DD
+```
+
+### Step 3: Update `embedded.rs`
+
+Add `include_str!()` constants for each new file:
+
+```rust
+// =============================================================================
+// Controlled Terminology - YYYY-MM-DD
+// =============================================================================
+
+/// SDTM CT YYYY-MM-DD
+pub const CT_YYYY_MM_DD_SDTM: &str =
+ include_str!("../data/terminology/YYYY-MM-DD/SDTM_CT_YYYY-MM-DD.csv");
+
+// ... repeat for each CT file
+```
+
+### Step 4: Update `ct/loader.rs`
+
+1. Add new variant to `CtVersion` enum:
+
+```rust
+pub enum CtVersion {
+ V2024_03_29,
+ V2025_03_28,
+ V2025_09_26,
+ V_YYYY_MM_DD, // Add new variant
+}
+```
+
+2. Update `dir_name()`:
+
+```rust
+pub const fn dir_name(&self) -> &'static str {
+ match self {
+ Self::V2024_03_29 => "2024-03-29",
+ Self::V2025_03_28 => "2025-03-28",
+ Self::V2025_09_26 => "2025-09-26",
+ Self::V_YYYY_MM_DD => "YYYY-MM-DD", // Add
+ }
+}
+```
+
+3. Update `all()` to include new variant
+
+4. Update `latest()` if this is the newest version
+
+### Step 5: Update `ct_files_for_version()` in `embedded.rs`
+
+Add match arm for the new version:
+
+```rust
+CtVersion::V_YYYY_MM_DD => vec![
+ ("SDTM_CT_YYYY-MM-DD.csv", CT_YYYY_MM_DD_SDTM),
+ ("ADaM_CT_YYYY-MM-DD.csv", CT_YYYY_MM_DD_ADAM),
+ // ... all files for this version
+],
+```
+
+### Step 6: Update `sdtm_ct_for_version()` in `embedded.rs`
+
+Add match arm for SDTM-only loading.
+
+### Step 7: Verify
+
+```bash
+cargo test --package tss-standards ct
+cargo clippy --package tss-standards
+```
+
+## CT File Format
+
+CDISC CT CSV files have these columns:
+
+| Column | Description |
+|--------------------------------|------------------------------------------------|
+| `Code` | NCI concept code |
+| `Codelist Code` | Parent codelist code (blank for codelist rows) |
+| `Codelist Extensible (Yes/No)` | Whether custom values are allowed |
+| `Codelist Name` | Human-readable name |
+| `CDISC Submission Value` | The valid value for submissions |
+| `CDISC Synonym(s)` | Alternative names (semicolon-separated) |
+| `CDISC Definition` | Term definition |
+| `NCI Preferred Term` | NCI standard term |
+
+## Publishing Sets
+
+Each CT version contains multiple publishing sets:
+
+| Set | Description |
+|------------|---------------------------------------------------|
+| SDTM | Study Data Tabulation Model |
+| ADaM | Analysis Data Model |
+| SEND | Standard for Exchange of Nonclinical Data |
+| Define-XML | Define-XML metadata |
+| Protocol | Protocol terminology |
+| CDASH | Clinical Data Acquisition Standards Harmonization |
+| DDF | Digital Data Flow |
+| MRCT | Multi-Regional Clinical Trials |
+| Glossary | CDISC Glossary |
+
+Not all sets are available in every CT release.
+
+## Source
+
+Official CT packages:
diff --git a/crates/tss-standards/src/ct/loader.rs b/crates/tss-standards/src/ct/loader.rs
index 86c259ae..71db5a72 100644
--- a/crates/tss-standards/src/ct/loader.rs
+++ b/crates/tss-standards/src/ct/loader.rs
@@ -66,20 +66,36 @@ impl std::fmt::Display for CtVersion {
/// # Arguments
///
/// * `version` - The CT version to load
+/// * `primary_set` - The publishing set to mark as primary (e.g., "SDTM", "SEND").
+/// Pass `None` for no primary designation.
///
/// # Example
///
/// ```rust,ignore
/// use tss_standards::ct::{self, CtVersion};
///
-/// let registry = ct::load(CtVersion::default())?;
-/// let ny = registry.resolve("NY", None);
+/// // Load with SDTM CT as primary (typical for SDTM studies)
+/// let registry = ct::load(CtVersion::default(), Some("SDTM"))?;
+///
+/// // For SEND studies, mark SEND CT as primary
+/// let registry = ct::load(CtVersion::default(), Some("SEND"))?;
/// ```
-pub fn load(version: CtVersion) -> Result {
+/// TODO: i think we should refactor and rewrite the whole logic and every file that ahs to do with dataset and use ENUM's so we can use these ENUM's troughout the whole codebase! i think this idead is way better right? no custom strings or using SOME() //// Foundational: CDISC Foundational Standards are the basis of a complete suite of data standards, enhancing the quality, efficiency and cost effectiveness of clinical research processes from beginning to end. Foundational Standards focus on the core principles for defining data standards and include models, domains and specifications for data representation.
+///
+pub fn load(version: CtVersion, primary_set: Option<&str>) -> Result {
let mut registry = TerminologyRegistry::new();
for (filename, content) in embedded::ct_files_for_version(version) {
- let catalog = load_catalog_from_str(content, filename)?;
+ let mut catalog = load_catalog_from_str(content, filename)?;
+
+ // Mark catalog as primary if it matches the requested publishing set
+ if let Some(primary) = primary_set
+ && let Some(ref pub_set) = catalog.publishing_set
+ && pub_set.eq_ignore_ascii_case(primary)
+ {
+ catalog.set_primary(true);
+ }
+
registry.add_catalog(catalog);
}
@@ -129,6 +145,7 @@ pub fn load_catalog_from_str(content: &str, filename: &str) -> Result Result 0 {
+ tracing::warn!(
+ file = %filename,
+ orphan_count = orphan_term_count,
+ "CT file contains orphan terms referencing non-existent codelists"
+ );
+ }
+
+ // Third pass: check for empty codelists
+ let empty_codelists: Vec<&str> = catalog
+ .codelists
+ .values()
+ .filter(|cl| cl.terms.is_empty())
+ .map(|cl| cl.code.as_str())
+ .collect();
+
+ if !empty_codelists.is_empty() {
+ tracing::warn!(
+ file = %filename,
+ empty_count = empty_codelists.len(),
+ sample_codes = ?empty_codelists.iter().take(5).collect::>(),
+ "CT file contains codelists with no terms"
+ );
+ }
+
Ok(catalog)
}
@@ -259,7 +314,7 @@ mod tests {
#[test]
fn test_load_ct_default() {
- let registry = load(CtVersion::default()).expect("load CT");
+ let registry = load(CtVersion::default(), Some("SDTM")).expect("load CT");
assert!(
!registry.catalogs.is_empty(),
"Registry should not be empty"
@@ -268,7 +323,7 @@ mod tests {
#[test]
fn test_load_ct_latest() {
- let registry = load(CtVersion::latest()).expect("load CT");
+ let registry = load(CtVersion::latest(), Some("SDTM")).expect("load CT");
assert!(
!registry.catalogs.is_empty(),
"Registry should not be empty"
@@ -305,7 +360,7 @@ mod tests {
#[test]
fn test_resolve_codelist() {
- let registry = load(CtVersion::default()).expect("load CT");
+ let registry = load(CtVersion::default(), Some("SDTM")).expect("load CT");
// Test resolving NY codelist by NCI code C66742
let resolved = registry.resolve("C66742", None);
@@ -322,7 +377,7 @@ mod tests {
#[test]
fn test_submission_value_validation() {
- let registry = load(CtVersion::default()).expect("load CT");
+ let registry = load(CtVersion::default(), Some("SDTM")).expect("load CT");
// C66742 is NY (Yes/No) - non-extensible
// Valid submission values: Y, N
@@ -336,4 +391,61 @@ mod tests {
"YES should fail - it's a synonym, not a submission value"
);
}
+
+ #[test]
+ fn test_primary_catalog_marking() {
+ // Load with SDTM as primary
+ let registry = load(CtVersion::default(), Some("SDTM")).expect("load CT");
+
+ // SDTM CT should be marked as primary
+ let sdtm_catalog = registry.catalogs.get("SDTM CT");
+ assert!(sdtm_catalog.is_some(), "Should have SDTM CT catalog");
+ assert!(
+ sdtm_catalog.unwrap().primary,
+ "SDTM CT should be marked as primary"
+ );
+
+ // SEND CT should NOT be primary (if it exists)
+ if let Some(send_catalog) = registry.catalogs.get("SEND CT") {
+ assert!(
+ !send_catalog.primary,
+ "SEND CT should NOT be primary by default"
+ );
+ }
+ }
+
+ #[test]
+ fn test_load_with_send_primary() {
+ // Load with SEND as primary
+ let registry = load(CtVersion::default(), Some("SEND")).expect("load CT");
+
+ // SEND CT should be marked as primary
+ if let Some(send_catalog) = registry.catalogs.get("SEND CT") {
+ assert!(
+ send_catalog.primary,
+ "SEND CT should be marked as primary when requested"
+ );
+ }
+
+ // SDTM CT should NOT be primary
+ if let Some(sdtm_catalog) = registry.catalogs.get("SDTM CT") {
+ assert!(
+ !sdtm_catalog.primary,
+ "SDTM CT should NOT be primary when SEND is requested"
+ );
+ }
+ }
+
+ #[test]
+ fn test_resolved_from_primary() {
+ let registry = load(CtVersion::default(), Some("SDTM")).expect("load CT");
+
+ // Resolve a codelist - should come from SDTM CT (primary)
+ let resolved = registry.resolve("C66742", None);
+ assert!(resolved.is_some());
+ assert!(
+ resolved.unwrap().from_primary(),
+ "Resolution should come from primary catalog"
+ );
+ }
}
diff --git a/crates/tss-standards/src/ct/mod.rs b/crates/tss-standards/src/ct/mod.rs
index 44bad649..56f67904 100644
--- a/crates/tss-standards/src/ct/mod.rs
+++ b/crates/tss-standards/src/ct/mod.rs
@@ -12,8 +12,8 @@
//! ```rust,ignore
//! use tss_standards::ct::{self, CtVersion};
//!
-//! // Load all CT for a version (from embedded data)
-//! let registry = ct::load(CtVersion::default())?;
+//! // Load all CT for a version with SDTM as primary
+//! let registry = ct::load(CtVersion::default(), Some("SDTM"))?;
//!
//! // Validate a value against a codelist
//! if let Some(issue) = registry.validate_submission_value("C66742", "INVALID") {
diff --git a/crates/tss-standards/src/ct/types.rs b/crates/tss-standards/src/ct/types.rs
index 2fc480f2..7cd20049 100644
--- a/crates/tss-standards/src/ct/types.rs
+++ b/crates/tss-standards/src/ct/types.rs
@@ -232,6 +232,10 @@ pub struct TerminologyCatalog {
/// Source file name.
pub source: Option,
+ /// Whether this is the primary catalog for its publishing set.
+ /// Used to track if resolutions came from the expected catalog.
+ pub primary: bool,
+
/// Codelists by NCI code (uppercase).
pub codelists: BTreeMap,
}
@@ -244,10 +248,16 @@ impl TerminologyCatalog {
version,
publishing_set,
source: None,
+ primary: false,
codelists: BTreeMap::new(),
}
}
+ /// Mark this catalog as primary for its publishing set.
+ pub fn set_primary(&mut self, primary: bool) {
+ self.primary = primary;
+ }
+
/// Get a codelist by NCI code.
pub fn get(&self, code: &str) -> Option<&Codelist> {
self.codelists.get(&code.to_uppercase())
@@ -402,6 +412,14 @@ impl<'a> ResolvedCodelist<'a> {
&self.catalog.label
}
+ /// Check if the resolution came from a primary catalog.
+ ///
+ /// If false, the codelist was found in a fallback catalog,
+ /// which may indicate a configuration issue or missing data.
+ pub fn from_primary(&self) -> bool {
+ self.catalog.primary
+ }
+
/// Check if a value is a valid submission value.
pub fn is_valid_submission_value(&self, value: &str) -> bool {
self.codelist.is_valid_submission_value(value)
diff --git a/crates/tss-standards/src/embedded.rs b/crates/tss-standards/src/embedded.rs
index 299f47c4..c9b5c316 100644
--- a/crates/tss-standards/src/embedded.rs
+++ b/crates/tss-standards/src/embedded.rs
@@ -8,7 +8,11 @@
//! - SDTM-IG v3.4: Datasets and Variables
//! - ADaM-IG v1.3: DataStructures and Variables
//! - SEND-IG v3.1.1: Datasets and Variables
-//! - Controlled Terminology: Multiple versions (2024-03-29, 2025-03-28, 2025-09-26)
+//! - Controlled Terminology: Multiple versions
+//!
+//! # Adding New CT Versions
+//!
+//! See `data/terminology/README.md` for step-by-step instructions.
// =============================================================================
// SDTM-IG v3.4
diff --git a/crates/tss-standards/src/registry.rs b/crates/tss-standards/src/registry.rs
index 5721fe53..1013ba16 100644
--- a/crates/tss-standards/src/registry.rs
+++ b/crates/tss-standards/src/registry.rs
@@ -91,7 +91,8 @@ impl StandardsRegistry {
/// Errors include which standard failed to load for easier debugging.
pub fn load(config: &StandardsConfig) -> Result {
// Load controlled terminology first (required for validation)
- let ct = ct::load(config.ct_version)?;
+ // TODO: Primary set should come from config based on study type
+ let ct = ct::load(config.ct_version, Some("SDTM"))?;
// Load SDTM-IG if requested
let sdtm_domains = if config.load_sdtm {
diff --git a/crates/tss-standards/src/sdtm_ig.rs b/crates/tss-standards/src/sdtm_ig.rs
index 2a3f51e3..8df2657f 100644
--- a/crates/tss-standards/src/sdtm_ig.rs
+++ b/crates/tss-standards/src/sdtm_ig.rs
@@ -264,4 +264,42 @@ mod tests {
}
}
}
+
+ #[test]
+ fn test_special_purpose_and_relationship_domains() {
+ let domains = load().expect("load SDTM-IG");
+
+ // Verify CO (Comments) domain loads with correct class
+ let co = domains
+ .iter()
+ .find(|d| d.name == "CO")
+ .expect("CO domain should exist");
+ assert_eq!(co.class, Some(SdtmDatasetClass::SpecialPurpose));
+ assert!(
+ co.variables.len() >= 10,
+ "CO should have at least 10 variables"
+ );
+
+ // Verify RELREC (Related Records) domain
+ let relrec = domains
+ .iter()
+ .find(|d| d.name == "RELREC")
+ .expect("RELREC domain should exist");
+ assert_eq!(relrec.class, Some(SdtmDatasetClass::Relationship));
+ assert!(!relrec.variables.is_empty(), "RELREC should have variables");
+
+ // Verify RELSPEC (Related Specimens) domain
+ let relspec = domains
+ .iter()
+ .find(|d| d.name == "RELSPEC")
+ .expect("RELSPEC domain should exist");
+ assert_eq!(relspec.class, Some(SdtmDatasetClass::Relationship));
+
+ // Verify RELSUB (Related Subjects) domain
+ let relsub = domains
+ .iter()
+ .find(|d| d.name == "RELSUB")
+ .expect("RELSUB domain should exist");
+ assert_eq!(relsub.class, Some(SdtmDatasetClass::Relationship));
+ }
}
diff --git a/crates/tss-submit/src/export/define_xml.rs b/crates/tss-submit/src/export/define_xml.rs
index c9389c84..6ddb83bc 100644
--- a/crates/tss-submit/src/export/define_xml.rs
+++ b/crates/tss-submit/src/export/define_xml.rs
@@ -83,9 +83,11 @@ pub fn write_define_xml(
});
}
let study_id = normalize_study_id(study_id);
- let study_oid = format!("STDY.{study_id}");
+ let sanitized_study_id = sanitize_oid_component(&study_id);
+ let sanitized_ig_version = sanitize_oid_component(&options.ig_version);
+ let study_oid = format!("STDY.{sanitized_study_id}");
let file_oid = format!("{study_oid}.Define-XML_{DEFINE_XML_VERSION}");
- let mdv_oid = format!("MDV.{study_oid}.SDTMIG.{}", options.ig_version);
+ let mdv_oid = format!("MDV.{study_oid}.SDTMIG.{sanitized_ig_version}");
let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true);
let domain_lookup = domain_map_by_code(domains);
@@ -101,7 +103,7 @@ pub fn write_define_xml(
}
entries.sort_by(|a, b| a.0.name.cmp(&b.0.name));
- let ct_registry = load_ct(CtVersion::default())?;
+ let ct_registry = load_ct(CtVersion::default(), Some("SDTM"))?;
let mut item_defs: BTreeMap = BTreeMap::new();
let mut code_lists: BTreeMap = BTreeMap::new();
let mut ct_standards: BTreeMap = BTreeMap::new();
@@ -115,7 +117,11 @@ pub fn write_define_xml(
continue;
}
- let oid = format!("IT.{}.{}", output_dataset_name, variable.name);
+ let oid = format!(
+ "IT.{}.{}",
+ sanitize_oid_component(&output_dataset_name),
+ sanitize_oid_component(&variable.name)
+ );
let length = match variable.data_type {
VariableType::Char => Some(variable_length(variable, &frame.data)?),
VariableType::Num => None,
@@ -214,7 +220,7 @@ pub fn write_define_xml(
let output_dataset_name = frame.dataset_name();
let base_domain_code = frame.base_domain_code();
let mut ig = BytesStart::new("ItemGroupDef");
- let ig_oid = format!("IG.{}", output_dataset_name);
+ let ig_oid = format!("IG.{}", sanitize_oid_component(&output_dataset_name));
let sas_dataset_name: String = output_dataset_name.chars().take(8).collect();
ig.push_attribute(("OID", ig_oid.as_str()));
ig.push_attribute(("Name", output_dataset_name.as_str()));
@@ -245,7 +251,11 @@ pub fn write_define_xml(
let mut key_sequence = 1usize;
for (idx, variable) in ordered_vars.iter().enumerate() {
let mut item_ref = BytesStart::new("ItemRef");
- let item_oid = format!("IT.{}.{}", output_dataset_name, variable.name);
+ let item_oid = format!(
+ "IT.{}.{}",
+ sanitize_oid_component(&output_dataset_name),
+ sanitize_oid_component(&variable.name)
+ );
let order_number = format!("{}", idx + 1);
item_ref.push_attribute(("ItemOID", item_oid.as_str()));
item_ref.push_attribute(("OrderNumber", order_number.as_str()));
@@ -283,8 +293,9 @@ pub fn write_define_xml(
}
let core_designation = item_def.core.as_deref().and_then(|c| c.parse().ok());
+ // Define-XML 2.1 valid OriginTypes: Assigned, Collected, Derived, Not Available, Other, Predecessor, Protocol
let origin_type = if is_expected(core_designation) && !item_def.has_data {
- "Not Collected"
+ "Not Available"
} else if item_def.has_data {
"Collected"
} else {
@@ -370,7 +381,11 @@ fn resolve_codelist(
catalog.and_then(|cat| {
let publishing_set = cat.publishing_set.as_ref()?;
let version = cat.version.as_ref()?;
- let oid = format!("STD.CT.{}.{}", publishing_set, version);
+ let oid = format!(
+ "STD.CT.{}.{}",
+ sanitize_oid_component(publishing_set),
+ sanitize_oid_component(version)
+ );
ct_standards
.entry(oid.clone())
@@ -385,7 +400,11 @@ fn resolve_codelist(
})
});
- let oid = format!("CL.{}.{}", domain.name, variable.name);
+ let oid = format!(
+ "CL.{}.{}",
+ sanitize_oid_component(&domain.name),
+ sanitize_oid_component(&variable.name)
+ );
if !code_lists.contains_key(&oid) {
let mut values = BTreeSet::new();
let mut names = BTreeSet::new();
@@ -422,3 +441,27 @@ fn parse_codelist_codes(raw: &str) -> Vec {
.map(std::string::ToString::to_string)
.collect()
}
+
+/// Sanitize a string for use in OID generation.
+///
+/// Per Define-XML 2.1 specification, OIDs must contain only:
+/// - Letters (A-Z, a-z)
+/// - Digits (0-9)
+/// - Underscores (_)
+/// - Periods (.)
+/// - Hyphens (-)
+///
+/// Invalid characters are replaced with underscores.
+/// Leading/trailing whitespace is trimmed.
+fn sanitize_oid_component(s: &str) -> String {
+ s.trim()
+ .chars()
+ .map(|c| {
+ if c.is_ascii_alphanumeric() || c == '_' || c == '.' || c == '-' {
+ c
+ } else {
+ '_'
+ }
+ })
+ .collect()
+}
diff --git a/crates/tss-submit/src/normalize/error.rs b/crates/tss-submit/src/normalize/error.rs
index 913b6ee9..9b46071d 100644
--- a/crates/tss-submit/src/normalize/error.rs
+++ b/crates/tss-submit/src/normalize/error.rs
@@ -30,6 +30,13 @@ pub enum NormalizationError {
/// Invalid normalization configuration.
#[error("Invalid normalization configuration: {0}")]
InvalidConfig(String),
+
+ /// Unknown constant requested for normalization.
+ #[error("Unknown constant '{name}' - only STUDYID and DOMAIN are supported")]
+ UnknownConstant {
+ /// The unknown constant name.
+ name: String,
+ },
}
/// Result type for normalization operations.
diff --git a/crates/tss-submit/src/normalize/executor.rs b/crates/tss-submit/src/normalize/executor.rs
index a5fac91f..61624568 100644
--- a/crates/tss-submit/src/normalize/executor.rs
+++ b/crates/tss-submit/src/normalize/executor.rs
@@ -104,10 +104,14 @@ fn execute_constant(
context: &NormalizationContext,
row_count: usize,
) -> Result {
- let value = match target_name {
+ let value: &str = match target_name {
"STUDYID" => &context.study_id,
"DOMAIN" => &context.domain_code,
- _ => "",
+ _ => {
+ return Err(NormalizationError::UnknownConstant {
+ name: target_name.to_string(),
+ });
+ }
};
Ok(Series::new(target_name.into(), vec![value; row_count]))
@@ -533,6 +537,26 @@ mod tests {
assert_eq!(result.get(2).unwrap(), AnyValue::String("CDISC01"));
}
+ #[test]
+ fn test_execute_constant_domain() {
+ let context = NormalizationContext::new("CDISC01", "AE");
+ let result = execute_constant("DOMAIN", &context, 2).unwrap();
+
+ assert_eq!(result.len(), 2);
+ assert_eq!(result.get(0).unwrap(), AnyValue::String("AE"));
+ assert_eq!(result.get(1).unwrap(), AnyValue::String("AE"));
+ }
+
+ #[test]
+ fn test_execute_constant_unknown_returns_error() {
+ let context = NormalizationContext::new("CDISC01", "AE");
+ let result = execute_constant("UNKNOWN", &context, 3);
+
+ assert!(result.is_err());
+ let err = result.unwrap_err();
+ assert!(matches!(err, NormalizationError::UnknownConstant { name } if name == "UNKNOWN"));
+ }
+
#[test]
fn test_execute_normalization() {
let domain = create_test_domain();
diff --git a/crates/tss-submit/src/validate/checks/cross_domain.rs b/crates/tss-submit/src/validate/checks/cross_domain.rs
index e53afa07..6503cd6c 100644
--- a/crates/tss-submit/src/validate/checks/cross_domain.rs
+++ b/crates/tss-submit/src/validate/checks/cross_domain.rs
@@ -3,11 +3,14 @@
//! Validates referential integrity across SDTM domains:
//! - All USUBJIDs in non-DM domains must exist in DM
//! - Parent record references (--SPID) must exist in parent domain
+//! - CO/RELREC RDOMAIN references valid domains
+//! - RELSUB RSUBJID exists in DM and relationships are bidirectional
+//! - RELSPEC PARENT references valid REFID within subject
//!
//! These checks ensure data consistency across the submission package.
use polars::prelude::DataFrame;
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
use super::super::column_reader::ColumnReader;
use super::super::issue::Issue;
@@ -78,6 +81,375 @@ pub fn check_usubjid_in_dm(
}
}
+// =============================================================================
+// RDOMAIN VALIDATION (CO, RELREC)
+// =============================================================================
+
+/// Check that RDOMAIN values reference valid domains in the submission.
+///
+/// # Arguments
+/// * `domain_name` - Name of the domain being checked (e.g., "CO", "RELREC")
+/// * `df` - DataFrame to check
+/// * `valid_domains` - Set of valid domain codes in the submission
+///
+/// # Returns
+/// A vector of issues (empty if all RDOMAIN values are valid).
+pub fn check_rdomain_valid(
+ domain_name: &str,
+ df: &DataFrame,
+ valid_domains: &HashSet,
+) -> Vec {
+ let reader = ColumnReader::new(df);
+
+ let Some(values) = reader.values("RDOMAIN") else {
+ return vec![];
+ };
+
+ let mut invalid_values = Vec::new();
+ let mut invalid_count = 0u64;
+
+ for (_, rdomain) in values {
+ let trimmed = rdomain.trim().to_uppercase();
+ // Skip empty values (RDOMAIN can be null for standalone comments)
+ if !trimmed.is_empty() && !valid_domains.contains(&trimmed) {
+ invalid_count += 1;
+ if invalid_values.len() < 5 && !invalid_values.contains(&trimmed) {
+ invalid_values.push(trimmed);
+ }
+ }
+ }
+
+ if invalid_count > 0 {
+ vec![Issue::InvalidRdomain {
+ domain: domain_name.to_string(),
+ invalid_count,
+ samples: invalid_values,
+ }]
+ } else {
+ vec![]
+ }
+}
+
+// =============================================================================
+// RELSUB VALIDATION
+// =============================================================================
+
+/// Check RELSUB domain for valid RSUBJID and bidirectional relationships.
+///
+/// Per SDTM-IG 8.7:
+/// - RSUBJID must be a USUBJID value present in DM
+/// - Relationships must be bidirectional (A→B requires B→A)
+///
+/// # Arguments
+/// * `relsub_df` - RELSUB DataFrame
+/// * `dm_subjects` - Set of valid USUBJIDs from DM
+///
+/// # Returns
+/// A vector of issues.
+pub fn check_relsub(relsub_df: &DataFrame, dm_subjects: &HashSet) -> Vec {
+ let mut issues = Vec::new();
+ let reader = ColumnReader::new(relsub_df);
+
+ // Check RSUBJID exists in DM
+ if let Some(rsubjid_values) = reader.values("RSUBJID") {
+ let mut missing_values = Vec::new();
+ let mut missing_count = 0u64;
+
+ for (_, rsubjid) in rsubjid_values {
+ let trimmed = rsubjid.trim();
+ if !trimmed.is_empty() && !dm_subjects.contains(trimmed) {
+ missing_count += 1;
+ if missing_values.len() < 5 && !missing_values.contains(&trimmed.to_string()) {
+ missing_values.push(trimmed.to_string());
+ }
+ }
+ }
+
+ if missing_count > 0 {
+ issues.push(Issue::RelsubNotInDm {
+ missing_count,
+ samples: missing_values,
+ });
+ }
+ }
+
+ // Check bidirectional relationships
+ // Collect all (USUBJID, RSUBJID) pairs
+ let usubjid_values = reader.values("USUBJID");
+ let rsubjid_values = reader.values("RSUBJID");
+
+ if let (Some(usubjids), Some(rsubjids)) = (usubjid_values, rsubjid_values) {
+ let mut relationships: HashSet<(String, String)> = HashSet::new();
+
+ for ((_, usubjid), (_, rsubjid)) in usubjids.zip(rsubjids) {
+ let u = usubjid.trim().to_string();
+ let r = rsubjid.trim().to_string();
+ if !u.is_empty() && !r.is_empty() {
+ relationships.insert((u, r));
+ }
+ }
+
+ // Check for missing reciprocal relationships
+ let mut missing_reciprocal = Vec::new();
+ let mut missing_count = 0u64;
+
+ for (usubjid, rsubjid) in &relationships {
+ // The reciprocal would be (rsubjid, usubjid)
+ if !relationships.contains(&(rsubjid.clone(), usubjid.clone())) {
+ missing_count += 1;
+ if missing_reciprocal.len() < 5 {
+ missing_reciprocal.push(format!("{}→{}", usubjid, rsubjid));
+ }
+ }
+ }
+
+ if missing_count > 0 {
+ issues.push(Issue::RelsubNotBidirectional {
+ missing_count,
+ samples: missing_reciprocal,
+ });
+ }
+ }
+
+ issues
+}
+
+// =============================================================================
+// RELSPEC VALIDATION
+// =============================================================================
+
+/// Check RELSPEC PARENT references valid REFID within each subject.
+///
+/// Per SDTM-IG 8.8:
+/// - PARENT must reference an existing REFID for the same USUBJID
+/// - LEVEL=1 specimens should have null PARENT
+///
+/// # Arguments
+/// * `relspec_df` - RELSPEC DataFrame
+///
+/// # Returns
+/// A vector of issues.
+pub fn check_relspec(relspec_df: &DataFrame) -> Vec {
+ let reader = ColumnReader::new(relspec_df);
+
+ // Build a map of USUBJID -> Set of REFIDs
+ let mut refids_by_subject: HashMap> = HashMap::new();
+
+ let usubjid_col = reader.values("USUBJID");
+ let refid_col = reader.values("REFID");
+
+ if let (Some(usubjids), Some(refids)) = (usubjid_col, refid_col) {
+ for ((_, usubjid), (_, refid)) in usubjids.zip(refids) {
+ let u = usubjid.trim().to_string();
+ let r = refid.trim().to_string();
+ if !u.is_empty() && !r.is_empty() {
+ refids_by_subject.entry(u).or_default().insert(r);
+ }
+ }
+ }
+
+ // Check PARENT references
+ let usubjid_col = reader.values("USUBJID");
+ let parent_col = reader.values("PARENT");
+
+ let Some(parents) = parent_col else {
+ return vec![];
+ };
+
+ let mut invalid_values = Vec::new();
+ let mut invalid_count = 0u64;
+
+ if let Some(usubjids) = usubjid_col {
+ for ((_, usubjid), (_, parent)) in usubjids.zip(parents) {
+ let u = usubjid.trim();
+ let p = parent.trim();
+
+ // Skip empty PARENT (valid for LEVEL=1 collected specimens)
+ if p.is_empty() {
+ continue;
+ }
+
+ // Check if PARENT exists in this subject's REFIDs
+ let valid_refids = refids_by_subject.get(u);
+ let parent_exists = valid_refids.is_some_and(|refs| refs.contains(p));
+
+ if !parent_exists {
+ invalid_count += 1;
+ if invalid_values.len() < 5 {
+ invalid_values.push(format!("{}:{}", u, p));
+ }
+ }
+ }
+ }
+
+ if invalid_count > 0 {
+ vec![Issue::RelspecInvalidParent {
+ invalid_count,
+ samples: invalid_values,
+ }]
+ } else {
+ vec![]
+ }
+}
+
+// =============================================================================
+// RELREC VALIDATION
+// =============================================================================
+
+/// Context for RELREC validation containing domain data.
+pub struct RelrecContext<'a> {
+ /// Map of domain code -> (DataFrame, key_variable -> Set of key values)
+ pub domains: HashMap>)>,
+}
+
+impl<'a> RelrecContext<'a> {
+ /// Build context from a list of (domain_name, DataFrame) pairs.
+ pub fn new(domain_list: &[(&str, &'a DataFrame)]) -> Self {
+ let mut domains = HashMap::new();
+
+ for (name, df) in domain_list {
+ let reader = ColumnReader::new(df);
+ let mut key_values: HashMap> = HashMap::new();
+
+ // Extract common key variables: --SEQ, --GRPID, --REFID, --LNKID
+ let domain_prefix = name.to_uppercase();
+ let seq_var = format!("{}SEQ", domain_prefix);
+ let grpid_var = format!("{}GRPID", domain_prefix);
+ let refid_var = format!("{}REFID", domain_prefix);
+ let lnkid_var = format!("{}LNKID", domain_prefix);
+
+ for var in [&seq_var, &grpid_var, &refid_var, &lnkid_var] {
+ if let Some(values) = reader.values(var) {
+ let mut set = HashSet::new();
+ for (_, val) in values {
+ let trimmed = val.trim();
+ if !trimmed.is_empty() {
+ set.insert(trimmed.to_string());
+ }
+ }
+ if !set.is_empty() {
+ key_values.insert(var.clone(), set);
+ }
+ }
+ }
+
+ // Also check generic variable names (VISITNUM, etc.)
+ for var in ["VISITNUM"] {
+ if let Some(values) = reader.values(var) {
+ let mut set = HashSet::new();
+ for (_, val) in values {
+ let trimmed = val.trim();
+ if !trimmed.is_empty() {
+ set.insert(trimmed.to_string());
+ }
+ }
+ if !set.is_empty() {
+ key_values.insert(var.to_string(), set);
+ }
+ }
+ }
+
+ domains.insert(name.to_uppercase(), (*df, key_values));
+ }
+
+ Self { domains }
+ }
+
+ /// Check if a reference exists in the context.
+ pub fn reference_exists(&self, rdomain: &str, idvar: &str, idvarval: &str) -> bool {
+ let domain_upper = rdomain.to_uppercase();
+ let Some((_, key_values)) = self.domains.get(&domain_upper) else {
+ return false;
+ };
+
+ // Check if the IDVAR exists and contains the IDVARVAL
+ key_values
+ .get(idvar)
+ .is_some_and(|values| values.contains(idvarval))
+ }
+}
+
+/// Check RELREC references point to existing records.
+///
+/// Per SDTM-IG 8.2:
+/// - RDOMAIN + IDVAR + IDVARVAL should reference existing records
+/// - For dataset relationships, USUBJID and IDVARVAL are null
+///
+/// # Arguments
+/// * `relrec_df` - RELREC DataFrame
+/// * `context` - Context containing all domain data
+///
+/// # Returns
+/// A vector of issues grouped by RDOMAIN.
+pub fn check_relrec(relrec_df: &DataFrame, context: &RelrecContext) -> Vec {
+ let reader = ColumnReader::new(relrec_df);
+
+ let rdomain_col = reader.values("RDOMAIN");
+ let idvar_col = reader.values("IDVAR");
+ let idvarval_col = reader.values("IDVARVAL");
+ let usubjid_col = reader.values("USUBJID");
+
+ let Some(rdomains) = rdomain_col else {
+ return vec![];
+ };
+ let Some(idvars) = idvar_col else {
+ return vec![];
+ };
+ let Some(idvarvals) = idvarval_col else {
+ return vec![];
+ };
+
+ // Group invalid references by RDOMAIN
+ let mut invalid_by_domain: HashMap)> = HashMap::new();
+
+ // Handle case where USUBJID might be null (dataset-level relationships)
+ let usubjids: Vec = usubjid_col
+ .map(|col| col.map(|(_, v)| v.to_string()).collect())
+ .unwrap_or_else(|| vec!["".to_string(); rdomains.size_hint().0]);
+
+ for (((_, rdomain), (_, idvar)), ((_, idvarval), usubjid)) in
+ rdomains.zip(idvars).zip(idvarvals.zip(usubjids.iter()))
+ {
+ let rdomain = rdomain.trim();
+ let idvar = idvar.trim();
+ let idvarval = idvarval.trim();
+ let usubjid = usubjid.trim();
+
+ // Skip dataset-level relationships (USUBJID and IDVARVAL are null)
+ if usubjid.is_empty() && idvarval.is_empty() {
+ continue;
+ }
+
+ // Skip empty values
+ if rdomain.is_empty() || idvar.is_empty() || idvarval.is_empty() {
+ continue;
+ }
+
+ // Check if reference exists
+ if !context.reference_exists(rdomain, idvar, idvarval) {
+ let entry = invalid_by_domain
+ .entry(rdomain.to_string())
+ .or_insert((0, Vec::new()));
+ entry.0 += 1;
+ if entry.1.len() < 5 {
+ entry.1.push(format!("{}={}", idvar, idvarval));
+ }
+ }
+ }
+
+ // Convert to issues
+ invalid_by_domain
+ .into_iter()
+ .map(
+ |(rdomain, (count, samples))| Issue::RelrecInvalidReference {
+ rdomain,
+ invalid_count: count,
+ samples,
+ },
+ )
+ .collect()
+}
+
#[cfg(test)]
mod tests {
use super::*;
@@ -188,4 +560,319 @@ mod tests {
_ => panic!("Expected UsubjidNotInDm issue"),
}
}
+
+ // =========================================================================
+ // RDOMAIN VALIDATION TESTS
+ // =========================================================================
+
+ #[test]
+ fn test_check_rdomain_valid_all_valid() {
+ let co = df! {
+ "USUBJID" => &["STUDY-001", "STUDY-002"],
+ "RDOMAIN" => &["AE", "LB"],
+ "COVAL" => &["Comment 1", "Comment 2"],
+ }
+ .unwrap();
+
+ let valid_domains: HashSet = ["DM", "AE", "LB", "CO"]
+ .into_iter()
+ .map(String::from)
+ .collect();
+
+ let issues = check_rdomain_valid("CO", &co, &valid_domains);
+ assert!(issues.is_empty());
+ }
+
+ #[test]
+ fn test_check_rdomain_valid_invalid_references() {
+ let co = df! {
+ "USUBJID" => &["STUDY-001", "STUDY-002", "STUDY-003"],
+ "RDOMAIN" => &["AE", "XX", "YY"],
+ "COVAL" => &["Comment 1", "Comment 2", "Comment 3"],
+ }
+ .unwrap();
+
+ let valid_domains: HashSet =
+ ["DM", "AE", "LB"].into_iter().map(String::from).collect();
+
+ let issues = check_rdomain_valid("CO", &co, &valid_domains);
+ assert_eq!(issues.len(), 1);
+
+ match &issues[0] {
+ Issue::InvalidRdomain {
+ domain,
+ invalid_count,
+ samples,
+ } => {
+ assert_eq!(domain, "CO");
+ assert_eq!(*invalid_count, 2);
+ assert!(samples.contains(&"XX".to_string()));
+ assert!(samples.contains(&"YY".to_string()));
+ }
+ _ => panic!("Expected InvalidRdomain issue"),
+ }
+ }
+
+ #[test]
+ fn test_check_rdomain_valid_empty_rdomain_allowed() {
+ // Empty RDOMAIN is valid for standalone comments
+ let co = df! {
+ "USUBJID" => &["STUDY-001"],
+ "RDOMAIN" => &[""],
+ "COVAL" => &["General study comment"],
+ }
+ .unwrap();
+
+ let valid_domains: HashSet = ["DM", "AE"].into_iter().map(String::from).collect();
+
+ let issues = check_rdomain_valid("CO", &co, &valid_domains);
+ assert!(issues.is_empty());
+ }
+
+ // =========================================================================
+ // RELSUB VALIDATION TESTS
+ // =========================================================================
+
+ #[test]
+ fn test_check_relsub_all_valid() {
+ // Bidirectional twin relationship
+ let relsub = df! {
+ "USUBJID" => &["STUDY-001", "STUDY-002"],
+ "RSUBJID" => &["STUDY-002", "STUDY-001"],
+ "SREL" => &["TWIN", "TWIN"],
+ }
+ .unwrap();
+
+ let dm_subjects: HashSet = ["STUDY-001", "STUDY-002", "STUDY-003"]
+ .into_iter()
+ .map(String::from)
+ .collect();
+
+ let issues = check_relsub(&relsub, &dm_subjects);
+ assert!(issues.is_empty());
+ }
+
+ #[test]
+ fn test_check_relsub_rsubjid_not_in_dm() {
+ let relsub = df! {
+ "USUBJID" => &["STUDY-001", "STUDY-002"],
+ "RSUBJID" => &["STUDY-999", "STUDY-001"],
+ "SREL" => &["TWIN", "TWIN"],
+ }
+ .unwrap();
+
+ let dm_subjects: HashSet = ["STUDY-001", "STUDY-002"]
+ .into_iter()
+ .map(String::from)
+ .collect();
+
+ let issues = check_relsub(&relsub, &dm_subjects);
+
+ // Should have RelsubNotInDm issue
+ let rsubjid_issue = issues
+ .iter()
+ .find(|i| matches!(i, Issue::RelsubNotInDm { .. }));
+ assert!(rsubjid_issue.is_some());
+
+ match rsubjid_issue.unwrap() {
+ Issue::RelsubNotInDm {
+ missing_count,
+ samples,
+ } => {
+ assert_eq!(*missing_count, 1);
+ assert!(samples.contains(&"STUDY-999".to_string()));
+ }
+ _ => panic!("Expected RelsubNotInDm issue"),
+ }
+ }
+
+ #[test]
+ fn test_check_relsub_not_bidirectional() {
+ // Unidirectional relationship - missing reciprocal
+ let relsub = df! {
+ "USUBJID" => &["STUDY-001"],
+ "RSUBJID" => &["STUDY-002"],
+ "SREL" => &["PARENT"],
+ }
+ .unwrap();
+
+ let dm_subjects: HashSet = ["STUDY-001", "STUDY-002"]
+ .into_iter()
+ .map(String::from)
+ .collect();
+
+ let issues = check_relsub(&relsub, &dm_subjects);
+
+ let bidirectional_issue = issues
+ .iter()
+ .find(|i| matches!(i, Issue::RelsubNotBidirectional { .. }));
+ assert!(bidirectional_issue.is_some());
+
+ match bidirectional_issue.unwrap() {
+ Issue::RelsubNotBidirectional {
+ missing_count,
+ samples,
+ } => {
+ assert_eq!(*missing_count, 1);
+ assert!(samples.contains(&"STUDY-001→STUDY-002".to_string()));
+ }
+ _ => panic!("Expected RelsubNotBidirectional issue"),
+ }
+ }
+
+ // =========================================================================
+ // RELSPEC VALIDATION TESTS
+ // =========================================================================
+
+ #[test]
+ fn test_check_relspec_valid_parent_chain() {
+ // Parent specimen → Child specimen
+ let relspec = df! {
+ "USUBJID" => &["STUDY-001", "STUDY-001"],
+ "REFID" => &["SPEC-001", "SPEC-002"],
+ "PARENT" => &["", "SPEC-001"], // SPEC-002 derived from SPEC-001
+ "LEVEL" => &[1i32, 2i32],
+ }
+ .unwrap();
+
+ let issues = check_relspec(&relspec);
+ assert!(issues.is_empty());
+ }
+
+ #[test]
+ fn test_check_relspec_invalid_parent() {
+ // PARENT references non-existent REFID
+ let relspec = df! {
+ "USUBJID" => &["STUDY-001", "STUDY-001"],
+ "REFID" => &["SPEC-001", "SPEC-002"],
+ "PARENT" => &["", "SPEC-999"], // SPEC-999 doesn't exist
+ "LEVEL" => &[1i32, 2i32],
+ }
+ .unwrap();
+
+ let issues = check_relspec(&relspec);
+ assert_eq!(issues.len(), 1);
+
+ match &issues[0] {
+ Issue::RelspecInvalidParent {
+ invalid_count,
+ samples,
+ } => {
+ assert_eq!(*invalid_count, 1);
+ assert!(samples.contains(&"STUDY-001:SPEC-999".to_string()));
+ }
+ _ => panic!("Expected RelspecInvalidParent issue"),
+ }
+ }
+
+ #[test]
+ fn test_check_relspec_parent_from_different_subject() {
+ // PARENT should be within same subject
+ let relspec = df! {
+ "USUBJID" => &["STUDY-001", "STUDY-002"],
+ "REFID" => &["SPEC-001", "SPEC-002"],
+ "PARENT" => &["", "SPEC-001"], // SPEC-001 is from STUDY-001, not STUDY-002
+ "LEVEL" => &[1i32, 2i32],
+ }
+ .unwrap();
+
+ let issues = check_relspec(&relspec);
+ assert_eq!(issues.len(), 1); // Should flag cross-subject reference
+ }
+
+ // =========================================================================
+ // RELREC VALIDATION TESTS
+ // =========================================================================
+
+ #[test]
+ fn test_check_relrec_valid_references() {
+ let relrec = df! {
+ "STUDYID" => &["STUDY", "STUDY"],
+ "USUBJID" => &["STUDY-001", "STUDY-001"],
+ "RDOMAIN" => &["AE", "CM"],
+ "IDVAR" => &["AESEQ", "CMSEQ"],
+ "IDVARVAL" => &["1", "1"],
+ "RELID" => &["REL1", "REL1"],
+ }
+ .unwrap();
+
+ let ae = df! {
+ "USUBJID" => &["STUDY-001"],
+ "AESEQ" => &["1"],
+ "AETERM" => &["HEADACHE"],
+ }
+ .unwrap();
+
+ let cm = df! {
+ "USUBJID" => &["STUDY-001"],
+ "CMSEQ" => &["1"],
+ "CMTRT" => &["ASPIRIN"],
+ }
+ .unwrap();
+
+ let domains: Vec<(&str, &DataFrame)> = vec![("AE", &ae), ("CM", &cm)];
+ let context = RelrecContext::new(&domains);
+
+ let issues = check_relrec(&relrec, &context);
+ assert!(issues.is_empty());
+ }
+
+ #[test]
+ fn test_check_relrec_invalid_reference() {
+ let relrec = df! {
+ "STUDYID" => &["STUDY"],
+ "USUBJID" => &["STUDY-001"],
+ "RDOMAIN" => &["AE"],
+ "IDVAR" => &["AESEQ"],
+ "IDVARVAL" => &["999"], // Doesn't exist
+ "RELID" => &["REL1"],
+ }
+ .unwrap();
+
+ let ae = df! {
+ "USUBJID" => &["STUDY-001"],
+ "AESEQ" => &["1"],
+ "AETERM" => &["HEADACHE"],
+ }
+ .unwrap();
+
+ let domains: Vec<(&str, &DataFrame)> = vec![("AE", &ae)];
+ let context = RelrecContext::new(&domains);
+
+ let issues = check_relrec(&relrec, &context);
+ assert_eq!(issues.len(), 1);
+
+ match &issues[0] {
+ Issue::RelrecInvalidReference {
+ rdomain,
+ invalid_count,
+ samples,
+ } => {
+ assert_eq!(rdomain, "AE");
+ assert_eq!(*invalid_count, 1);
+ assert!(samples.contains(&"AESEQ=999".to_string()));
+ }
+ _ => panic!("Expected RelrecInvalidReference issue"),
+ }
+ }
+
+ #[test]
+ fn test_check_relrec_dataset_level_relationship() {
+ // Dataset-level relationships have null USUBJID and IDVARVAL
+ let relrec = df! {
+ "STUDYID" => &["STUDY"],
+ "USUBJID" => &[""], // Null for dataset-level
+ "RDOMAIN" => &["SUPPAE"],
+ "IDVAR" => &["QNAM"],
+ "IDVARVAL" => &[""], // Null for dataset-level
+ "RELID" => &["REL1"],
+ }
+ .unwrap();
+
+ let domains: Vec<(&str, &DataFrame)> = vec![];
+ let context = RelrecContext::new(&domains);
+
+ let issues = check_relrec(&relrec, &context);
+ assert!(issues.is_empty()); // Should skip dataset-level relationships
+ }
}
diff --git a/crates/tss-submit/src/validate/checks/ct.rs b/crates/tss-submit/src/validate/checks/ct.rs
index 6bcfe131..8efa9e5f 100644
--- a/crates/tss-submit/src/validate/checks/ct.rs
+++ b/crates/tss-submit/src/validate/checks/ct.rs
@@ -52,15 +52,16 @@ fn check_ct_values(
return None;
}
+ // Capture total count before truncating sample list
+ let total_invalid = invalid.len() as u64;
let invalid_values: Vec = invalid.into_iter().take(MAX_INVALID_VALUES).collect();
- let invalid_count = invalid_values.len() as u64;
Some(Issue::CtViolation {
variable: variable.name.clone(),
codelist_code: ct.code.clone(),
codelist_name: ct.name.clone(),
extensible: ct.extensible,
- invalid_count,
+ total_invalid,
invalid_values,
allowed_count: ct.terms.len(),
})
diff --git a/crates/tss-submit/src/validate/issue.rs b/crates/tss-submit/src/validate/issue.rs
index 1fb0ddc0..9aae2a0b 100644
--- a/crates/tss-submit/src/validate/issue.rs
+++ b/crates/tss-submit/src/validate/issue.rs
@@ -16,6 +16,8 @@ pub enum Severity {
Error,
/// Should review
Warning,
+ /// Informational - no action required
+ Info,
}
impl Severity {
@@ -25,6 +27,7 @@ impl Severity {
"reject" => Some(Self::Reject),
"error" => Some(Self::Error),
"warning" => Some(Self::Warning),
+ "info" => Some(Self::Info),
_ => None,
}
}
@@ -35,6 +38,7 @@ impl Severity {
Self::Reject => "Reject",
Self::Error => "Error",
Self::Warning => "Warning",
+ Self::Info => "Info",
}
}
}
@@ -89,7 +93,9 @@ pub enum Issue {
codelist_code: String,
codelist_name: String,
extensible: bool,
- invalid_count: u64,
+ /// Total count of distinct invalid values (not truncated)
+ total_invalid: u64,
+ /// Sample of invalid values (up to 5)
invalid_values: Vec,
allowed_count: usize,
},
@@ -108,6 +114,35 @@ pub enum Issue {
missing_count: u64,
samples: Vec,
},
+
+ // Special domain cross-reference checks (#38)
+ /// RDOMAIN references a domain that doesn't exist in the submission
+ InvalidRdomain {
+ domain: String,
+ invalid_count: u64,
+ samples: Vec,
+ },
+ /// RSUBJID values not found in DM domain
+ RelsubNotInDm {
+ missing_count: u64,
+ samples: Vec,
+ },
+ /// RELSUB relationship is not bidirectional (missing reciprocal record)
+ RelsubNotBidirectional {
+ missing_count: u64,
+ samples: Vec,
+ },
+ /// RELSPEC PARENT references non-existent REFID
+ RelspecInvalidParent {
+ invalid_count: u64,
+ samples: Vec,
+ },
+ /// RELREC references a record that doesn't exist
+ RelrecInvalidReference {
+ rdomain: String,
+ invalid_count: u64,
+ samples: Vec,
+ },
}
impl Issue {
@@ -126,6 +161,12 @@ impl Issue {
// Cross-domain issues use USUBJID or the specific variable
Issue::UsubjidNotInDm { .. } => "USUBJID",
Issue::ParentNotFound { variable, .. } => variable,
+ // Special domain cross-reference issues
+ Issue::InvalidRdomain { .. } => "RDOMAIN",
+ Issue::RelsubNotInDm { .. } => "RSUBJID",
+ Issue::RelsubNotBidirectional { .. } => "SREL",
+ Issue::RelspecInvalidParent { .. } => "PARENT",
+ Issue::RelrecInvalidReference { .. } => "IDVARVAL",
}
}
@@ -144,9 +185,15 @@ impl Issue {
Issue::DuplicateSequence {
duplicate_count, ..
} => Some(*duplicate_count),
- Issue::CtViolation { invalid_count, .. } => Some(*invalid_count),
+ Issue::CtViolation { total_invalid, .. } => Some(*total_invalid),
Issue::UsubjidNotInDm { missing_count, .. } => Some(*missing_count),
Issue::ParentNotFound { missing_count, .. } => Some(*missing_count),
+ // Special domain cross-reference issues
+ Issue::InvalidRdomain { invalid_count, .. } => Some(*invalid_count),
+ Issue::RelsubNotInDm { missing_count, .. } => Some(*missing_count),
+ Issue::RelsubNotBidirectional { missing_count, .. } => Some(*missing_count),
+ Issue::RelspecInvalidParent { invalid_count, .. } => Some(*invalid_count),
+ Issue::RelrecInvalidReference { invalid_count, .. } => Some(*invalid_count),
}
}
@@ -170,6 +217,12 @@ impl Issue {
// Cross-domain reference checks
Issue::UsubjidNotInDm { .. } => Category::CrossReference,
Issue::ParentNotFound { .. } => Category::CrossReference,
+ // Special domain cross-reference checks
+ Issue::InvalidRdomain { .. } => Category::CrossReference,
+ Issue::RelsubNotInDm { .. } => Category::CrossReference,
+ Issue::RelsubNotBidirectional { .. } => Category::CrossReference,
+ Issue::RelspecInvalidParent { .. } => Category::CrossReference,
+ Issue::RelrecInvalidReference { .. } => Category::CrossReference,
}
}
@@ -180,10 +233,16 @@ impl Issue {
Issue::TextTooLong { .. } => Severity::Warning,
Issue::CtViolation {
extensible: true, ..
- } => Severity::Warning,
+ } => Severity::Info,
// Cross-domain reference issues are errors (data integrity)
Issue::UsubjidNotInDm { .. } => Severity::Error,
Issue::ParentNotFound { .. } => Severity::Error,
+ // Special domain cross-reference issues
+ Issue::InvalidRdomain { .. } => Severity::Error,
+ Issue::RelsubNotInDm { .. } => Severity::Error,
+ Issue::RelsubNotBidirectional { .. } => Severity::Warning,
+ Issue::RelspecInvalidParent { .. } => Severity::Error,
+ Issue::RelrecInvalidReference { .. } => Severity::Error,
_ => Severity::Error,
}
}
@@ -277,24 +336,37 @@ impl Issue {
variable,
codelist_name,
extensible,
- invalid_count,
+ total_invalid,
invalid_values,
..
} => {
- let ext_str = if *extensible {
- " (extensible)"
- } else {
- " (non-extensible)"
- };
+ let sample_count = invalid_values.len() as u64;
let values_str = if invalid_values.is_empty() {
String::new()
+ } else if sample_count < *total_invalid {
+ format!(
+ " (showing {} of {}): {}",
+ sample_count,
+ total_invalid,
+ invalid_values.join(", ")
+ )
} else {
format!(": {}", invalid_values.join(", "))
};
- format!(
- "Variable {} has {} values not in codelist {}{}{}",
- variable, invalid_count, codelist_name, ext_str, values_str
- )
+
+ if *extensible {
+ // Info: custom values are allowed per CDISC for extensible codelists
+ format!(
+ "Variable {} uses {} custom values not in codelist {} (allowed - extensible codelist){}",
+ variable, total_invalid, codelist_name, values_str
+ )
+ } else {
+ // Error: non-extensible codelist, values must be in codelist
+ format!(
+ "Variable {} has {} invalid values not in codelist {} (non-extensible){}",
+ variable, total_invalid, codelist_name, values_str
+ )
+ }
}
Issue::UsubjidNotInDm {
@@ -329,6 +401,84 @@ impl Issue {
variable, missing_count, parent_domain, sample_str
)
}
+
+ // Special domain cross-reference issues
+ Issue::InvalidRdomain {
+ domain,
+ invalid_count,
+ samples,
+ } => {
+ let sample_str = if samples.is_empty() {
+ String::new()
+ } else {
+ format!(": {}", samples.join(", "))
+ };
+ format!(
+ "{} domain has {} RDOMAIN values referencing non-existent domains{}",
+ domain, invalid_count, sample_str
+ )
+ }
+
+ Issue::RelsubNotInDm {
+ missing_count,
+ samples,
+ } => {
+ let sample_str = if samples.is_empty() {
+ String::new()
+ } else {
+ format!(" (e.g., {})", samples.join(", "))
+ };
+ format!(
+ "RELSUB has {} RSUBJID values not found in DM{}",
+ missing_count, sample_str
+ )
+ }
+
+ Issue::RelsubNotBidirectional {
+ missing_count,
+ samples,
+ } => {
+ let sample_str = if samples.is_empty() {
+ String::new()
+ } else {
+ format!(" (e.g., {})", samples.join(", "))
+ };
+ format!(
+ "RELSUB has {} relationships without reciprocal records{}",
+ missing_count, sample_str
+ )
+ }
+
+ Issue::RelspecInvalidParent {
+ invalid_count,
+ samples,
+ } => {
+ let sample_str = if samples.is_empty() {
+ String::new()
+ } else {
+ format!(" (e.g., {})", samples.join(", "))
+ };
+ format!(
+ "RELSPEC has {} PARENT values referencing non-existent REFID{}",
+ invalid_count, sample_str
+ )
+ }
+
+ Issue::RelrecInvalidReference {
+ rdomain,
+ invalid_count,
+ samples,
+ } => {
+ let sample_str = if samples.is_empty() {
+ String::new()
+ } else {
+ format!(" (e.g., {})", samples.join(", "))
+ };
+ format!(
+ "RELREC has {} references to non-existent records in {}{}",
+ invalid_count, rdomain, sample_str
+ )
+ }
}
}
}
diff --git a/crates/tss-submit/src/validate/mod.rs b/crates/tss-submit/src/validate/mod.rs
index fb0d135a..0d98efab 100644
--- a/crates/tss-submit/src/validate/mod.rs
+++ b/crates/tss-submit/src/validate/mod.rs
@@ -33,7 +33,7 @@ pub mod rules;
mod util;
use polars::prelude::DataFrame;
-use std::collections::BTreeSet;
+use std::collections::{BTreeSet, HashSet};
use tss_standards::SdtmDomain;
use tss_standards::TerminologyRegistry;
@@ -87,6 +87,10 @@ pub fn validate_domain_with_not_collected(
///
/// Checks that:
/// - All USUBJIDs in non-DM domains exist in the DM domain
+/// - RDOMAIN values in CO/RELREC reference valid domains
+/// - RELSUB RSUBJID exists in DM and relationships are bidirectional
+/// - RELSPEC PARENT references valid REFID within subject
+/// - RELREC references point to existing records
///
/// # Arguments
/// * `domains` - List of (domain_name, DataFrame) pairs
@@ -115,17 +119,57 @@ pub fn validate_cross_domain(domains: &[(&str, &DataFrame)]) -> Vec<(String, Vec
);
}
- // Check each non-DM domain
- domains
+ // Build set of valid domain codes for RDOMAIN validation
+ let valid_domains: HashSet = domains
.iter()
- .filter(|(name, _)| *name != "DM")
- .filter_map(|(name, df)| {
- let issues = checks::cross_domain::check_usubjid_in_dm(name, df, &dm_subjects);
- if issues.is_empty() {
- None
- } else {
- Some((name.to_string(), issues))
- }
- })
- .collect()
+ .map(|(name, _)| name.to_uppercase())
+ .collect();
+
+ let mut results = Vec::new();
+
+ // Check each domain
+ for (name, df) in domains {
+ let name_upper = name.to_uppercase();
+ let mut domain_issues = Vec::new();
+
+ // Skip DM for USUBJID check (it's the reference)
+ if name_upper != "DM" {
+ domain_issues.extend(checks::cross_domain::check_usubjid_in_dm(
+ name,
+ df,
+ &dm_subjects,
+ ));
+ }
+
+ // RDOMAIN validation for CO and RELREC
+ if name_upper == "CO" || name_upper == "RELREC" {
+ domain_issues.extend(checks::cross_domain::check_rdomain_valid(
+ name,
+ df,
+ &valid_domains,
+ ));
+ }
+
+ // RELSUB-specific validation
+ if name_upper == "RELSUB" {
+ domain_issues.extend(checks::cross_domain::check_relsub(df, &dm_subjects));
+ }
+
+ // RELSPEC-specific validation
+ if name_upper == "RELSPEC" {
+ domain_issues.extend(checks::cross_domain::check_relspec(df));
+ }
+
+ // RELREC-specific validation (record references)
+ if name_upper == "RELREC" {
+ let context = checks::cross_domain::RelrecContext::new(domains);
+ domain_issues.extend(checks::cross_domain::check_relrec(df, &context));
+ }
+
+ if !domain_issues.is_empty() {
+ results.push((name.to_string(), domain_issues));
+ }
+ }
+
+ results
}
diff --git a/crates/tss-submit/src/validate/report.rs b/crates/tss-submit/src/validate/report.rs
index 61d4a97a..c7264e56 100644
--- a/crates/tss-submit/src/validate/report.rs
+++ b/crates/tss-submit/src/validate/report.rs
@@ -56,13 +56,14 @@ impl ValidationReport {
self.error_count() > 0
}
- /// Get issues sorted by severity (Reject first, then Error, then Warning).
+ /// Get issues sorted by severity (Reject first, then Error, Warning, Info).
pub fn sorted_by_severity(&self) -> Vec<&Issue> {
let mut issues: Vec<_> = self.issues.iter().collect();
issues.sort_by_key(|i| match i.severity() {
Severity::Reject => 0,
Severity::Error => 1,
Severity::Warning => 2,
+ Severity::Info => 3,
});
issues
}
diff --git a/resources/technical/MAPPING_NOT_COLLECTED.md b/resources/technical/MAPPING_NOT_COLLECTED.md
deleted file mode 100644
index fc2d7eda..00000000
--- a/resources/technical/MAPPING_NOT_COLLECTED.md
+++ /dev/null
@@ -1,21 +0,0 @@
-## Define-XML Integration (Future)
-
-The `not_collected` map stores variable → reason pairs that will be used when
-generating Define-XML:
-
-```xml
-
-
-
-Reference Start Date/Time
-
-
-
-
-
-
-
- Data not collected in this study
-
-
-```