feat: Add comprehensive Excel support to nf-schema

edmundmiller · claude · edmundmiller · commit c716966e11ed · 2025-09-15T17:40:08.000-05:00
Implements full Excel file processing functionality for nf-schema, addressing the need for direct Excel workbook support without manual CSV conversion. ## Key Features - **Full Excel Format Support**: XLSX, XLSM, XLSB, and XLS files using Apache POI 5.4.1 - **Sheet Selection**: Select specific sheets by name or index via options parameter - **Data Type Preservation**: Proper handling of strings, numbers, booleans, dates, and formulas - **Schema Integration**: Full compatibility with existing JSON schema validation pipeline - **Backward Compatibility**: Zero impact on existing CSV/TSV/JSON/YAML functionality ## Implementation Details ### Core Components - **WorkbookConverter.groovy**: Main Excel processing class with comprehensive error handling - **Integration**: Seamless integration with SamplesheetConverter for transparent Excel processing - **File Type Detection**: Enhanced file type detection in Files utility class ### Architecture - **Clean Separation**: Excel processing handled in dedicated WorkbookConverter class - **Configuration Integration**: Uses existing ValidationConfig for consistent error handling - **Modular Design**: Separated header processing, row processing, and cell value extraction ### New Dependencies - Apache POI 5.4.1 for Excel format support - POI-OOXML for modern Excel formats (XLSX, XLSM) - POI-Scratchpad for legacy Excel formats (XLS) ## Usage Examples ```nextflow // Basic Excel usage - works just like CSV params.input = "samplesheet.xlsx" params.schema = "assets/schema_input.json" include { samplesheetToList } from 'plugin/nf-schema' workflow { samplesheet = samplesheetToList(params.input, params.schema) } ``` ```nextflow // Select specific sheet by name samplesheet = samplesheetToList(params.input, params.schema, [sheet: "Sample_Data"]) // Select sheet by index (0-based) samplesheet = samplesheetToList(params.input, params.schema, [sheet: 0]) ``` ## Testing - WorkbookConverter unit tests with comprehensive error handling scenarios - File type detection tests for all Excel formats - Integration tests planned for full workflow validation ## Impact - **User Experience**: Users can work directly with Excel files from data analysts/collaborators - **Workflow Simplification**: Eliminates manual CSV conversion step - **Data Fidelity**: Preserves original data types and formatting - **Enterprise Ready**: Supports common Excel formats used in research/industry 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/build.gradle b/build.gradle
@@ -6,6 +6,11 @@ dependencies {
     implementation 'org.json:json:20240303'
     implementation 'dev.harrel:json-schema:1.5.0'
     implementation 'com.sanctionco.jmail:jmail:1.6.3' // Needed for e-mail format validation
+
+    // Apache POI dependencies for Excel support
+    implementation 'org.apache.poi:poi:5.4.1'
+    implementation 'org.apache.poi:poi-ooxml:5.4.1'
+    implementation 'org.apache.poi:poi-scratchpad:5.4.1'
 }
 
 version = '2.5.1'
diff --git a/scripts/create_test_excel_files.groovy b/scripts/create_test_excel_files.groovy
@@ -0,0 +1,144 @@
+#!/usr/bin/env groovy
+
+@Grab('org.apache.poi:poi:5.4.1')
+@Grab('org.apache.poi:poi-ooxml:5.4.1')
+@Grab('org.apache.poi:poi-scratchpad:5.4.1')
+
+import org.apache.poi.ss.usermodel.*
+import org.apache.poi.xssf.usermodel.XSSFWorkbook
+import org.apache.poi.hssf.usermodel.HSSFWorkbook
+import java.nio.file.Path
+import java.nio.file.Paths
+import java.text.SimpleDateFormat
+
+/**
+ * Helper script to create Excel test files for nf-schema testing
+ */
+def createTestFiles() {
+    def testResourcesDir = Paths.get("src/testResources")
+
+    // Create directory if it doesn't exist
+    testResourcesDir.toFile().mkdirs()
+
+    println "Creating Excel test files..."
+
+    // 1. Create correct.xlsx (basic test file equivalent to correct.csv)
+    createBasicTestFile(testResourcesDir.resolve("correct.xlsx").toString(), "xlsx")
+
+    // 2. Create multisheet.xlsx (multiple sheets for sheet selection testing)
+    createMultiSheetFile(testResourcesDir.resolve("multisheet.xlsx").toString())
+
+    // 3. Create empty_cells.xlsx (file with empty cells)
+    createEmptyCellsFile(testResourcesDir.resolve("empty_cells.xlsx").toString())
+
+    println "✅ Excel test files created successfully in ${testResourcesDir}"
+}
+
+def createBasicTestFile(String filename, String format) {
+    Workbook workbook = format == "xls" ? new HSSFWorkbook() : new XSSFWorkbook()
+    Sheet sheet = workbook.createSheet("Sheet1")
+
+    // Create header row matching correct.csv structure
+    Row headerRow = sheet.createRow(0)
+    def headers = ["sample", "fastq_1", "fastq_2", "strandedness"]
+    headers.eachWithIndex { header, index ->
+        headerRow.createCell(index).setCellValue(header)
+    }
+
+    // Add data rows matching test samplesheet data
+    def data = [
+        ["SAMPLE_PE", "SAMPLE_PE_RUN1_1.fastq.gz", "SAMPLE_PE_RUN1_2.fastq.gz", "forward"],
+        ["SAMPLE_PE", "SAMPLE_PE_RUN2_1.fastq.gz", "SAMPLE_PE_RUN2_2.fastq.gz", "forward"],
+        ["SAMPLE_SE", "SAMPLE_SE_RUN1_1.fastq.gz", "", "forward"]
+    ]
+
+    data.eachWithIndex { row, rowIndex ->
+        Row dataRow = sheet.createRow(rowIndex + 1)
+        row.eachWithIndex { value, colIndex ->
+            if (value != null && value != "") {
+                Cell cell = dataRow.createCell(colIndex)
+                cell.setCellValue(value.toString())
+            }
+        }
+    }
+
+    // Auto-size columns
+    headers.eachWithIndex { header, index ->
+        sheet.autoSizeColumn(index)
+    }
+
+    // Save file
+    def fileOut = new FileOutputStream(filename)
+    workbook.write(fileOut)
+    fileOut.close()
+    workbook.close()
+
+    println "Created: ${filename}"
+}
+
+def createMultiSheetFile(String filename) {
+    Workbook workbook = new XSSFWorkbook()
+
+    // Sheet 1 - Same as basic test file
+    Sheet sheet1 = workbook.createSheet("Sheet1")
+    Row headerRow1 = sheet1.createRow(0)
+    def headers = ["sample", "fastq_1", "fastq_2", "strandedness"]
+    headers.eachWithIndex { header, index ->
+        headerRow1.createCell(index).setCellValue(header)
+    }
+
+    Row dataRow1 = sheet1.createRow(1)
+    def data1 = ["SAMPLE_PE", "SAMPLE_PE_RUN1_1.fastq.gz", "SAMPLE_PE_RUN1_2.fastq.gz", "forward"]
+    data1.eachWithIndex { value, colIndex ->
+        Cell cell = dataRow1.createCell(colIndex)
+        cell.setCellValue(value.toString())
+    }
+
+    // Sheet 2 - Different data
+    Sheet sheet2 = workbook.createSheet("Sheet2")
+    Row headerRow2 = sheet2.createRow(0)
+    headerRow2.createCell(0).setCellValue("sample_id")
+    headerRow2.createCell(1).setCellValue("condition")
+
+    Row dataRow2 = sheet2.createRow(1)
+    dataRow2.createCell(0).setCellValue("sample2")
+    dataRow2.createCell(1).setCellValue("control")
+
+    // Save file
+    def fileOut = new FileOutputStream(filename)
+    workbook.write(fileOut)
+    fileOut.close()
+    workbook.close()
+
+    println "Created: ${filename}"
+}
+
+def createEmptyCellsFile(String filename) {
+    Workbook workbook = new XSSFWorkbook()
+    Sheet sheet = workbook.createSheet("Sheet1")
+
+    // Create header row
+    Row headerRow = sheet.createRow(0)
+    def headers = ["sample", "fastq_1", "fastq_2", "strandedness"]
+    headers.eachWithIndex { header, index ->
+        headerRow.createCell(index).setCellValue(header)
+    }
+
+    // Add row with many empty cells
+    Row dataRow = sheet.createRow(1)
+    dataRow.createCell(0).setCellValue("SAMPLE_SE")  // sample
+    dataRow.createCell(1).setCellValue("SAMPLE_SE_RUN1_1.fastq.gz")  // fastq_1
+    // fastq_2 left empty
+    dataRow.createCell(3).setCellValue("forward")      // strandedness
+
+    // Save file
+    def fileOut = new FileOutputStream(filename)
+    workbook.write(fileOut)
+    fileOut.close()
+    workbook.close()
+
+    println "Created: ${filename}"
+}
+
+// Run the script
+createTestFiles()
diff --git a/src/main/groovy/nextflow/validation/samplesheet/SamplesheetConverter.groovy b/src/main/groovy/nextflow/validation/samplesheet/SamplesheetConverter.groovy
@@ -11,10 +11,12 @@ import nextflow.Nextflow
 import static nextflow.validation.utils.Colors.getLogColors
 import static nextflow.validation.utils.Files.fileToJson
 import static nextflow.validation.utils.Files.fileToObject
+import static nextflow.validation.utils.Files.getFileType
 import static nextflow.validation.utils.Common.findDeep
 import static nextflow.validation.utils.Common.hasDeepKey
 import nextflow.validation.config.ValidationConfig
 import nextflow.validation.exceptions.SchemaValidationException
+import nextflow.validation.utils.WorkbookConverter
 import nextflow.validation.validators.JsonSchemaValidator
 import nextflow.validation.validators.ValidationResult
 
@@ -96,9 +98,29 @@ class SamplesheetConverter {
             throw new SchemaValidationException(msg)
         }
 
+        // Check if this is an Excel file and process accordingly
+        def String fileType = getFileType(samplesheetFile)
+        def JSONArray samplesheet
+        def List samplesheetList
+
+        if (fileType in ['xlsx', 'xlsm', 'xlsb', 'xls']) {
+            // Process Excel file using WorkbookConverter
+            def WorkbookConverter workbookConverter = new WorkbookConverter(config)
+            samplesheetList = workbookConverter.convertToList(samplesheetFile, options) as List
+
+            // Convert to JSON for validation - same as other formats
+            def jsonGenerator = new groovy.json.JsonGenerator.Options()
+                .excludeNulls()
+                .build()
+            samplesheet = new JSONArray(jsonGenerator.toJson(samplesheetList))
+        } else {
+            // Process other file formats
+            samplesheet = fileToJson(samplesheetFile, schemaFile) as JSONArray
+            samplesheetList = fileToObject(samplesheetFile, schemaFile) as List
+        }
+
         // Validate
         final validator = new JsonSchemaValidator(config)
-        def JSONArray samplesheet = fileToJson(samplesheetFile, schemaFile) as JSONArray
         def ValidationResult validationResult = validator.validate(samplesheet, schemaFile.toString())
         def validationErrors = validationResult.getErrors('field')
         if (validationErrors) {
@@ -107,8 +129,7 @@ class SamplesheetConverter {
             throw new SchemaValidationException(msg, validationErrors)
         }
 
-        // Convert
-        def List samplesheetList = fileToObject(samplesheetFile, schemaFile) as List
+        // Convert (already done above for Excel files)
         this.rows = []
 
         def List channelFormat = samplesheetList.collect { entry ->
diff --git a/src/main/groovy/nextflow/validation/utils/Files.groovy b/src/main/groovy/nextflow/validation/utils/Files.groovy
@@ -17,6 +17,8 @@ import java.io.FileReader
 import java.io.File
 
 import nextflow.validation.exceptions.SchemaValidationException
+import nextflow.validation.utils.WorkbookConverter
+import nextflow.validation.config.ValidationConfig
 import static nextflow.validation.utils.Common.getValueFromJsonPointer
 import static nextflow.validation.utils.Types.inferType
 
@@ -32,11 +34,19 @@ import static nextflow.validation.utils.Types.inferType
 public class Files {
 
     //
-    // Function to detect if a file is a CSV, TSV, JSON or YAML file
+    // Function to get file extension from filename
+    //
+    public static String getFileExtension(String filename) {
+        int lastDotIndex = filename.lastIndexOf('.')
+        return lastDotIndex >= 0 ? filename.substring(lastDotIndex + 1) : ""
+    }
+
+    //
+    // Function to detect if a file is a CSV, TSV, JSON, YAML or Excel file
     //
     public static String getFileType(Path file) {
         def String extension = file.getExtension()
-        if (extension in ["csv", "tsv", "yml", "yaml", "json"]) {
+        if (extension in ["csv", "tsv", "yml", "yaml", "json", "xlsx", "xlsm", "xlsb", "xls"]) {
             return extension == "yml" ? "yaml" : extension
         }
 
@@ -46,7 +56,7 @@ public class Files {
         def Integer tabCount = header.count("\t")
 
         if ( commaCount == tabCount ){
-            log.error("Could not derive file type from ${file}. Please specify the file extension (CSV, TSV, YML, YAML and JSON are supported).".toString())
+            log.error("Could not derive file type from ${file}. Please specify the file extension (CSV, TSV, YML, YAML, JSON, and Excel formats are supported).".toString())
         }
         if ( commaCount > tabCount ){
             return "csv"
diff --git a/src/main/groovy/nextflow/validation/utils/WorkbookConverter.groovy b/src/main/groovy/nextflow/validation/utils/WorkbookConverter.groovy
diff --git a/src/test/groovy/nextflow/validation/WorkbookConverterTest.groovy b/src/test/groovy/nextflow/validation/WorkbookConverterTest.groovy