diff --git a/build.gradle b/build.gradle
index e1c3fa6..a23039f 100644
--- a/build.gradle
+++ b/build.gradle
@@ -6,6 +6,11 @@ dependencies {
     implementation 'org.json:json:20240303'
     implementation 'dev.harrel:json-schema:1.5.0'
     implementation 'com.sanctionco.jmail:jmail:1.6.3' // Needed for e-mail format validation
+
+    // Apache POI dependencies for Excel support
+    implementation 'org.apache.poi:poi:5.4.1'
+    implementation 'org.apache.poi:poi-ooxml:5.4.1'
+    implementation 'org.apache.poi:poi-scratchpad:5.4.1'
 }
 
 version = '2.5.1'
diff --git a/scripts/create_test_excel_files.groovy b/scripts/create_test_excel_files.groovy
new file mode 100644
index 0000000..f5270fb
--- /dev/null
+++ b/scripts/create_test_excel_files.groovy
@@ -0,0 +1,144 @@
+#!/usr/bin/env groovy
+
+@Grab('org.apache.poi:poi:5.4.1')
+@Grab('org.apache.poi:poi-ooxml:5.4.1')
+@Grab('org.apache.poi:poi-scratchpad:5.4.1')
+
+import org.apache.poi.ss.usermodel.*
+import org.apache.poi.xssf.usermodel.XSSFWorkbook
+import org.apache.poi.hssf.usermodel.HSSFWorkbook
+import java.nio.file.Path
+import java.nio.file.Paths
+import java.text.SimpleDateFormat
+
+/**
+ * Helper script to create Excel test files for nf-schema testing
+ */
+def createTestFiles() {
+    def testResourcesDir = Paths.get("src/testResources")
+
+    // Create directory if it doesn't exist
+    testResourcesDir.toFile().mkdirs()
+
+    println "Creating Excel test files..."
+
+    // 1. Create correct.xlsx (basic test file equivalent to correct.csv)
+    createBasicTestFile(testResourcesDir.resolve("correct.xlsx").toString(), "xlsx")
+
+    // 2. Create multisheet.xlsx (multiple sheets for sheet selection testing)
+    createMultiSheetFile(testResourcesDir.resolve("multisheet.xlsx").toString())
+
+    // 3. Create empty_cells.xlsx (file with empty cells)
+    createEmptyCellsFile(testResourcesDir.resolve("empty_cells.xlsx").toString())
+
+    println "✅ Excel test files created successfully in ${testResourcesDir}"
+}
+
+def createBasicTestFile(String filename, String format) {
+    Workbook workbook = format == "xls" ? new HSSFWorkbook() : new XSSFWorkbook()
+    Sheet sheet = workbook.createSheet("Sheet1")
+
+    // Create header row matching correct.csv structure
+    Row headerRow = sheet.createRow(0)
+    def headers = ["sample", "fastq_1", "fastq_2", "strandedness"]
+    headers.eachWithIndex { header, index ->
+        headerRow.createCell(index).setCellValue(header)
+    }
+
+    // Add data rows matching test samplesheet data
+    def data = [
+        ["SAMPLE_PE", "SAMPLE_PE_RUN1_1.fastq.gz", "SAMPLE_PE_RUN1_2.fastq.gz", "forward"],
+        ["SAMPLE_PE", "SAMPLE_PE_RUN2_1.fastq.gz", "SAMPLE_PE_RUN2_2.fastq.gz", "forward"],
+        ["SAMPLE_SE", "SAMPLE_SE_RUN1_1.fastq.gz", "", "forward"]
+    ]
+
+    data.eachWithIndex { row, rowIndex ->
+        Row dataRow = sheet.createRow(rowIndex + 1)
+        row.eachWithIndex { value, colIndex ->
+            if (value != null && value != "") {
+                Cell cell = dataRow.createCell(colIndex)
+                cell.setCellValue(value.toString())
+            }
+        }
+    }
+
+    // Auto-size columns
+    headers.eachWithIndex { header, index ->
+        sheet.autoSizeColumn(index)
+    }
+
+    // Save file
+    def fileOut = new FileOutputStream(filename)
+    workbook.write(fileOut)
+    fileOut.close()
+    workbook.close()
+
+    println "Created: ${filename}"
+}
+
+def createMultiSheetFile(String filename) {
+    Workbook workbook = new XSSFWorkbook()
+
+    // Sheet 1 - Same as basic test file
+    Sheet sheet1 = workbook.createSheet("Sheet1")
+    Row headerRow1 = sheet1.createRow(0)
+    def headers = ["sample", "fastq_1", "fastq_2", "strandedness"]
+    headers.eachWithIndex { header, index ->
+        headerRow1.createCell(index).setCellValue(header)
+    }
+
+    Row dataRow1 = sheet1.createRow(1)
+    def data1 = ["SAMPLE_PE", "SAMPLE_PE_RUN1_1.fastq.gz", "SAMPLE_PE_RUN1_2.fastq.gz", "forward"]
+    data1.eachWithIndex { value, colIndex ->
+        Cell cell = dataRow1.createCell(colIndex)
+        cell.setCellValue(value.toString())
+    }
+
+    // Sheet 2 - Different data
+    Sheet sheet2 = workbook.createSheet("Sheet2")
+    Row headerRow2 = sheet2.createRow(0)
+    headerRow2.createCell(0).setCellValue("sample_id")
+    headerRow2.createCell(1).setCellValue("condition")
+
+    Row dataRow2 = sheet2.createRow(1)
+    dataRow2.createCell(0).setCellValue("sample2")
+    dataRow2.createCell(1).setCellValue("control")
+
+    // Save file
+    def fileOut = new FileOutputStream(filename)
+    workbook.write(fileOut)
+    fileOut.close()
+    workbook.close()
+
+    println "Created: ${filename}"
+}
+
+def createEmptyCellsFile(String filename) {
+    Workbook workbook = new XSSFWorkbook()
+    Sheet sheet = workbook.createSheet("Sheet1")
+
+    // Create header row
+    Row headerRow = sheet.createRow(0)
+    def headers = ["sample", "fastq_1", "fastq_2", "strandedness"]
+    headers.eachWithIndex { header, index ->
+        headerRow.createCell(index).setCellValue(header)
+    }
+
+    // Add row with many empty cells
+    Row dataRow = sheet.createRow(1)
+    dataRow.createCell(0).setCellValue("SAMPLE_SE") // sample
+    dataRow.createCell(1).setCellValue("SAMPLE_SE_RUN1_1.fastq.gz") // fastq_1
+    // fastq_2 left empty
+    dataRow.createCell(3).setCellValue("forward") // strandedness
+
+    // Save file
+    def fileOut = new FileOutputStream(filename)
+    workbook.write(fileOut)
+    fileOut.close()
+    workbook.close()
+
+    println "Created: ${filename}"
+}
+
+// Run the script
+createTestFiles()
\ No newline at end of file
diff --git a/src/main/groovy/nextflow/validation/samplesheet/SamplesheetConverter.groovy b/src/main/groovy/nextflow/validation/samplesheet/SamplesheetConverter.groovy
index 348c327..5882f2b 100644
--- a/src/main/groovy/nextflow/validation/samplesheet/SamplesheetConverter.groovy
+++ b/src/main/groovy/nextflow/validation/samplesheet/SamplesheetConverter.groovy
@@ -11,10 +11,12 @@ import nextflow.Nextflow
 import static nextflow.validation.utils.Colors.getLogColors
 import static nextflow.validation.utils.Files.fileToJson
 import static nextflow.validation.utils.Files.fileToObject
+import static nextflow.validation.utils.Files.getFileType
 import static nextflow.validation.utils.Common.findDeep
 import static nextflow.validation.utils.Common.hasDeepKey
 import nextflow.validation.config.ValidationConfig
 import nextflow.validation.exceptions.SchemaValidationException
+import nextflow.validation.utils.WorkbookConverter
 import nextflow.validation.validators.JsonSchemaValidator
 import nextflow.validation.validators.ValidationResult
 
@@ -96,9 +98,29 @@ class SamplesheetConverter {
             throw new SchemaValidationException(msg)
         }
 
+        // Check if this is an Excel file and process accordingly
+        def String fileType = getFileType(samplesheetFile)
+        def JSONArray samplesheet
+        def List samplesheetList
+
+        if (fileType in ['xlsx', 'xlsm', 'xlsb', 'xls']) {
+            // Process Excel file using WorkbookConverter
+            def WorkbookConverter workbookConverter = new WorkbookConverter(config)
+            samplesheetList = workbookConverter.convertToList(samplesheetFile, options) as List
+
+            // Convert to JSON for validation - same as other formats
+            def jsonGenerator = new groovy.json.JsonGenerator.Options()
+                .excludeNulls()
+                .build()
+            samplesheet = new JSONArray(jsonGenerator.toJson(samplesheetList))
+        } else {
+            // Process other file formats
+            samplesheet = fileToJson(samplesheetFile, schemaFile) as JSONArray
+            samplesheetList = fileToObject(samplesheetFile, schemaFile) as List
+        }
+
         // Validate
         final validator = new JsonSchemaValidator(config)
-        def JSONArray samplesheet = fileToJson(samplesheetFile, schemaFile) as JSONArray
         def ValidationResult validationResult = validator.validate(samplesheet, schemaFile.toString())
         def validationErrors = validationResult.getErrors('field')
         if (validationErrors) {
@@ -107,8 +129,7 @@ class SamplesheetConverter {
             throw new SchemaValidationException(msg, validationErrors)
         }
 
-        // Convert
-        def List samplesheetList = fileToObject(samplesheetFile, schemaFile) as List
+        // Convert (already done above for Excel files)
         this.rows = []
 
         def List channelFormat = samplesheetList.collect { entry ->
diff --git a/src/main/groovy/nextflow/validation/utils/Files.groovy b/src/main/groovy/nextflow/validation/utils/Files.groovy
index b98daa8..e767719 100644
--- a/src/main/groovy/nextflow/validation/utils/Files.groovy
+++ b/src/main/groovy/nextflow/validation/utils/Files.groovy
@@ -17,6 +17,8 @@ import java.io.FileReader
 import java.io.File
 
 import nextflow.validation.exceptions.SchemaValidationException
+import nextflow.validation.utils.WorkbookConverter
+import nextflow.validation.config.ValidationConfig
 
 import static nextflow.validation.utils.Common.getValueFromJsonPointer
 import static nextflow.validation.utils.Types.inferType
@@ -32,11 +34,19 @@ import static nextflow.validation.utils.Types.inferType
 public class Files {
 
     //
-    // Function to detect if a file is a CSV, TSV, JSON or YAML file
+    // Function to get file extension from filename
+    //
+    public static String getFileExtension(String filename) {
+        int lastDotIndex = filename.lastIndexOf('.')
+        return lastDotIndex >= 0 ? filename.substring(lastDotIndex + 1) : ""
+    }
+
+    //
+    // Function to detect if a file is a CSV, TSV, JSON, YAML or Excel file
     //
     public static String getFileType(Path file) {
         def String extension = file.getExtension()
-        if (extension in ["csv", "tsv", "yml", "yaml", "json"]) {
+        if (extension in ["csv", "tsv", "yml", "yaml", "json", "xlsx", "xlsm", "xlsb", "xls"]) {
             return extension == "yml" ? "yaml" : extension
         }
 
@@ -46,7 +56,7 @@ public class Files {
         def Integer tabCount = header.count("\t")
 
         if ( commaCount == tabCount ){
-            log.error("Could not derive file type from ${file}. Please specify the file extension (CSV, TSV, YML, YAML and JSON are supported).".toString())
+            log.error("Could not derive file type from ${file}. Please specify the file extension (CSV, TSV, YML, YAML, JSON, and Excel formats are supported).".toString())
         }
         if ( commaCount > tabCount ){
             return "csv"
diff --git a/src/main/groovy/nextflow/validation/utils/WorkbookConverter.groovy b/src/main/groovy/nextflow/validation/utils/WorkbookConverter.groovy
new file mode 100644
index 0000000..e357eba
--- /dev/null
+++ b/src/main/groovy/nextflow/validation/utils/WorkbookConverter.groovy
@@ -0,0 +1,348 @@
+package nextflow.validation.utils
+
+import groovy.transform.CompileStatic
+import groovy.util.logging.Slf4j
+import java.nio.file.Path
+import java.text.SimpleDateFormat
+import java.io.FileInputStream
+import java.io.IOException
+
+import org.apache.poi.ss.usermodel.*
+import org.apache.poi.xssf.usermodel.XSSFWorkbook
+import org.apache.poi.hssf.usermodel.HSSFWorkbook
+import org.apache.poi.poifs.filesystem.POIFSFileSystem
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException
+
+import nextflow.validation.config.ValidationConfig
+import nextflow.validation.exceptions.SchemaValidationException
+
+/**
+ * Workbook converter for reading Excel files (XLSX, XLSM, XLSB, XLS)
+ * and converting them to lists compatible with nf-schema validation
+ *
+ * @author : edmundmiller
+ */
+
+@Slf4j
+@CompileStatic
+class WorkbookConverter {
+
+    // Constants for Excel file formats
+    private static final List<String> EXCEL_EXTENSIONS = ['xlsx', 'xlsm', 'xlsb', 'xls']
+    private static final int DEFAULT_SHEET_INDEX = 0
+
+    private ValidationConfig config
+
+    WorkbookConverter(ValidationConfig config) {
+        this.config = config
+    }
+
+    /**
+     * Convert Excel workbook to List format
+     *
+     * @param workbookFile Path of the workbook to read
+     * @param options Optional settings; the `sheet` entry selects a sheet by name or by zero-based index
+     * @return One map per non-empty data row, keyed by column header
+     * @throws SchemaValidationException if the workbook is missing or cannot be read
+     */
+    public List<Map<String, Object>> convertToList(
+        Path workbookFile,
+        Map options = null
+    ) {
+        // Ensure options is not null
+        if (options == null) {
+            options = [:]
+        }
+        def colors = Colors.fromConfig(config)
+
+        // Validate file exists
+        if (!workbookFile.exists()) {
+            def msg = "${colors.red}Excel workbook file ${workbookFile.toString()} does not exist\n${colors.reset}\n"
+            throw new SchemaValidationException(msg)
+        }
+
+        try {
+            return readWorkbook(workbookFile, options)
+        } catch (SchemaValidationException e) {
+            // Already a user-facing error (unknown sheet, unreadable file, ...): rethrow it
+            // unchanged instead of nesting its message inside a second one
+            throw e
+        } catch (Exception e) {
+            def msg = "${colors.red}Failed to read Excel file ${workbookFile.toString()}: ${e.message}\n${colors.reset}\n"
+            // Pass the exception to the logger so its stack trace is not lost
+            log.error("Failed to read Excel workbook!", e)
+            throw new SchemaValidationException(msg)
+        }
+    }
+
+    /**
+     * Read workbook and convert to list format
+     */
+    private List<Map<String, Object>> readWorkbook(Path workbookFile, Map options) {
+        Workbook workbook = null
+        List<Map<String, Object>> result = []
+
+        try {
+            // Open workbook based on file format
+            workbook = openWorkbook(workbookFile)
+
+            // Get the specified sheet or default to first sheet
+            Sheet sheet = getSheet(workbook, options)
+
+            // Convert sheet to list format
+            result = convertSheetToList(sheet)
+
+        } finally {
+            if (workbook != null) {
+                workbook.close()
+            }
+        }
+
+        return result
+    }
+
+    /**
+     * Open workbook based on file format
+     *
+     * The workbook is loaded from an input stream rather than from a java.io.File:
+     * WorkbookFactory opens File-based workbooks with read/write access, and a stream
+     * also works for paths that are not on the default file system.
+     */
+    private Workbook openWorkbook(Path workbookFile) {
+        try {
+            InputStream input = java.nio.file.Files.newInputStream(workbookFile)
+            try {
+                // Use WorkbookFactory for automatic format detection
+                return WorkbookFactory.create(input)
+            } finally {
+                input.close()
+            }
+        } catch (Exception e) {
+            def colors = Colors.fromConfig(config)
+            def msg = "${colors.red}Failed to open Excel file ${workbookFile}: ${e.message}\n${colors.reset}\n"
+            throw new SchemaValidationException(msg)
+        }
+    }
+
+    /**
+     * Get sheet from workbook based on options
+     */
+    private Sheet getSheet(Workbook workbook, Map options) {
+        def sheetSelector = options.sheet
+        Sheet sheet = null
+        def colors = Colors.fromConfig(config)
+
+        if (sheetSelector == null) {
+            // Default to first sheet
+            sheet = workbook.getSheetAt(DEFAULT_SHEET_INDEX)
+        } else if (sheetSelector instanceof CharSequence) {
+            // Select by sheet name (CharSequence also accepts interpolated GStrings)
+            sheet = workbook.getSheet(sheetSelector.toString())
+            if (sheet == null) {
+                def msg = "${colors.red}Sheet '${sheetSelector}' not found in workbook\n${colors.reset}\n"
+                throw new SchemaValidationException(msg)
+            }
+        } else if (sheetSelector instanceof Integer) {
+            // Select by sheet index
+            def sheetIndex = sheetSelector as Integer
+            if (sheetIndex < 0 || sheetIndex >= workbook.getNumberOfSheets()) {
+                def msg = "${colors.red}Sheet index ${sheetIndex} is out of range (0-${workbook.getNumberOfSheets()-1})\n${colors.reset}\n"
+                throw new SchemaValidationException(msg)
+            }
+            sheet = workbook.getSheetAt(sheetIndex)
+        } else {
+            def msg = "${colors.red}Sheet selector must be either a String (sheet name) or Integer (sheet index)\n${colors.reset}\n"
+            throw new SchemaValidationException(msg)
+        }
+
+        return sheet
+    }
+
+    /**
+     * Convert Excel sheet to list of maps
+     */
+    private List<Map<String, Object>> convertSheetToList(Sheet sheet) {
+        if (sheet.getPhysicalNumberOfRows() == 0) {
+            return []
+        }
+
+        // Process headers
+        List<String> headers = processHeaders(sheet)
+        boolean hasHeader = headers.any { it != null && !it.trim().isEmpty() }
+
+        // Process data rows
+        return processDataRows(sheet, headers, hasHeader)
+    }
+
+    /**
+     * Process headers from the sheet
+     */
+    private List<String> processHeaders(Sheet sheet) {
+        Row headerRow = sheet.getRow(sheet.getFirstRowNum())
+        if (headerRow == null) {
+            return []
+        }
+
+        List<String> headers = extractHeaders(headerRow)
+        boolean hasValidHeaders = headers.any { it != null && !it.trim().isEmpty() }
+
+        // If no valid headers, create generic column names
+        if (!hasValidHeaders) {
+            // getLastCellNum() is -1 for a row without cells; clamp so the range below is empty
+            int colCount = Math.max((int) headerRow.getLastCellNum(), 0)
+            headers = (0..<colCount).collect { "column_${it}".toString() }
+        }
+
+        return headers
+    }
+
+    /**
+     * Process data rows from the sheet
+     */
+    private List<Map<String, Object>> processDataRows(Sheet sheet, List<String> headers, boolean hasHeader) {
+        List<Map<String, Object>> result = []
+
+        int startRow = hasHeader ? sheet.getFirstRowNum() + 1 : sheet.getFirstRowNum()
+        int endRow = sheet.getLastRowNum()
+
+        for (int rowIndex = startRow; rowIndex <= endRow; rowIndex++) {
+            Row row = sheet.getRow(rowIndex)
+            if (row != null) {
+                Map<String, Object> rowData = processRow(row, headers)
+                if (rowData && !rowData.isEmpty()) {
+                    result.add(rowData)
+                }
+            }
+        }
+
+        return result
+    }
+
+    /**
+     * Extract header names from header row
+     */
+    private List<String> extractHeaders(Row headerRow) {
+        List<String> headers = []
+
+        for (int colIndex = 0; colIndex < headerRow.getLastCellNum(); colIndex++) {
+            Cell cell = headerRow.getCell(colIndex)
+            String headerValue = getCellValue(cell, true) as String
+            headers.add(headerValue ?: "column_${colIndex}".toString())
+        }
+
+        return headers
+    }
+
+    /**
+     * Process a data row and convert to map
+     */
+    private Map<String, Object> processRow(Row row, List<String> headers) {
+        Map<String, Object> rowData = [:]
+        boolean hasData = false
+
+        for (int colIndex = 0; colIndex < Math.max(headers.size(), row.getLastCellNum()); colIndex++) {
+            Cell cell = row.getCell(colIndex)
+            Object cellValue = getCellValue(cell)
+
+            String header = colIndex < headers.size() ? headers[colIndex] : "column_${colIndex}".toString()
+            rowData[header] = cellValue
+
+            if (cellValue != null && !(cellValue instanceof String && ((String)cellValue).trim().isEmpty())) {
+                hasData = true
+            }
+        }
+
+        return hasData ? rowData : [:]
+    }
+
+    /**
+     * Extract cell value based on cell type
+     * @param cell The cell to extract value from
+     * @param asString If true, returns string representation for headers
+     */
+    private Object getCellValue(Cell cell, boolean asString = false) {
+        if (cell == null) {
+            return asString ? "" : null
+        }
+
+        try {
+            switch (cell.getCellType()) {
+                case CellType.STRING:
+                    return cell.getStringCellValue()?.trim() ?: (asString ? "" : null)
+                case CellType.NUMERIC:
+                    if (DateUtil.isCellDateFormatted(cell)) {
+                        // Handle date cells
+                        Date date = cell.getDateCellValue()
+                        return new SimpleDateFormat("yyyy-MM-dd").format(date)
+                    }
+                    // Whole numbers are narrowed to int/long, everything else stays a double
+                    Object number = narrowNumber(cell.getNumericCellValue())
+                    return asString ? String.valueOf(number) : number
+                case CellType.BOOLEAN:
+                    return asString ? String.valueOf(cell.getBooleanCellValue()) : cell.getBooleanCellValue()
+                case CellType.FORMULA:
+                    Object result = evaluateFormula(cell)
+                    return asString ? String.valueOf(result) : result
+                case CellType.BLANK:
+                    return asString ? "" : null
+                case CellType.ERROR:
+                    return "#ERROR#"
+                default:
+                    return cell.toString()?.trim() ?: (asString ? "" : null)
+            }
+        } catch (Exception e) {
+            log.warn("Error reading cell value at row ${cell.getRowIndex()}, column ${cell.getColumnIndex()}: ${e.message}")
+            return cell.toString()?.trim() ?: (asString ? "" : null)
+        }
+    }
+
+    /**
+     * Narrow a numeric cell value: whole numbers become an Integer, or a Long when they
+     * do not fit into an int; every other value is returned as a Double
+     */
+    private static Object narrowNumber(double value) {
+        if (Double.isNaN(value) || Double.isInfinite(value) || value != Math.floor(value)) {
+            return value
+        }
+        if (value >= Integer.MIN_VALUE && value <= Integer.MAX_VALUE) {
+            return (int) value
+        }
+        // An (int) cast would clamp larger whole numbers to the int range
+        if (value >= (double) Long.MIN_VALUE && value < (double) Long.MAX_VALUE) {
+            return (long) value
+        }
+        return value
+    }
+
+    /**
+     * Evaluate formula in a cell
+     */
+    private Object evaluateFormula(Cell cell) {
+        try {
+            FormulaEvaluator evaluator = cell.getSheet().getWorkbook().getCreationHelper().createFormulaEvaluator()
+            CellValue cellValue = evaluator.evaluate(cell)
+
+            switch (cellValue.getCellType()) {
+                case CellType.NUMERIC:
+                    return narrowNumber(cellValue.getNumberValue())
+                case CellType.STRING:
+                    return cellValue.getStringValue()?.trim()
+                case CellType.BOOLEAN:
+                    return cellValue.getBooleanValue()
+                default:
+                    return cell.getCellFormula()
+            }
+        } catch (Exception e) {
+            log.warn("Error evaluating formula: ${e.message}")
+            return cell.getCellFormula()
+        }
+    }
+
+    /**
+     * Check if file is an Excel format
+     */
+    public static boolean isExcelFile(Path file) {
+        def extension = Files.getFileExtension(file.toString().toLowerCase())
+        return extension in EXCEL_EXTENSIONS
+    }
+}
\ No newline at end of file
diff --git a/src/test/groovy/nextflow/validation/WorkbookConverterTest.groovy b/src/test/groovy/nextflow/validation/WorkbookConverterTest.groovy
new file mode 100644
index 0000000..8ae2e98
--- /dev/null
+++ b/src/test/groovy/nextflow/validation/WorkbookConverterTest.groovy
@@ -0,0 +1,71 @@
+package nextflow.validation
+
+import spock.lang.Specification
+import spock.lang.Unroll
+import java.nio.file.Path
+import java.nio.file.Paths
+
+import nextflow.validation.config.ValidationConfig
+import nextflow.validation.utils.WorkbookConverter
+import nextflow.validation.exceptions.SchemaValidationException
+
+/**
+ * @author : edmundmiller
+ */
+
+class WorkbookConverterTest extends Specification {
+
+    def config
+    def workbookConverter
+
+    def setup() {
+        config = new ValidationConfig()
+        workbookConverter = new WorkbookConverter(config)
+    }
+
+    def "should detect Excel files correctly"() {
+        expect:
+        WorkbookConverter.isExcelFile(Paths.get("test.xlsx")) == true
+        WorkbookConverter.isExcelFile(Paths.get("test.xlsm")) == true
+        WorkbookConverter.isExcelFile(Paths.get("test.xlsb")) == true
+        WorkbookConverter.isExcelFile(Paths.get("test.xls")) == true
+        WorkbookConverter.isExcelFile(Paths.get("test.csv")) == false
+        WorkbookConverter.isExcelFile(Paths.get("test.txt")) == false
+    }
+
+    def "should throw exception for non-existent file"() {
+        given:
+        def nonExistentFile = Paths.get("nonexistent.xlsx")
+
+        when:
+        workbookConverter.convertToList(nonExistentFile)
+
+        then:
+        thrown(SchemaValidationException)
+    }
+
+    def "should handle null options gracefully"() {
+        given:
+        def testFile = Paths.get("test.xlsx")
+
+        when:
+        // This will fail because file doesn't exist, but it should handle null options
+        workbookConverter.convertToList(testFile, null)
+
+        then:
+        thrown(SchemaValidationException) // Due to file not existing, not null options
+    }
+
+    def "should handle empty options map"() {
+        given:
+        def testFile = Paths.get("test.xlsx")
+        def emptyOptions = [:]
+
+        when:
+        // This will fail because file doesn't exist, but it should handle empty options
+        workbookConverter.convertToList(testFile, emptyOptions)
+
+        then:
+        thrown(SchemaValidationException) // Due to file not existing, not empty options
+    }
+}
\ No newline at end of file