
Commit 05390ad

#777 Add support for appending and for the standard output file naming convention.

1 parent: 9d70cad

File tree

3 files changed (+39, -16 lines)


spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/DefaultSource.scala

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,12 +84,7 @@ class DefaultSource
8484
fs.delete(outputPath, true)
8585
}
8686
case SaveMode.Append =>
87-
if (fs.exists(outputPath)) {
88-
throw new IllegalArgumentException(
89-
s"Save mode '$mode' is not supported by the 'spark-cobol' data source at the moment. " +
90-
"Please use 'Overwrite' mode to write data to a file or folder."
91-
)
92-
}
87+
// In append mode, no action is needed. Tasks will write to different files.
9388
case SaveMode.ErrorIfExists =>
9489
if (fs.exists(outputPath)) {
9590
throw new IllegalArgumentException(
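With this change, `SaveMode.Append` no longer throws; each write lays down new part files next to the existing ones. A minimal usage sketch, mirroring the updated test further down (`df`, `copybookContents`, and the output path are placeholders, not part of this diff):

import org.apache.spark.sql.SaveMode

// Minimal append sketch; df, copybookContents and "/tmp/out" are placeholders.
df.write
  .format("cobol")
  .mode(SaveMode.Append)
  .option("copybook_contents", copybookContents)
  .save("/tmp/out")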

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/RawBinaryOutputFormat.scala

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@ import org.apache.hadoop.fs.Path
2020
import org.apache.hadoop.mapreduce._
2121
import org.apache.hadoop.io.{BytesWritable, NullWritable}
2222
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
23+
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.getOutputPath
2324

2425
import java.io.DataOutputStream
26+
import java.util.UUID
2527

2628
/**
2729
* A custom implementation of `FileOutputFormat` that outputs raw binary data for fixed record length
@@ -39,6 +41,26 @@ import java.io.DataOutputStream
3941
*/
4042

4143
class RawBinaryOutputFormat extends FileOutputFormat[NullWritable, BytesWritable] {
44+
private val uniqueUuid = UUID.randomUUID().toString
45+
46+
override def checkOutputSpecs(job: JobContext): Unit = {
47+
val outDir = getOutputPath(job)
48+
if (outDir == null) throw new IllegalStateException("Output directory not set.")
49+
}
50+
51+
override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
52+
val conf = context.getConfiguration
53+
val uniqueWriteJobId = conf.get("spark.sql.sources.writeJobUUID")
54+
val idFilePart = if (uniqueWriteJobId == null) uniqueUuid else uniqueWriteJobId
55+
val taskAttemptID = context.getTaskAttemptID
56+
val taskId = f"${taskAttemptID.getTaskID.getId}%05d"
57+
val attemptId = f"c${taskAttemptID.getId}%03d"
58+
59+
val filename = s"part-$taskId-$idFilePart-$attemptId$extension"
60+
val outputPath = getOutputPath(context)
61+
new Path(outputPath, filename)
62+
}
63+
4264
override def getRecordWriter(context: TaskAttemptContext): RecordWriter[NullWritable, BytesWritable] = {
4365
val extension = context.getConfiguration.get("cobol.writer.output.extension", ".dat")
4466
val path: Path = getDefaultWorkFile(context, extension)
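The generated names follow Spark's standard part-file convention, part-<task id>-<job UUID>-c<attempt><extension>. A tiny illustrative sketch of the formatting (the task, attempt, and UUID values below are made up):

// Illustrative only: shows the shape of names getDefaultWorkFile produces.
val taskId    = f"${7}%05d"  // "00007"
val attemptId = f"c${0}%03d" // "c000"
val jobId     = "1b9e7c1e-5c2a-4d7f-9c3b-0a1d2e3f4a5b" // writeJobUUID, or a random fallback
val filename  = s"part-$taskId-$jobId-$attemptId.dat"
// => "part-00007-1b9e7c1e-5c2a-4d7f-9c3b-0a1d2e3f4a5b-c000.dat"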

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/writer/FixedLengthEbcdicWriterSuite.scala

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B
6464
val expected = Array[Byte](
6565
0xC1.toByte, 0xC6.toByte, 0x89.toByte, 0x99.toByte, 0xa2.toByte, 0xa3.toByte, // A,First
6666
0xC2.toByte, 0xE2.toByte, 0x83.toByte, 0x95.toByte, 0x84.toByte, 0x40.toByte, // B,Scnd_
67-
0xC3.toByte, 0xD3.toByte, 0x81.toByte, 0xa2.toByte, 0xa3.toByte, 0x40.toByte // C,Last_
67+
0xC3.toByte, 0xD3.toByte, 0x81.toByte, 0xa2.toByte, 0xa3.toByte, 0x40.toByte // C,Last_
6868
)
6969

7070
if (!bytes.sameElements(expected)) {
@@ -114,7 +114,7 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B
114114
val expected = Array[Byte](
115115
0xC1.toByte, 0x00.toByte, 0xC6.toByte, 0x89.toByte, 0x99.toByte, 0xa2.toByte, 0xa3.toByte, // A,First
116116
0xC2.toByte, 0x00.toByte, 0xE2.toByte, 0x83.toByte, 0x95.toByte, 0x84.toByte, 0x40.toByte, // B,Scnd_
117-
0xC3.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte // C,Last_
117+
0xC3.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte // C,Last_
118118
)
119119

120120
if (!bytes.sameElements(expected)) {
@@ -184,7 +184,7 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B
184184
}
185185

186186

187-
"write should fail with save mode append and the path exists" in {
187+
"write should successfully append" in {
188188
withTempDirectory("cobol_writer3") { tempDir =>
189189
val df = List(("A", "First"), ("B", "Scnd"), ("C", "Last")).toDF("A", "B")
190190

@@ -196,13 +196,19 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B
196196
.option("copybook_contents", copybookContents)
197197
.save(path.toString)
198198

199-
assertThrows[IllegalArgumentException] {
200-
df.write
201-
.format("cobol")
202-
.mode(SaveMode.Append)
203-
.option("copybook_contents", copybookContents)
204-
.save(path.toString)
205-
}
199+
df.write
200+
.format("cobol")
201+
.mode(SaveMode.Append)
202+
.option("copybook_contents", copybookContents)
203+
.save(path.toString)
204+
205+
val fs = path.getFileSystem(spark.sparkContext.hadoopConfiguration)
206+
207+
assert(fs.exists(path), "Output directory should exist")
208+
val files = fs.listStatus(path)
209+
.filter(_.getPath.getName.startsWith("part-"))
210+
211+
assert(files.length > 1)
206212
}
207213
}
208214
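Since each write now produces uniquely named part files, appended records are read back together with the original ones. A quick read-back sketch, assuming the same `path` and `copybookContents` as the test above (this snippet is not part of the diff):

// Hypothetical read-back; path and copybookContents come from the test context.
val readBack = spark.read
  .format("cobol")
  .option("copybook_contents", copybookContents)
  .load(path.toString)

assert(readBack.count() == 6) // two appended batches of three records each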
