Skip to content
This repository was archived by the owner on Mar 24, 2025. It is now read-only.

Commit ceed1b8

Browse files
authored
Add dateFormat, timestampFormat support (#524)
* Add dateFormat, timestampFormat support
1 parent 0a7289a commit ceed1b8

File tree

10 files changed

+114
-25
lines changed

10 files changed

+114
-25
lines changed

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,13 @@ In this case, to use local XSD `/foo/bar.xsd`, call `addFile("/foo/bar.xsd")` an
8888
for example, be treated as if both are just `<author>`. Note that, at the moment, namespaces cannot be ignored on the
8989
`rowTag` element, only its children. Note that XML parsing is in general not namespace-aware even if `false`.
9090
Defaults to `false`. New in 0.11.0.
91+
* `timestampFormat`: Specifies an additional timestamp format that will be tried when parsing values as `TimestampType`
92+
columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
93+
Defaults to trying several formats, including [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT),
94+
including variations with offset timezones or no timezone (defaults to UTC). New in 0.12.0.
95+
* `dateFormat`: Specifies an additional date format that will be tried when parsing values as `DateType`
96+
columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
97+
Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0.
9198

9299
When writing files the API accepts several options:
93100

@@ -98,6 +105,12 @@ When writing files the API accepts several options:
98105
* `attributePrefix`: The prefix for attributes so that we can differentiate attributes and elements. This will be the prefix for field names. Default is `_`.
99106
* `valueTag`: The tag used for the value when there are attributes in the element having no child. Default is `_VALUE`.
100107
* `compression`: compression codec to use when saving to file. Should be the fully qualified name of a class implementing `org.apache.hadoop.io.compress.CompressionCodec` or one of case-insensitive shorten names (`bzip2`, `gzip`, `lz4`, and `snappy`). Defaults to no compression when a codec is not specified.
108+
* `timestampFormat`: Controls the format used to write `TimestampType` format columns.
109+
The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
110+
Defaults to [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT). New in 0.12.0.
111+
* `dateFormat`: Controls the format used to write `DateType` format columns.
112+
The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
113+
Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0.
101114

102115
Currently it supports the shortened name usage. You can use just `xml` instead of `com.databricks.spark.xml`.
103116

build.sbt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ mimaBinaryIssueFilters ++= {
101101
exclude[DirectMissingMethodProblem](
102102
"com.databricks.spark.xml.util.TypeCast.supportedXmlTimestampFormatters"),
103103
exclude[DirectMissingMethodProblem](
104-
"com.databricks.spark.xml.util.TypeCast.parseXmlTimestamp")
104+
"com.databricks.spark.xml.util.TypeCast.parseXmlTimestamp"),
105+
exclude[DirectMissingMethodProblem](
106+
"com.databricks.spark.xml.util.TypeCast.isTimestamp")
105107
)
106108
}

src/main/scala/com/databricks/spark/xml/XmlOptions.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ private[xml] class XmlOptions(
5656
val wildcardColName =
5757
parameters.getOrElse("wildcardColName", XmlOptions.DEFAULT_WILDCARD_COL_NAME)
5858
val ignoreNamespace = parameters.get("ignoreNamespace").map(_.toBoolean).getOrElse(false)
59+
val timestampFormat = parameters.get("timestampFormat")
60+
val dateFormat = parameters.get("dateFormat")
5961
}
6062

6163
private[xml] object XmlOptions {

src/main/scala/com/databricks/spark/xml/parsers/StaxXmlGenerator.scala

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
*/
1616
package com.databricks.spark.xml.parsers
1717

18+
import java.sql.{Date, Timestamp}
1819
import java.time.format.DateTimeFormatter
1920

2021
import javax.xml.stream.XMLStreamWriter
@@ -46,10 +47,10 @@ private[xml] object StaxXmlGenerator {
4647
// elements when given values are `null`.
4748
case (_, _, _) if name == options.valueTag =>
4849
// If this is meant to be value but in no child, write only a value
49-
writeElement(dt, v)
50+
writeElement(dt, v, options)
5051
case (_, _, _) =>
5152
writer.writeStartElement(name)
52-
writeElement(dt, v)
53+
writeElement(dt, v, options)
5354
writer.writeEndElement()
5455
}
5556

@@ -75,11 +76,17 @@ private[xml] object StaxXmlGenerator {
7576
}
7677
}
7778

78-
def writeElement(dt: DataType, v: Any): Unit = (dt, v) match {
79+
def writeElement(dt: DataType, v: Any, options: XmlOptions): Unit = (dt, v) match {
7980
case (_, null) | (NullType, _) => writer.writeCharacters(options.nullValue)
8081
case (StringType, v: String) => writer.writeCharacters(v)
81-
case (TimestampType, v: java.sql.Timestamp) =>
82-
writer.writeCharacters(DateTimeFormatter.ISO_INSTANT.format(v.toInstant()))
82+
case (TimestampType, v: Timestamp) =>
83+
val formatter = options.timestampFormat.map(DateTimeFormatter.ofPattern).
84+
getOrElse(DateTimeFormatter.ISO_INSTANT)
85+
writer.writeCharacters(formatter.format(v.toInstant()))
86+
case (DateType, v: Date) =>
87+
val formatter = options.dateFormat.map(DateTimeFormatter.ofPattern).
88+
getOrElse(DateTimeFormatter.ISO_DATE)
89+
writer.writeCharacters(formatter.format(v.toLocalDate()))
8390
case (IntegerType, v: Int) => writer.writeCharacters(v.toString)
8491
case (ShortType, v: Short) => writer.writeCharacters(v.toString)
8592
case (FloatType, v: Float) => writer.writeCharacters(v.toString)
@@ -88,7 +95,6 @@ private[xml] object StaxXmlGenerator {
8895
case (DecimalType(), v: java.math.BigDecimal) => writer.writeCharacters(v.toString)
8996
case (ByteType, v: Byte) => writer.writeCharacters(v.toString)
9097
case (BooleanType, v: Boolean) => writer.writeCharacters(v.toString)
91-
case (DateType, _) => writer.writeCharacters(v.toString)
9298

9399
// For the case roundtrip in reading and writing XML files, [[ArrayType]] cannot have
94100
// [[ArrayType]] as element type. It always wraps the element with [[StructType]]. So,
@@ -142,7 +148,7 @@ private[xml] object StaxXmlGenerator {
142148
val (names, values) = elements.unzip
143149
val elementSchema = StructType(schema.filter(names.contains))
144150
val elementRow = Row.fromSeq(row.toSeq.filter(values.contains))
145-
writeElement(elementSchema, elementRow)
151+
writeElement(elementSchema, elementRow, options)
146152
writer.writeEndElement()
147153
}
148154
}

src/main/scala/com/databricks/spark/xml/util/InferSchema.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,8 @@ private[xml] object InferSchema {
118118
case v if isInteger(v) => IntegerType
119119
case v if isDouble(v) => DoubleType
120120
case v if isBoolean(v) => BooleanType
121-
case v if isTimestamp(v) => TimestampType
122-
case v if isDate(v) => DateType
121+
case v if isTimestamp(v, options) => TimestampType
122+
case v if isDate(v, options) => DateType
123123
case _ => StringType
124124
}
125125
} else {

src/main/scala/com/databricks/spark/xml/util/TypeCast.scala

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ private[xml] object TypeCast {
6464
.getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).doubleValue())
6565
case _: BooleanType => parseXmlBoolean(datum)
6666
case _: DecimalType => new BigDecimal(datum.replaceAll(",", ""))
67-
case _: TimestampType => parseXmlTimestamp(datum)
68-
case _: DateType => parseXmlDate(datum)
67+
case _: TimestampType => parseXmlTimestamp(datum, options)
68+
case _: DateType => parseXmlDate(datum, options)
6969
case _: StringType => datum
7070
case _ => throw new IllegalArgumentException(s"Unsupported type: ${castType.typeName}")
7171
}
@@ -86,8 +86,10 @@ private[xml] object TypeCast {
8686
DateTimeFormatter.ISO_DATE
8787
)
8888

89-
private def parseXmlDate(value: String): Date = {
90-
supportedXmlDateFormatters.foreach { format =>
89+
private def parseXmlDate(value: String, options: XmlOptions): Date = {
90+
val formatters = options.dateFormat.map(DateTimeFormatter.ofPattern).
91+
map(supportedXmlDateFormatters :+ _).getOrElse(supportedXmlDateFormatters)
92+
formatters.foreach { format =>
9193
try {
9294
return Date.valueOf(LocalDate.parse(value, format))
9395
} catch {
@@ -114,8 +116,10 @@ private[xml] object TypeCast {
114116
DateTimeFormatter.ISO_INSTANT
115117
)
116118

117-
private def parseXmlTimestamp(value: String): Timestamp = {
118-
supportedXmlTimestampFormatters.foreach { format =>
119+
private def parseXmlTimestamp(value: String, options: XmlOptions): Timestamp = {
120+
val formatters = options.timestampFormat.map(DateTimeFormatter.ofPattern).
121+
map(supportedXmlTimestampFormatters :+ _).getOrElse(supportedXmlTimestampFormatters)
122+
formatters.foreach { format =>
119123
try {
120124
return Timestamp.from(ZonedDateTime.parse(value, format).toInstant)
121125
} catch {
@@ -191,12 +195,22 @@ private[xml] object TypeCast {
191195
(allCatch opt signSafeValue.toLong).isDefined
192196
}
193197

194-
private[xml] def isTimestamp(value: String): Boolean = {
195-
(allCatch opt Timestamp.valueOf(value)).isDefined
198+
private[xml] def isTimestamp(value: String, options: XmlOptions): Boolean = {
199+
try {
200+
parseXmlTimestamp(value, options)
201+
true
202+
} catch {
203+
case _: IllegalArgumentException => false
204+
}
196205
}
197206

198-
private[xml] def isDate(value: String): Boolean = {
199-
(allCatch opt Date.valueOf(value)).isDefined
207+
private[xml] def isDate(value: String, options: XmlOptions): Boolean = {
208+
try {
209+
parseXmlDate(value, options)
210+
true
211+
} catch {
212+
case _: IllegalArgumentException => false
213+
}
200214
}
201215

202216
private[xml] def signSafeToLong(value: String, options: XmlOptions): Long = {

src/test/resources/date.xml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
<book>
22
<author>John Smith</author>
3-
<date>2021-01-01</date>
3+
<date>2021-02-01</date>
4+
<date2>02-01-2021</date2>
45
</book>

src/test/resources/time.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<book>
2+
<author>John Smith</author>
3+
<time>2011-12-03T10:15:30Z</time>
4+
<time2>12-03-2011 10:15:30 PST</time2>
5+
</book>

src/test/scala/com/databricks/spark/xml/XmlSuite.scala

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
4848
master("local[2]").
4949
appName("XmlSuite").
5050
config("spark.ui.enabled", false).
51+
config("spark.sql.session.timeZone", "UTC").
5152
getOrCreate()
5253
}
5354
private var tempDir: Path = _
@@ -1298,19 +1299,64 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
12981299
}
12991300

13001301
test("Test date parsing") {
1301-
val schema = buildSchema(field("author"), field("date", DateType))
1302+
val schema = buildSchema(field("author"), field("date", DateType), field("date2", StringType))
13021303
val df = spark.read
13031304
.option("rowTag", "book")
13041305
.schema(schema)
13051306
.xml(resDir + "date.xml")
1306-
assert(df.collect().head.getAs[Date](1).toString === "2021-01-01")
1307+
assert(df.collect().head.getAs[Date](1).toString === "2021-02-01")
13071308
}
13081309

13091310
test("Test date type inference") {
13101311
val df = spark.read
13111312
.option("rowTag", "book")
13121313
.xml(resDir + "date.xml")
1313-
assert(df.dtypes(1) === ("date", "DateType"))
1314+
val expectedSchema =
1315+
buildSchema(field("author"), field("date", DateType), field("date2", StringType))
1316+
assert(df.schema === expectedSchema)
1317+
assert(df.collect().head.getAs[Date](1).toString === "2021-02-01")
1318+
}
1319+
1320+
test("Test timestamp parsing") {
1321+
val schema =
1322+
buildSchema(field("author"), field("time", TimestampType), field("time2", StringType))
1323+
val df = spark.read
1324+
.option("rowTag", "book")
1325+
.schema(schema)
1326+
.xml(resDir + "time.xml")
1327+
assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L)
1328+
}
1329+
1330+
test("Test timestamp type inference") {
1331+
val df = spark.read
1332+
.option("rowTag", "book")
1333+
.xml(resDir + "time.xml")
1334+
val expectedSchema =
1335+
buildSchema(field("author"), field("time", TimestampType), field("time2", StringType))
1336+
assert(df.schema === expectedSchema)
1337+
assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L)
1338+
}
1339+
1340+
test("Test dateFormat") {
1341+
val df = spark.read
1342+
.option("rowTag", "book")
1343+
.option("dateFormat", "MM-dd-yyyy")
1344+
.xml(resDir + "date.xml")
1345+
val expectedSchema =
1346+
buildSchema(field("author"), field("date", DateType), field("date2", DateType))
1347+
assert(df.schema === expectedSchema)
1348+
assert(df.collect().head.getAs[Date](2).toString === "2021-02-01")
1349+
}
1350+
1351+
test("Test timestampFormat") {
1352+
val df = spark.read
1353+
.option("rowTag", "book")
1354+
.option("timestampFormat", "MM-dd-yyyy HH:mm:ss z")
1355+
.xml(resDir + "time.xml")
1356+
val expectedSchema =
1357+
buildSchema(field("author"), field("time", TimestampType), field("time2", TimestampType))
1358+
assert(df.schema === expectedSchema)
1359+
assert(df.collect().head.getAs[Timestamp](2).getTime === 1322936130000L)
13141360
}
13151361

13161362
private def getLines(path: Path): Seq[String] = {

src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ final class TypeCastSuite extends AnyFunSuite {
155155
assert(TypeCast.isLong("10"))
156156
assert(TypeCast.isDouble("+10.1"))
157157
val timestamp = "2015-01-01 00:00:00"
158-
assert(TypeCast.isTimestamp(timestamp))
158+
assert(TypeCast.isTimestamp(timestamp, new XmlOptions()))
159159
}
160160

161161
test("Float and Double Types are cast correctly with Locale") {

0 commit comments

Comments
 (0)