Skip to content
This repository was archived by the owner on Mar 24, 2025. It is now read-only.

Commit ceed1b8

Browse files
authored
Add dateFormat, timestampFormat support (#524)
* Add dateFormat, timestampFormat support
1 parent 0a7289a commit ceed1b8

File tree

10 files changed

+114
-25
lines changed

10 files changed

+114
-25
lines changed

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,13 @@ In this case, to use local XSD `/foo/bar.xsd`, call `addFile("/foo/bar.xsd")` an
8888
for example, be treated as if both are just `<author>`. Note that, at the moment, namespaces cannot be ignored on the
8989
`rowTag` element, only its children. Note that XML parsing is in general not namespace-aware even if `false`.
9090
Defaults to `false`. New in 0.11.0.
91+
* `timestampFormat`: Specifies an additional timestamp format that will be tried when parsing values as `TimestampType`
92+
columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
93+
Defaults to trying several formats, including [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT),
94+
including variations with offset timezones or no timezone (defaults to UTC). New in 0.12.0.
95+
* `dateFormat`: Specifies an additional date format that will be tried when parsing values as `DateType`
96+
columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
97+
Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0.
9198

9299
When writing files the API accepts several options:
93100

@@ -98,6 +105,12 @@ When writing files the API accepts several options:
98105
* `attributePrefix`: The prefix for attributes so that we can differentiate attributes and elements. This will be the prefix for field names. Default is `_`.
99106
* `valueTag`: The tag used for the value when there are attributes in the element having no child. Default is `_VALUE`.
100107
* `compression`: compression codec to use when saving to file. Should be the fully qualified name of a class implementing `org.apache.hadoop.io.compress.CompressionCodec` or one of case-insensitive shorten names (`bzip2`, `gzip`, `lz4`, and `snappy`). Defaults to no compression when a codec is not specified.
108+
* `timestampFormat`: Controls the format used to write `TimestampType` format columns.
109+
The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
110+
Defaults to [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT). New in 0.12.0.
111+
* `dateFormat`: Controls the format used to write `DateType` format columns.
112+
The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
113+
Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0.
101114

102115
Currently it supports the shortened name usage. You can use just `xml` instead of `com.databricks.spark.xml`.
103116

build.sbt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ mimaBinaryIssueFilters ++= {
101101
exclude[DirectMissingMethodProblem](
102102
"com.databricks.spark.xml.util.TypeCast.supportedXmlTimestampFormatters"),
103103
exclude[DirectMissingMethodProblem](
104-
"com.databricks.spark.xml.util.TypeCast.parseXmlTimestamp")
104+
"com.databricks.spark.xml.util.TypeCast.parseXmlTimestamp"),
105+
exclude[DirectMissingMethodProblem](
106+
"com.databricks.spark.xml.util.TypeCast.isTimestamp")
105107
)
106108
}

src/main/scala/com/databricks/spark/xml/XmlOptions.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ private[xml] class XmlOptions(
5656
val wildcardColName =
5757
parameters.getOrElse("wildcardColName", XmlOptions.DEFAULT_WILDCARD_COL_NAME)
5858
val ignoreNamespace = parameters.get("ignoreNamespace").map(_.toBoolean).getOrElse(false)
59+
val timestampFormat = parameters.get("timestampFormat")
60+
val dateFormat = parameters.get("dateFormat")
5961
}
6062

6163
private[xml] object XmlOptions {

src/main/scala/com/databricks/spark/xml/parsers/StaxXmlGenerator.scala

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
*/
1616
package com.databricks.spark.xml.parsers
1717

18+
import java.sql.{Date, Timestamp}
1819
import java.time.format.DateTimeFormatter
1920

2021
import javax.xml.stream.XMLStreamWriter
@@ -46,10 +47,10 @@ private[xml] object StaxXmlGenerator {
4647
// elements when given values are `null`.
4748
case (_, _, _) if name == options.valueTag =>
4849
// If this is meant to be value but in no child, write only a value
49-
writeElement(dt, v)
50+
writeElement(dt, v, options)
5051
case (_, _, _) =>
5152
writer.writeStartElement(name)
52-
writeElement(dt, v)
53+
writeElement(dt, v, options)
5354
writer.writeEndElement()
5455
}
5556

@@ -75,11 +76,17 @@ private[xml] object StaxXmlGenerator {
7576
}
7677
}
7778

78-
def writeElement(dt: DataType, v: Any): Unit = (dt, v) match {
79+
def writeElement(dt: DataType, v: Any, options: XmlOptions): Unit = (dt, v) match {
7980
case (_, null) | (NullType, _) => writer.writeCharacters(options.nullValue)
8081
case (StringType, v: String) => writer.writeCharacters(v)
81-
case (TimestampType, v: java.sql.Timestamp) =>
82-
writer.writeCharacters(DateTimeFormatter.ISO_INSTANT.format(v.toInstant()))
82+
case (TimestampType, v: Timestamp) =>
83+
val formatter = options.timestampFormat.map(DateTimeFormatter.ofPattern).
84+
getOrElse(DateTimeFormatter.ISO_INSTANT)
85+
writer.writeCharacters(formatter.format(v.toInstant()))
86+
case (DateType, v: Date) =>
87+
val formatter = options.dateFormat.map(DateTimeFormatter.ofPattern).
88+
getOrElse(DateTimeFormatter.ISO_DATE)
89+
writer.writeCharacters(formatter.format(v.toLocalDate()))
8390
case (IntegerType, v: Int) => writer.writeCharacters(v.toString)
8491
case (ShortType, v: Short) => writer.writeCharacters(v.toString)
8592
case (FloatType, v: Float) => writer.writeCharacters(v.toString)
@@ -88,7 +95,6 @@ private[xml] object StaxXmlGenerator {
8895
case (DecimalType(), v: java.math.BigDecimal) => writer.writeCharacters(v.toString)
8996
case (ByteType, v: Byte) => writer.writeCharacters(v.toString)
9097
case (BooleanType, v: Boolean) => writer.writeCharacters(v.toString)
91-
case (DateType, _) => writer.writeCharacters(v.toString)
9298

9399
// For the case roundtrip in reading and writing XML files, [[ArrayType]] cannot have
94100
// [[ArrayType]] as element type. It always wraps the element with [[StructType]]. So,
@@ -142,7 +148,7 @@ private[xml] object StaxXmlGenerator {
142148
val (names, values) = elements.unzip
143149
val elementSchema = StructType(schema.filter(names.contains))
144150
val elementRow = Row.fromSeq(row.toSeq.filter(values.contains))
145-
writeElement(elementSchema, elementRow)
151+
writeElement(elementSchema, elementRow, options)
146152
writer.writeEndElement()
147153
}
148154
}

src/main/scala/com/databricks/spark/xml/util/InferSchema.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,8 @@ private[xml] object InferSchema {
118118
case v if isInteger(v) => IntegerType
119119
case v if isDouble(v) => DoubleType
120120
case v if isBoolean(v) => BooleanType
121-
case v if isTimestamp(v) => TimestampType
122-
case v if isDate(v) => DateType
121+
case v if isTimestamp(v, options) => TimestampType
122+
case v if isDate(v, options) => DateType
123123
case _ => StringType
124124
}
125125
} else {

src/main/scala/com/databricks/spark/xml/util/TypeCast.scala

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ private[xml] object TypeCast {
6464
.getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).doubleValue())
6565
case _: BooleanType => parseXmlBoolean(datum)
6666
case _: DecimalType => new BigDecimal(datum.replaceAll(",", ""))
67-
case _: TimestampType => parseXmlTimestamp(datum)
68-
case _: DateType => parseXmlDate(datum)
67+
case _: TimestampType => parseXmlTimestamp(datum, options)
68+
case _: DateType => parseXmlDate(datum, options)
6969
case _: StringType => datum
7070
case _ => throw new IllegalArgumentException(s"Unsupported type: ${castType.typeName}")
7171
}
@@ -86,8 +86,10 @@ private[xml] object TypeCast {
8686
DateTimeFormatter.ISO_DATE
8787
)
8888

89-
private def parseXmlDate(value: String): Date = {
90-
supportedXmlDateFormatters.foreach { format =>
89+
private def parseXmlDate(value: String, options: XmlOptions): Date = {
90+
val formatters = options.dateFormat.map(DateTimeFormatter.ofPattern).
91+
map(supportedXmlDateFormatters :+ _).getOrElse(supportedXmlDateFormatters)
92+
formatters.foreach { format =>
9193
try {
9294
return Date.valueOf(LocalDate.parse(value, format))
9395
} catch {
@@ -114,8 +116,10 @@ private[xml] object TypeCast {
114116
DateTimeFormatter.ISO_INSTANT
115117
)
116118

117-
private def parseXmlTimestamp(value: String): Timestamp = {
118-
supportedXmlTimestampFormatters.foreach { format =>
119+
private def parseXmlTimestamp(value: String, options: XmlOptions): Timestamp = {
120+
val formatters = options.timestampFormat.map(DateTimeFormatter.ofPattern).
121+
map(supportedXmlTimestampFormatters :+ _).getOrElse(supportedXmlTimestampFormatters)
122+
formatters.foreach { format =>
119123
try {
120124
return Timestamp.from(ZonedDateTime.parse(value, format).toInstant)
121125
} catch {
@@ -191,12 +195,22 @@ private[xml] object TypeCast {
191195
(allCatch opt signSafeValue.toLong).isDefined
192196
}
193197

194-
private[xml] def isTimestamp(value: String): Boolean = {
195-
(allCatch opt Timestamp.valueOf(value)).isDefined
198+
private[xml] def isTimestamp(value: String, options: XmlOptions): Boolean = {
199+
try {
200+
parseXmlTimestamp(value, options)
201+
true
202+
} catch {
203+
case _: IllegalArgumentException => false
204+
}
196205
}
197206

198-
private[xml] def isDate(value: String): Boolean = {
199-
(allCatch opt Date.valueOf(value)).isDefined
207+
private[xml] def isDate(value: String, options: XmlOptions): Boolean = {
208+
try {
209+
parseXmlDate(value, options)
210+
true
211+
} catch {
212+
case _: IllegalArgumentException => false
213+
}
200214
}
201215

202216
private[xml] def signSafeToLong(value: String, options: XmlOptions): Long = {

src/test/resources/date.xml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
<book>
22
<author>John Smith</author>
3-
<date>2021-01-01</date>
3+
<date>2021-02-01</date>
4+
<date2>02-01-2021</date2>
45
</book>

src/test/resources/time.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<book>
2+
<author>John Smith</author>
3+
<time>2011-12-03T10:15:30Z</time>
4+
<time2>12-03-2011 10:15:30 PST</time2>
5+
</book>

src/test/scala/com/databricks/spark/xml/XmlSuite.scala

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
4848
master("local[2]").
4949
appName("XmlSuite").
5050
config("spark.ui.enabled", false).
51+
config("spark.sql.session.timeZone", "UTC").
5152
getOrCreate()
5253
}
5354
private var tempDir: Path = _
@@ -1298,19 +1299,64 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
12981299
}
12991300

13001301
test("Test date parsing") {
1301-
val schema = buildSchema(field("author"), field("date", DateType))
1302+
val schema = buildSchema(field("author"), field("date", DateType), field("date2", StringType))
13021303
val df = spark.read
13031304
.option("rowTag", "book")
13041305
.schema(schema)
13051306
.xml(resDir + "date.xml")
1306-
assert(df.collect().head.getAs[Date](1).toString === "2021-01-01")
1307+
assert(df.collect().head.getAs[Date](1).toString === "2021-02-01")
13071308
}
13081309

13091310
test("Test date type inference") {
13101311
val df = spark.read
13111312
.option("rowTag", "book")
13121313
.xml(resDir + "date.xml")
1313-
assert(df.dtypes(1) === ("date", "DateType"))
1314+
val expectedSchema =
1315+
buildSchema(field("author"), field("date", DateType), field("date2", StringType))
1316+
assert(df.schema === expectedSchema)
1317+
assert(df.collect().head.getAs[Date](1).toString === "2021-02-01")
1318+
}
1319+
1320+
test("Test timestamp parsing") {
1321+
val schema =
1322+
buildSchema(field("author"), field("time", TimestampType), field("time2", StringType))
1323+
val df = spark.read
1324+
.option("rowTag", "book")
1325+
.schema(schema)
1326+
.xml(resDir + "time.xml")
1327+
assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L)
1328+
}
1329+
1330+
test("Test timestamp type inference") {
1331+
val df = spark.read
1332+
.option("rowTag", "book")
1333+
.xml(resDir + "time.xml")
1334+
val expectedSchema =
1335+
buildSchema(field("author"), field("time", TimestampType), field("time2", StringType))
1336+
assert(df.schema === expectedSchema)
1337+
assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L)
1338+
}
1339+
1340+
test("Test dateFormat") {
1341+
val df = spark.read
1342+
.option("rowTag", "book")
1343+
.option("dateFormat", "MM-dd-yyyy")
1344+
.xml(resDir + "date.xml")
1345+
val expectedSchema =
1346+
buildSchema(field("author"), field("date", DateType), field("date2", DateType))
1347+
assert(df.schema === expectedSchema)
1348+
assert(df.collect().head.getAs[Date](2).toString === "2021-02-01")
1349+
}
1350+
1351+
test("Test timestampFormat") {
1352+
val df = spark.read
1353+
.option("rowTag", "book")
1354+
.option("timestampFormat", "MM-dd-yyyy HH:mm:ss z")
1355+
.xml(resDir + "time.xml")
1356+
val expectedSchema =
1357+
buildSchema(field("author"), field("time", TimestampType), field("time2", TimestampType))
1358+
assert(df.schema === expectedSchema)
1359+
assert(df.collect().head.getAs[Timestamp](2).getTime === 1322936130000L)
13141360
}
13151361

13161362
private def getLines(path: Path): Seq[String] = {

src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ final class TypeCastSuite extends AnyFunSuite {
155155
assert(TypeCast.isLong("10"))
156156
assert(TypeCast.isDouble("+10.1"))
157157
val timestamp = "2015-01-01 00:00:00"
158-
assert(TypeCast.isTimestamp(timestamp))
158+
assert(TypeCast.isTimestamp(timestamp, new XmlOptions()))
159159
}
160160

161161
test("Float and Double Types are cast correctly with Locale") {

0 commit comments

Comments
 (0)