
Commit 42e1af1

Solution remodelled: objects extended for pattern independence, default ISO pattern used when no pattern is specified, and direct casting to DateType & TimestampType
1 parent 577ec48 · commit 42e1af1

File tree

7 files changed: +216 −148 lines changed

src/main/scala/za/co/absa/standardization/schema/MetadataKeys.scala

Lines changed: 4 additions & 0 deletions
```diff
@@ -42,6 +42,10 @@ object MetadataKeys {
   val StrictParsing = "strict_parsing"
   // For nonstandard data inputs like the Mainframe's century pattern
   val IsNonStandard = "is_non_standard"
+  // For allowing separate infinity patterns
+  val PlusInfinityPattern = "plus_infinity_pattern"
+  val MinusInfinityPattern = "minus_infinity_pattern"
+
 }

 object MetadataValues {
```
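
For context, a minimal sketch of how a field could opt into the new keys via Spark's `MetadataBuilder`. The `minus_infinity_symbol`/`minus_infinity_value` key strings are assumed to mirror the existing symbol/value keys defined elsewhere in `MetadataKeys`; only the two `*_infinity_pattern` keys are introduced by this commit.

```scala
import org.apache.spark.sql.types.{DateType, MetadataBuilder, StructField, StructType}

// Hypothetical field metadata: a -infinity marker whose replacement value is
// written in its own pattern, independent of the field's main pattern.
val metadata = new MetadataBuilder()
  .putString("minus_infinity_symbol", "-INF")        // assumed existing key
  .putString("minus_infinity_value", "01-01-0001")   // assumed existing key
  .putString("minus_infinity_pattern", "dd-MM-yyyy") // new key from this commit
  .build()

val schema = StructType(Seq(StructField("valid_from", DateType, nullable = true, metadata)))
```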

src/main/scala/za/co/absa/standardization/stages/InfinitySupport.scala

Lines changed: 81 additions & 113 deletions
```diff
@@ -16,133 +16,101 @@

 package za.co.absa.standardization.stages

-import org.apache.spark.sql.functions.{to_timestamp,lit, when,coalesce,to_date}
-import org.apache.spark.sql.{Column, Row, SparkSession}
-import org.apache.spark.sql.types.{DataType, DateType, StringType, StructField, StructType, TimestampType}
-import za.co.absa.standardization.types.{TypeDefaults, TypedStructField}
-import za.co.absa.standardization.types.TypedStructField.DateTimeTypeStructField
-import java.sql.Timestamp
-import scala.collection.JavaConverters._
+import org.apache.spark.sql.functions.{lit, when}
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.types.{DataType, DateType, TimestampType}
+import za.co.absa.standardization.types.parsers.DateTimeParser
+import za.co.absa.standardization.time.{DateTimePattern, InfinityConfig}
+
+import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
-import java.util.Date
+import java.util.Locale
+import scala.util.Try



 trait InfinitySupport {
   protected def infMinusSymbol: Option[String]
+
   protected def infMinusValue: Option[String]
+
   protected def infPlusSymbol: Option[String]
+
   protected def infPlusValue: Option[String]
+  protected def infMinusPattern: Option[String]
+  protected def infPlusPattern: Option[String]
   protected val origType: DataType
-  protected def field: TypedStructField
-
-
-  private def sanitizeInput(s: String): String = {
-    if (s.matches("[a-zA-Z0-9:.-]+")) s
-    else {
-      throw new IllegalArgumentException(s"Invalid input '$s': must be alphanumeric , colon, dot or hyphen")
-    }
-  }
-
-  private def getPattern(dataType: DataType): Option[String] = {
-    dataType match {
-      case DateType | TimestampType =>
-        field match {
-          case dateField: DateTimeTypeStructField[_] =>
-            dateField.pattern.toOption.flatten.map(_.pattern)
-          case _ => None
-        }
-      case _ => None
-    }
-  }
-
-  private def validateAndConvertInfinityValue(value: String, dataType: DataType, patternOpt: Option[String], spark:SparkSession): String = {
-    val sanitizedValue = sanitizeInput(value)
-    val schema = StructType(Seq(StructField("value", StringType, nullable = false)))
-    val df = spark.createDataFrame(spark.sparkContext.parallelize(Seq(Row(sanitizedValue))), schema)
-
-    val parsedWithPattern = patternOpt.flatMap { pattern =>
-      val parsedCol = dataType match {
-        case TimestampType =>to_timestamp(df.col("value"), pattern)
-        case DateType => to_date(df.col("value"), pattern)
-        case _ => df.col("value").cast(dataType)
-      }
-      val result = df.select(parsedCol.alias("parsed")).first().get(0)
-      if (result != null) Some(sanitizedValue) else None
-    }
-
-    if (parsedWithPattern.isDefined) {
-      parsedWithPattern.get
-    } else {
-      val isoPattern = dataType match {
-        case TimestampType => "yyyy-MM-dd'T'HH:mm:ss.SSSSSS"
-        case DateType => "yyyy-MM-dd"
-        case _ => ""
-      }
-      val parsedWithISO = dataType match {
-        case TimestampType => df.select(to_timestamp(df.col("value"), isoPattern)).alias("parsed").first().getAs[Timestamp](0)
-        case DateType => df.select(to_date(df.col("value"), isoPattern)).alias("parsed").first().getAs[Date](0)
-        case _ => null
-      }
-      if (parsedWithISO != null) {
-        patternOpt.getOrElse(isoPattern) match {
-          case pattern =>
-            val dateFormat = new SimpleDateFormat(pattern)
-            dateFormat.format(parsedWithISO)
-        }
-      } else{
-        throw new IllegalArgumentException(s"Invalid infinity value: '$value' for type: $dataType with pattern ${patternOpt.getOrElse("none")} and ISO fallback ($isoPattern)")
-      }
-    }
-  }
-
-
-  def replaceInfinitySymbols(column: Column)(implicit spark:SparkSession, defaults: TypeDefaults): Column = {
-    var resultCol = column.cast(StringType)
+  protected val targetType: DataType

-    val validatedMinus = if (origType == DateType || origType == TimestampType) {
-      infMinusValue.map( v => validateAndConvertInfinityValue(v, origType, getPattern(origType),spark))
-    } else {
-      infMinusValue.map(sanitizeInput)
-    }
-
-    val validatedPlus = if (origType == DateType || origType == TimestampType){
-      infPlusValue.map(v => validateAndConvertInfinityValue(v, origType, getPattern(origType),spark))
-    } else{
-      infPlusValue.map(sanitizeInput)
-    }
+  def replaceInfinitySymbols(column: Column): Column = {
+    targetType match {
+      case DateType =>
+        val defaultDatePattern = "yyyy-MM-dd"
+        val minusDate = infMinusValue.flatMap { value =>
+          infMinusSymbol.map { symbol =>
+            when(
+              column === lit(symbol).cast(origType),
+              lit(parseInfinityValue(value, infMinusPattern.getOrElse(defaultDatePattern)).getTime)
+                .cast(TimestampType)
+                .cast(DateType)
+            )
+          }
+        }.getOrElse(column)
+
+        infPlusValue.flatMap { value =>
+          infPlusSymbol.map { symbol =>
+            when(
+              minusDate === lit(symbol).cast(origType),
+              lit(parseInfinityValue(value, infPlusPattern.getOrElse(defaultDatePattern)).getTime)
+                .cast(TimestampType)
+                .cast(DateType)
+            ).otherwise(minusDate)
+          }
+        }.getOrElse(minusDate)

-    validatedMinus.foreach { v =>
-      infMinusSymbol.foreach { s =>
-        resultCol = when(resultCol === lit(s), lit(v)).otherwise(resultCol)
-      }
-    }
+      case TimestampType =>
+        val defaultTimestampPattern = "yyyy-MM-dd HH:mm:ss"
+        val minusTimestamp = infMinusValue.flatMap { value =>
+          infMinusSymbol.map { symbol =>
+            when(
+              column === lit(symbol).cast(origType),
+              lit(parseInfinityValue(value, infMinusPattern.getOrElse(defaultTimestampPattern)).getTime)
+                .cast(TimestampType)
+            )
+          }
+        }.getOrElse(column)
+
+        infPlusValue.flatMap { value =>
+          infPlusSymbol.map { symbol =>
+            when(
+              minusTimestamp === lit(symbol).cast(origType),
+              lit(parseInfinityValue(value, infPlusPattern.getOrElse(defaultTimestampPattern)).getTime)
+                .cast(TimestampType)
+            ).otherwise(minusTimestamp)
+          }
+        }.getOrElse(minusTimestamp)

-    validatedPlus.foreach { v =>
-      infPlusSymbol.foreach { s =>
-        resultCol = when(resultCol === lit(s), lit(v)).otherwise(resultCol)
+      case _ =>
+        val columnWithNegativeInf: Column = infMinusSymbol.flatMap { minusSymbol =>
+          infMinusValue.map { minusValue =>
+            when(column === lit(minusSymbol).cast(origType), lit(minusValue).cast(origType)).otherwise(column)
+          }
+        }.getOrElse(column)
+
+        infPlusSymbol.flatMap { plusSymbol =>
+          infPlusValue.map { plusValue =>
+            when(columnWithNegativeInf === lit(plusSymbol).cast(origType), lit(plusValue).cast(origType))
+              .otherwise(columnWithNegativeInf)
+          }
+        }.getOrElse(columnWithNegativeInf)
     }
-  }
+  }

-    origType match {
-      case TimestampType =>
-        val pattern = getPattern(origType).getOrElse(
-          defaults.defaultTimestampTimeZone.map(_ => "yyyy-MM-dd'T'HH:mm:ss.SSSSSS").getOrElse("yyyy-MM-dd HH:mm:ss")
-        )
-        coalesce(
-          to_timestamp(resultCol,pattern),
-          to_timestamp(resultCol,"yyyy-MM-dd'T'HH:mm:ss.SSSSSS")
-        ).cast(origType)
-      case DateType =>
-        val pattern = getPattern(origType).getOrElse(
-          defaults.defaultDateTimeZone.map(_ => "yyyy-MM-dd").getOrElse("yyyy-MM-dd")
-        )
-        coalesce(
-          to_date(resultCol,pattern),
-          to_date(resultCol, "yyyy-MM-dd")
-        ).cast(origType)
-      case _ =>
-        resultCol.cast(origType)
-    }
+  private def parseInfinityValue(value: String, pattern: String): Date = {
+    val dateFormat = new SimpleDateFormat(pattern, Locale.US)
+    dateFormat.setLenient(false)
+    new Date(dateFormat.parse(value).getTime)
   }
 }
+
```
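
The reworked trait no longer needs a `SparkSession` or a round trip through a one-row `DataFrame`: infinity replacement is now a pure `Column` expression, and date/timestamp infinity values are parsed on the driver with a strict, `Locale.US`-pinned `SimpleDateFormat`, so an out-of-range value such as `2023-13-40` fails fast instead of rolling over. A minimal sketch of a concrete mix-in, with illustrative symbols, values, and column name:

```scala
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{DataType, DateType, StringType}
import za.co.absa.standardization.stages.InfinitySupport

// Hypothetical mix-in; the symbols, values, and column name are examples only.
object DateInfinityExample extends InfinitySupport {
  override protected val infMinusSymbol: Option[String] = Some("-INF")
  override protected val infMinusValue: Option[String] = Some("0001-01-01")
  override protected val infPlusSymbol: Option[String] = Some("+INF")
  override protected val infPlusValue: Option[String] = Some("9999-12-31")
  // No explicit patterns: the trait falls back to the ISO default "yyyy-MM-dd".
  override protected val infMinusPattern: Option[String] = None
  override protected val infPlusPattern: Option[String] = None
  override protected val origType: DataType = StringType // type the raw column arrives as
  override protected val targetType: DataType = DateType // selects the DateType branch above

  def standardize(raw: Column): Column = replaceInfinitySymbols(raw)
}

// e.g. df.withColumn("valid_to", DateInfinityExample.standardize(col("valid_to")))
```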

src/main/scala/za/co/absa/standardization/stages/TypeParser.scala

Lines changed: 11 additions & 5 deletions
```diff
@@ -16,7 +16,7 @@

 package za.co.absa.standardization.stages

-import org.apache.spark.sql.{Column, SparkSession}
+import org.apache.spark.sql.Column
 import org.apache.spark.sql.expressions.UserDefinedFunction
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
@@ -138,7 +138,7 @@ object TypeParser {
                   origSchema: StructType,
                   stdConfig: StandardizationConfig,
                   failOnInputNotPerSchema: Boolean = true)
-                  (implicit spark: SparkSession, defaults: TypeDefaults): ParseOutput = {
+                  (implicit defaults: TypeDefaults): ParseOutput = {
     // udfLib implicit is present for error column UDF implementation
     val sourceName = SchemaUtils.appendPath(path, field.sourceName)
     val origField = origSchema.getField(sourceName)
@@ -165,7 +165,7 @@ object TypeParser {
                       origType: DataType,
                       failOnInputNotPerSchema: Boolean,
                       isArrayElement: Boolean = false)
-                     (implicit spark:SparkSession, defaults: TypeDefaults): TypeParser[_] = {
+                     (implicit defaults: TypeDefaults): TypeParser[_] = {
     val parserClass: (String, Column, DataType, Boolean, Boolean) => TypeParser[_] = field.dataType match {
       case _: ArrayType => ArrayParser(TypedStructField.asArrayTypeStructField(field), _, _, _, _, _)
       case _: StructType => StructParser(TypedStructField.asStructTypeStructField(field), _, _, _, _, _)
@@ -318,11 +318,14 @@ object TypeParser {
   }

   private abstract class NumericParser[N: TypeTag](override val field: NumericTypeStructField[N])
-                                                   (implicit spark: SparkSession, defaults: TypeDefaults) extends ScalarParser[N] with InfinitySupport {
+                                                   (implicit defaults: TypeDefaults) extends ScalarParser[N] with InfinitySupport {
     override protected val infMinusSymbol: Option[String] = metadata.getOptString(MetadataKeys.MinusInfinitySymbol)
     override protected val infMinusValue: Option[String] = metadata.getOptString(MetadataKeys.MinusInfinityValue)
     override protected val infPlusSymbol: Option[String] = metadata.getOptString(MetadataKeys.PlusInfinitySymbol)
     override protected val infPlusValue: Option[String] = metadata.getOptString(MetadataKeys.PlusInfinityValue)
+    override protected val infMinusPattern: Option[String] = metadata.getOptString(MetadataKeys.MinusInfinityPattern)
+    override protected val infPlusPattern: Option[String] = metadata.getOptString(MetadataKeys.PlusInfinityPattern)
+    override protected val targetType: DataType = field.dataType
     private val columnWithInfinityReplaced = replaceInfinitySymbols(column)

     override protected def standardizeAfterCheck(stdConfig: StandardizationConfig)(implicit logger: Logger): ParseOutput = {
@@ -384,7 +387,7 @@ object TypeParser {
                                failOnInputNotPerSchema: Boolean,
                                isArrayElement: Boolean,
                                overflowableTypes: Set[DataType])
-                               (implicit spark:SparkSession, defaults: TypeDefaults) extends NumericParser[N](field) {
+                               (implicit defaults: TypeDefaults) extends NumericParser[N](field) {
     override protected def assemblePrimitiveCastErrorLogic(castedCol: Column): Column = {
       val basicLogic: Column = super.assemblePrimitiveCastErrorLogic(castedCol)

@@ -510,6 +513,9 @@ object TypeParser {
     override protected val infMinusValue: Option[String] = metadata.getOptString(MetadataKeys.MinusInfinityValue)
     override protected val infPlusSymbol: Option[String] = metadata.getOptString(MetadataKeys.PlusInfinitySymbol)
     override protected val infPlusValue: Option[String] = metadata.getOptString(MetadataKeys.PlusInfinityValue)
+    override protected val infMinusPattern: Option[String] = metadata.getOptString(MetadataKeys.MinusInfinityPattern)
+    override protected val infPlusPattern: Option[String] = metadata.getOptString(MetadataKeys.PlusInfinityPattern)
+    override protected val targetType: DataType = field.dataType
     private val columnWithInfinityReplaced: Column = replaceInfinitySymbols(column)

     protected val replaceCenturyUDF: UserDefinedFunction = udf((inputDate: String, centuryPattern: String) => {
```
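
With infinity replacement now expression-only, the parsers satisfy the trait without an implicit `SparkSession`; everything they need comes from the field's metadata. A rough sketch of what the new overrides resolve to, assuming a small `getOptString` helper equivalent to the one the codebase uses (the helper and its name here are illustrative, not the actual implementation):

```scala
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

// Hypothetical helper: absent keys become None instead of throwing.
def getOptString(metadata: Metadata, key: String): Option[String] =
  if (metadata.contains(key)) Option(metadata.getString(key)) else None

val metadata: Metadata = new MetadataBuilder()
  .putString("plus_infinity_pattern", "yyyy/MM/dd")
  .build()

val infPlusPattern  = getOptString(metadata, "plus_infinity_pattern")  // Some("yyyy/MM/dd")
val infMinusPattern = getOptString(metadata, "minus_infinity_pattern") // None -> ISO default applies
```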

src/main/scala/za/co/absa/standardization/time/DateTimePattern.scala

Lines changed: 28 additions & 17 deletions
```diff
@@ -26,7 +26,9 @@ import za.co.absa.standardization.types.{Section, TypePattern}
   * @param pattern actual pattern to format the type conversion
   * @param isDefault marks if the pattern is actually an assigned value or taken for global defaults
   */
-abstract sealed class DateTimePattern(pattern: String, isDefault: Boolean = false)
+
+abstract sealed class DateTimePattern(pattern: String, isDefault: Boolean = false,
+                                      val infinityConfig: Option[InfinityConfig] = None)
   extends TypePattern(pattern, isDefault){

   val isEpoch: Boolean
@@ -81,8 +83,9 @@ object DateTimePattern {
   // scalastyle:on magic.number

   private final case class EpochDTPattern(override val pattern: String,
-                                          override val isDefault: Boolean = false)
-    extends DateTimePattern(pattern, isDefault) {
+                                          override val isDefault: Boolean = false,
+                                          override val infinityConfig: Option[InfinityConfig] = None)
+    extends DateTimePattern(pattern, isDefault, infinityConfig) {

     override val isEpoch: Boolean = true
     override val isCentury: Boolean = false
@@ -115,12 +118,15 @@ object DateTimePattern {
       case _ => Seq.empty
     }
     override val patternWithoutSecondFractions: String = EpochKeyword
+
+
   }

   private abstract class StandardDTPatternBase(override val pattern: String,
                                                assignedDefaultTimeZone: Option[String],
-                                               override val isDefault: Boolean = false)
-    extends DateTimePattern(pattern, isDefault) {
+                                               override val isDefault: Boolean = false,
+                                               override val infinityConfig: Option[InfinityConfig] = None)
+    extends DateTimePattern(pattern, isDefault, infinityConfig) {

     override val isEpoch: Boolean = false
     override val epochFactor: Long = 0
@@ -151,8 +157,9 @@ object DateTimePattern {

   private final case class StandardDTPattern(override val pattern: String,
                                              assignedDefaultTimeZone: Option[String] = None,
-                                             override val isDefault: Boolean = false)
-    extends StandardDTPatternBase(pattern, assignedDefaultTimeZone, isDefault) {
+                                             override val isDefault: Boolean = false,
+                                             override val infinityConfig: Option[InfinityConfig] = None)
+    extends StandardDTPatternBase(pattern, assignedDefaultTimeZone, isDefault, infinityConfig) {

     override val isCentury: Boolean = false
     override val originalPattern: Option[String] = None
@@ -161,35 +168,39 @@ object DateTimePattern {
   private final case class CenturyDTPattern(override val pattern: String,
                                             override val originalPattern: Option[String],
                                             assignedDefaultTimeZone: Option[String] = None,
-                                            override val isDefault: Boolean = false)
-    extends StandardDTPatternBase(pattern, assignedDefaultTimeZone, isDefault) {
+                                            override val isDefault: Boolean = false,
+                                            override val infinityConfig: Option[InfinityConfig] = None)
+    extends StandardDTPatternBase(pattern, assignedDefaultTimeZone, isDefault, infinityConfig) {

     override val isCentury: Boolean = true
   }

   private def create(pattern: String,
                      assignedDefaultTimeZone: Option[String],
                      isCenturyPattern: Boolean,
-                     isDefault: Boolean): DateTimePattern = {
+                     isDefault: Boolean,
+                     infinityConfig: Option[InfinityConfig]): DateTimePattern = {
     if (isEpoch(pattern)) {
-      EpochDTPattern(pattern, isDefault)
+      EpochDTPattern(pattern, isDefault, infinityConfig)
     } else if (isCenturyPattern && isCentury(pattern)) {
       val patternWithoutCentury = pattern.replaceAll(patternCenturyChar, "yy")
-      CenturyDTPattern(patternWithoutCentury, Some(pattern), assignedDefaultTimeZone, isDefault)
+      CenturyDTPattern(patternWithoutCentury, Some(pattern), assignedDefaultTimeZone, isDefault, infinityConfig)
     } else {
-      StandardDTPattern(pattern, assignedDefaultTimeZone, isDefault)
+      StandardDTPattern(pattern, assignedDefaultTimeZone, isDefault, infinityConfig)
     }
   }

   def apply(pattern: String,
             assignedDefaultTimeZone: Option[String] = None,
-            isCenturyPattern: Boolean = false): DateTimePattern = {
-    create(pattern, assignedDefaultTimeZone, isCenturyPattern, isDefault = false)
+            isCenturyPattern: Boolean = false,
+            infinityConfig: Option[InfinityConfig] = None): DateTimePattern = {
+    create(pattern, assignedDefaultTimeZone, isCenturyPattern, isDefault = false, infinityConfig)
   }

   def asDefault(pattern: String,
-                assignedDefaultTimeZone: Option[String] = None): DateTimePattern = {
-    create(pattern, assignedDefaultTimeZone, isCenturyPattern = false, isDefault = true)
+                assignedDefaultTimeZone: Option[String] = None,
+                infinityConfig: Option[InfinityConfig] = None): DateTimePattern = {
+    create(pattern, assignedDefaultTimeZone, isCenturyPattern = false, isDefault = true, infinityConfig)
   }

   def isEpoch(pattern: String): Boolean = {
```
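
Every concrete pattern class and both factories now thread the optional `InfinityConfig` through, defaulting to `None` so existing call sites compile unchanged. `InfinityConfig` itself is defined in another file of this commit and is not shown here, so this sketch leaves it unset:

```scala
import za.co.absa.standardization.time.DateTimePattern

// Existing call sites are unaffected; infinityConfig defaults to None.
val explicit: DateTimePattern = DateTimePattern("dd-MM-yyyy")

// New call sites can pass a config by name once one is constructed.
val withConfig: DateTimePattern = DateTimePattern("dd-MM-yyyy", infinityConfig = None)
val default: DateTimePattern = DateTimePattern.asDefault("yyyy-MM-dd", infinityConfig = None)
```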
