Skip to content

Commit a13a54d

Browse files
authored
#64 Add support for non standard century pattern
1 parent 8e36e5f commit a13a54d

File tree

12 files changed

+193
-25
lines changed

12 files changed

+193
-25
lines changed

.github/CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
* @Zejnilovic @dk1844 @benedeki @lsulak
1+
* @Zejnilovic @dk1844 @benedeki @lsulak @yruslan

.github/workflows/build.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ on:
2121
branches: [ main, develop, master ]
2222
pull_request:
2323
branches: [ master, develop ]
24-
types: [ assigned, opened, synchronize, reopened, labeled ]
2524

2625
jobs:
2726
test:

.github/workflows/format_check.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ name: FormatCheck
1919
on:
2020
pull_request:
2121
branches: [ master, develop ]
22-
types: [ assigned, opened, synchronize, reopened, labeled ]
2322

2423
jobs:
2524
scalafmt:

.github/workflows/jacoco_check.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ name: JaCoCo report
1919
on:
2020
pull_request:
2121
branches: [ master ]
22-
types: [ opened, edited, synchronize, reopened ]
2322

2423
jobs:
2524
test:

.github/workflows/licence_check.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ on:
2121
branches: [ main, develop, master ]
2222
pull_request:
2323
branches: [ master ]
24-
types: [ assigned, opened, synchronize, reopened, labeled ]
2524

2625
jobs:
2726
license-test:

src/main/scala/za/co/absa/standardization/schema/MetadataKeys.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ object MetadataKeys {
4040
val Encoding = "encoding"
4141
//decimal
4242
val StrictParsing = "strict_parsing"
43+
// For nonstandard data inputs like the Mainframe's century pattern
44+
val IsNonStandard = "is_non_standard"
4345
}
4446

4547
object MetadataValues {

src/main/scala/za/co/absa/standardization/stages/InfinitySupport.scala

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,9 @@ import org.apache.spark.sql.types.DataType
2222

2323
trait InfinitySupport {
2424
protected def infMinusSymbol: Option[String]
25-
2625
protected def infMinusValue: Option[String]
27-
2826
protected def infPlusSymbol: Option[String]
29-
3027
protected def infPlusValue: Option[String]
31-
3228
protected val origType: DataType
3329

3430
def replaceInfinitySymbols(column: Column): Column = {

src/main/scala/za/co/absa/standardization/stages/TypeParser.scala

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import za.co.absa.standardization.schema.{MetadataKeys, MetadataValues, StdSchem
3434
import za.co.absa.standardization.time.DateTimePattern
3535
import za.co.absa.standardization.typeClasses.{DoubleLike, LongLike}
3636
import za.co.absa.standardization.types.TypedStructField._
37+
import za.co.absa.standardization.types.parsers.DateTimeParser
3738
import za.co.absa.standardization.types.{ParseOutput, TypeDefaults, TypedStructField}
3839
import za.co.absa.standardization.udf.{UDFBuilder, UDFNames}
3940

@@ -511,6 +512,17 @@ object TypeParser {
511512
override protected val infPlusValue: Option[String] = metadata.getOptString(MetadataKeys.PlusInfinityValue)
512513
private val columnWithInfinityReplaced: Column = replaceInfinitySymbols(column)
513514

515+
protected val replaceCenturyUDF: UserDefinedFunction = udf((inputDate: String, centuryPattern: String) => {
516+
val centuryIndex = centuryPattern.indexOf(DateTimePattern.patternCenturyChar)
517+
val padding = centuryPattern.length - inputDate.length
518+
val leftPadding = "0" * padding
519+
val pendedInput = leftPadding + inputDate
520+
521+
val charAtPos = pendedInput.charAt(centuryIndex).asDigit
522+
val modifiedChar = (charAtPos + 19).toString // Add 19 and convert back to string
523+
pendedInput.substring(0, centuryIndex) + modifiedChar + pendedInput.substring(centuryIndex + 1)
524+
})
525+
514526
override protected def assemblePrimitiveCastLogic: Column = {
515527
if (pattern.isEpoch) {
516528
castEpoch()
@@ -604,14 +616,19 @@ object TypeParser {
604616
}
605617

606618
override protected def castStringColumn(stringColumn: Column): Column = {
619+
val columWithCenturyReplaced: Column =
620+
if (pattern.isCentury && metadata.getOptStringAsBoolean(MetadataKeys.IsNonStandard).getOrElse(false)) {
621+
replaceCenturyUDF(stringColumn, lit(pattern.originalPattern.get))
622+
} else { stringColumn }
623+
607624
if (pattern.containsSecondFractions) {
608625
// date doesn't need to care about second fractions
609626
applyPatternToStringColumn(
610-
stringColumn.removeSections(
627+
columWithCenturyReplaced.removeSections(
611628
Seq(pattern.millisecondsPosition, pattern.microsecondsPosition, pattern.nanosecondsPosition).flatten
612629
), pattern.patternWithoutSecondFractions)
613630
} else {
614-
applyPatternToStringColumn(stringColumn, pattern)
631+
applyPatternToStringColumn(columWithCenturyReplaced, pattern)
615632
}
616633
}
617634

@@ -651,28 +668,33 @@ object TypeParser {
651668
}
652669

653670
override protected def castStringColumn(stringColumn: Column): Column = {
671+
val columWithCenturyReplaced: Column =
672+
if (pattern.isCentury && metadata.getOptStringAsBoolean(MetadataKeys.IsNonStandard).getOrElse(false)) {
673+
replaceCenturyUDF(stringColumn, lit(pattern.originalPattern.get))
674+
} else { stringColumn }
675+
654676
if (pattern.containsSecondFractions) {
655677
// this is a trick to enforce fractions of seconds in the timestamp
656678
// - turn into timestamp up to seconds precision and that into unix_timestamp,
657679
// - the second fractions turn into numeric fractions
658680
// - add both together and convert to timestamp
659681
val colSeconds = unix_timestamp(applyPatternToStringColumn(
660-
stringColumn.removeSections(
682+
columWithCenturyReplaced.removeSections(
661683
Seq(pattern.millisecondsPosition, pattern.microsecondsPosition, pattern.nanosecondsPosition).flatten
662684
), pattern.patternWithoutSecondFractions))
663685

664686
val colMilliseconds: Option[Column] =
665-
pattern.millisecondsPosition.map(stringColumn.zeroBasedSubstr(_).cast(decimalType) / MillisecondsPerSecond)
687+
pattern.millisecondsPosition.map(columWithCenturyReplaced.zeroBasedSubstr(_).cast(decimalType) / MillisecondsPerSecond)
666688
val colMicroseconds: Option[Column] =
667-
pattern.microsecondsPosition.map(stringColumn.zeroBasedSubstr(_).cast(decimalType) / MicrosecondsPerSecond)
689+
pattern.microsecondsPosition.map(columWithCenturyReplaced.zeroBasedSubstr(_).cast(decimalType) / MicrosecondsPerSecond)
668690
val colNanoseconds: Option[Column] =
669-
pattern.nanosecondsPosition.map(stringColumn.zeroBasedSubstr(_).cast(decimalType) / NanosecondsPerSecond)
691+
pattern.nanosecondsPosition.map(columWithCenturyReplaced.zeroBasedSubstr(_).cast(decimalType) / NanosecondsPerSecond)
670692
val colFractions: Column =
671693
(colMilliseconds ++ colMicroseconds ++ colNanoseconds).reduceOption(_ + _).getOrElse(lit(0))
672694

673695
(colSeconds + colFractions).cast(TimestampType)
674696
} else {
675-
applyPatternToStringColumn(stringColumn, pattern)
697+
applyPatternToStringColumn(columWithCenturyReplaced, pattern)
676698
}
677699
}
678700

src/main/scala/za/co/absa/standardization/time/DateTimePattern.scala

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package za.co.absa.standardization.time
1818

1919
import za.co.absa.standardization.implicits.StringImplicits.StringEnhancements
20+
import za.co.absa.standardization.time.DateTimePattern.{patternMicroSecondChar, patternMilliSecondChar, patternNanoSecondChat}
2021
import za.co.absa.standardization.types.{Section, TypePattern}
2122

2223
/**
@@ -29,10 +30,12 @@ abstract sealed class DateTimePattern(pattern: String, isDefault: Boolean = fals
2930
extends TypePattern(pattern, isDefault){
3031

3132
val isEpoch: Boolean
33+
val isCentury: Boolean
3234
val epochFactor: Long
3335

3436
val timeZoneInPattern: Boolean
3537
val defaultTimeZone: Option[String]
38+
val originalPattern: Option[String]
3639
val isTimeZoned: Boolean
3740

3841
val millisecondsPosition: Option[Section]
@@ -47,7 +50,6 @@ abstract sealed class DateTimePattern(pattern: String, isDefault: Boolean = fals
4750
val q = "\""
4851
s"pattern: $q$pattern$q" + defaultTimeZone.map(x => s" (default time zone: $q$x$q)").getOrElse("")
4952
}
50-
5153
}
5254

5355
object DateTimePattern {
@@ -57,6 +59,8 @@ object DateTimePattern {
5759
val EpochMicroKeyword = "epochmicro"
5860
val EpochNanoKeyword = "epochnano"
5961

62+
val patternCenturyChar = "c"
63+
6064
private val epochUnitFactor = 1
6165
private val epoch1kFactor = 1000
6266
private val epoch1MFactor = 1000000
@@ -81,10 +85,12 @@ object DateTimePattern {
8185
extends DateTimePattern(pattern, isDefault) {
8286

8387
override val isEpoch: Boolean = true
88+
override val isCentury: Boolean = false
8489
override val epochFactor: Long = DateTimePattern.epochFactor(pattern)
8590

8691
override val timeZoneInPattern: Boolean = true
8792
override val defaultTimeZone: Option[String] = None
93+
override val originalPattern: Option[String] = None
8894
override val isTimeZoned: Boolean = true
8995

9096
override val millisecondsPosition: Option[Section] = pattern match {
@@ -111,9 +117,9 @@ object DateTimePattern {
111117
override val patternWithoutSecondFractions: String = EpochKeyword
112118
}
113119

114-
private final case class StandardDTPattern(override val pattern: String,
115-
assignedDefaultTimeZone: Option[String] = None,
116-
override val isDefault: Boolean = false)
120+
private abstract class StandardDTPatternBase(override val pattern: String,
121+
assignedDefaultTimeZone: Option[String],
122+
override val isDefault: Boolean = false)
117123
extends DateTimePattern(pattern, isDefault) {
118124

119125
override val isEpoch: Boolean = false
@@ -143,22 +149,47 @@ object DateTimePattern {
143149
}
144150
}
145151

146-
private def create(pattern: String, assignedDefaultTimeZone: Option[String], isDefault: Boolean): DateTimePattern = {
152+
private final case class StandardDTPattern(override val pattern: String,
153+
assignedDefaultTimeZone: Option[String] = None,
154+
override val isDefault: Boolean = false)
155+
extends StandardDTPatternBase(pattern, assignedDefaultTimeZone, isDefault) {
156+
157+
override val isCentury: Boolean = false
158+
override val originalPattern: Option[String] = None
159+
}
160+
161+
private final case class CenturyDTPattern(override val pattern: String,
162+
override val originalPattern: Option[String],
163+
assignedDefaultTimeZone: Option[String] = None,
164+
override val isDefault: Boolean = false)
165+
extends StandardDTPatternBase(pattern, assignedDefaultTimeZone, isDefault) {
166+
167+
override val isCentury: Boolean = true
168+
}
169+
170+
private def create(pattern: String,
171+
assignedDefaultTimeZone: Option[String],
172+
isCenturyPattern: Boolean,
173+
isDefault: Boolean): DateTimePattern = {
147174
if (isEpoch(pattern)) {
148175
EpochDTPattern(pattern, isDefault)
176+
} else if (isCenturyPattern && isCentury(pattern)) {
177+
val patternWithoutCentury = pattern.replaceAll(patternCenturyChar, "yy")
178+
CenturyDTPattern(patternWithoutCentury, Some(pattern), assignedDefaultTimeZone, isDefault)
149179
} else {
150180
StandardDTPattern(pattern, assignedDefaultTimeZone, isDefault)
151181
}
152182
}
153183

154184
def apply(pattern: String,
155-
assignedDefaultTimeZone: Option[String] = None): DateTimePattern = {
156-
create(pattern, assignedDefaultTimeZone, isDefault = false)
185+
assignedDefaultTimeZone: Option[String] = None,
186+
isCenturyPattern: Boolean = false): DateTimePattern = {
187+
create(pattern, assignedDefaultTimeZone, isCenturyPattern, isDefault = false)
157188
}
158189

159190
def asDefault(pattern: String,
160191
assignedDefaultTimeZone: Option[String] = None): DateTimePattern = {
161-
create(pattern, assignedDefaultTimeZone, isDefault = true)
192+
create(pattern, assignedDefaultTimeZone, isCenturyPattern = false, isDefault = true)
162193
}
163194

164195
def isEpoch(pattern: String): Boolean = {
@@ -168,6 +199,10 @@ object DateTimePattern {
168199
}
169200
}
170201

202+
def isCentury(pattern: String): Boolean = {
203+
pattern.contains(s"${patternCenturyChar}yy")
204+
}
205+
171206
def epochFactor(pattern: String): Long = {
172207
pattern.toLowerCase match {
173208
case EpochKeyword => epochUnitFactor

src/main/scala/za/co/absa/standardization/types/TypedStructField.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,8 @@ object TypedStructField {
415415
private def readDateTimePattern: DateTimePattern = {
416416
structField.metadata.getOptString(MetadataKeys.Pattern).map { pattern =>
417417
val timeZoneOpt = structField.metadata.getOptString(MetadataKeys.DefaultTimeZone)
418-
DateTimePattern(pattern, timeZoneOpt)
418+
val isCenturyPattern = structField.metadata.getOptStringAsBoolean(MetadataKeys.IsNonStandard).getOrElse(false)
419+
DateTimePattern(pattern, timeZoneOpt, isCenturyPattern)
419420
}.getOrElse(
420421
DateTimePattern.asDefault(defaults.getStringPattern(structField.dataType), None)
421422
)

0 commit comments

Comments
 (0)