Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@
hs_err_pid*
.idea
*.iml

# Build output folder (Maven's default "target" directory, also used by IntelliJ for Maven projects)
target
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<modelVersion>4.0.0</modelVersion>

<groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId>
<artifactId>dateparser-xyzt-ai</artifactId>
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, this was a mistake.
This change was made for our local fork and should not have been included in this PR. I will revert it in this PR.

<version>1.0.11</version>

<name>dateparser</name>
Expand Down Expand Up @@ -210,4 +210,4 @@
</resources>
</build>

</project>
</project>
106 changes: 83 additions & 23 deletions src/main/java/com/github/sisyphsu/dateparser/DateParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,33 @@
*/
public final class DateParser {

private static final int MAXIMUM_NUMBER_OF_ERRORS = 10;

private final ReMatcher matcher;
private final DateBuilder dt = new DateBuilder();

private ReMatcher limitedRulesMatcher = null;
private final boolean optimizeForReuseSimilarFormatted;
private int encounteredErrorsCounter = 0;

private final List<String> rules;
private final Set<String> standardRules;
private final Map<String, RuleHandler> customizedRuleMap;

private String input;
private boolean preferMonthFirst;

DateParser(List<String> rules, Set<String> stdRules, Map<String, RuleHandler> cstRules, boolean preferMonthFirst) {
DateParser(List<String> rules, Set<String> stdRules, Map<String, RuleHandler> cstRules, boolean preferMonthFirst, boolean optimizeForReuseSimilarFormatted) {
this.rules = rules;
this.standardRules = stdRules;
this.customizedRuleMap = cstRules;
this.preferMonthFirst = preferMonthFirst;
this.matcher = new ReMatcher(this.rules.toArray(new String[0]));
this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted;
}

/**
* Create an new DateParserBuilder which could be used for initialize DateParser.
* Create a new DateParserBuilder which can be used to initialize a DateParser.
*
* @return DateParserBuilder instance
*/
Expand Down Expand Up @@ -111,20 +118,73 @@ public OffsetDateTime parseOffsetDateTime(String str) {
* Execute datetime's parsing
*/
private void parse(final CharArray input) {
// Entry point for a single parse pass. Delegates to parse(input, ReMatcher) using either the
// matcher built from all rules or, when the reuse optimization is active, a cached matcher
// built from only the rules that matched an earlier input.
matcher.reset(input);
// When the optimizeForReuseSimilarFormatted flag is set, we assume that the parser is
// used for multiple input strings in the same format:
// * Remember which rules were used to parse the first input string
// * When parsing the next string, first try with the same rules as for the first input string
// * If this succeeds, we have a performance gain
// * If this fails, increment an error counter, and parse instead with all the rules
// * If the error counter reaches a threshold, stop trying to parse input strings with the rules from the first string
// and fall back to the regular parsing code path that uses all the rules.
// The input strings were clearly not formatted in the same way
if (optimizeForReuseSimilarFormatted && encounteredErrorsCounter < MAXIMUM_NUMBER_OF_ERRORS) {
if (limitedRulesMatcher != null) {
// A reduced matcher was cached by a previous call: try it first.
try {
parse(input, limitedRulesMatcher);
} catch (DateTimeParseException e) {
// Reset the builder before retrying (presumably discards fields written by the failed attempt)
// and count the miss towards the MAXIMUM_NUMBER_OF_ERRORS threshold.
dt.reset();
encounteredErrorsCounter++;
// Parsing with our subset of rules failed, so fall back to the matcher which uses all the rules
parse(input, matcher);
}
return;
}
// No reduced matcher yet: find the rules that are needed to parse the input,
// and create a matcher with that subset of rules
matcher.reset(input);
int offset = 0;
int oldEnd = -1;
List<String> reducedAllRules = new ArrayList<>();
while (matcher.find(offset)) {
if (oldEnd == matcher.end()) {
// The match did not advance past the previous end (empty match). Disable the optimization
// for this parser by saturating the error counter, then let the regular path parse the input
// (the two-argument parse raises an error on this same condition).
encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS;
parse(input, matcher);
return;
}
// Record the rule that produced this match, in match order.
// NOTE(review): the same rule may be added more than once if it matches several times — confirm
// that ReMatcher tolerates duplicate rules.
String usedRule = matcher.re();
reducedAllRules.add(usedRule);
offset = matcher.end();
oldEnd = offset;
}
if (offset != input.length()) {
// The rules did not consume the whole input, so no usable rule subset was found.
// Disable the optimization and let the regular path handle the input.
encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS;
parse(input, matcher);
return;
}
// At this point the whole input was matched, meaning we found the relevant rules.
// Store a matcher restricted to those rules for the next call.
limitedRulesMatcher = new ReMatcher(reducedAllRules.toArray(new String[0]));

// The discovery loop above only collected rule names; the actual parsing of this
// input is still done with the full matcher.
parse(input, matcher);
} else {
// Optimization disabled (by flag or by too many errors): regular path with all rules.
parse(input, matcher);
}
}

private void parse(final CharArray input, ReMatcher m) throws DateTimeParseException{
m.reset(input);
int offset = 0;
int oldEnd = -1;
while (matcher.find(offset)) {
if (oldEnd == matcher.end()) {
while (m.find(offset)) {
if (oldEnd == m.end()) {
throw error(offset, "empty matching at " + offset);
}
if (standardRules.contains(matcher.re())) {
this.parseStandard(input, offset);
if (standardRules.contains(m.re())) {
this.parseStandard(input, offset, m);
} else {
RuleHandler handler = customizedRuleMap.get(matcher.re());
handler.handle(input, matcher, dt);
RuleHandler handler = customizedRuleMap.get(m.re());
handler.handle(input, m, dt);
}
offset = matcher.end();
offset = m.end();
oldEnd = offset;
}
if (offset != input.length()) {
Expand All @@ -135,13 +195,13 @@ private void parse(final CharArray input) {
/**
* Parse datetime use standard rules.
*/
void parseStandard(CharArray input, int offset) {
for (int index = 1; index <= matcher.groupCount(); index++) {
final String groupName = matcher.groupName(index);
final int startOff = matcher.start(index);
final int endOff = matcher.end(index);
void parseStandard(CharArray input, int offset, ReMatcher m) {
for (int index = 1; index <= m.groupCount(); index++) {
final String groupName = m.groupName(index);
final int startOff = m.start(index);
final int endOff = m.end(index);
if (groupName == null) {
throw error(offset, "Hit invalid standard rule: " + matcher.re());
throw error(offset, "Hit invalid standard rule: " + m.re());
}
if (startOff == -1 && endOff == -1) {
continue;
Expand Down Expand Up @@ -226,13 +286,13 @@ void parseStandard(CharArray input, int offset) {
dt.ns = parseNum(input, endOff - 9, endOff);
break;
default:
throw error(offset, "Hit invalid standard rule: " + matcher.re());
throw error(offset, "Hit invalid standard rule: " + m.re());
}
}
}

/**
* Parse an subsequence which represent dd/mm or mm/dd, it should be more smart for different locales.
* Parse a subsequence which represents dd/mm or mm/dd; it should be smarter for different locales.
*/
void parseDayOrMonth(CharArray input, int from, int to) {
char next = input.data[from + 1];
Expand All @@ -257,7 +317,7 @@ void parseDayOrMonth(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which represent year, like '2019', '19' etc
* Parse a subsequence which represents a year, like '2019', '19' etc.
*/
int parseYear(CharArray input, int from, int to) {
switch (to - from) {
Expand All @@ -274,7 +334,7 @@ int parseYear(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which represent the offset of timezone, like '+0800', '+08', '+8:00', '+08:00' etc
* Parse a subsequence which represents the offset of a timezone, like '+0800', '+08', '+8:00', '+08:00' etc.
*/
int parseZoneOffset(CharArray input, int from, int to) {
boolean neg = input.data[from] == '-';
Expand All @@ -301,7 +361,7 @@ int parseZoneOffset(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which suffix second, like '.2000', '.3186369', '.257000000' etc
* Parse a subsequence which is the fractional suffix of the seconds, like '.2000', '.3186369', '.257000000' etc.
* It should be treated as ms/us/ns.
*/
int parseNano(CharArray input, int from, int to) {
Expand All @@ -314,7 +374,7 @@ int parseNano(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which represent week, like 'Monday', 'mon' etc
* Parse a subsequence which represents a day of the week, like 'Monday', 'mon' etc.
*/
int parseWeek(CharArray input, int from) {
switch (input.data[from]) {
Expand Down Expand Up @@ -345,7 +405,7 @@ int parseWeek(CharArray input, int from) {
}

/**
* Parse an subsequence which represent month, like '12', 'Feb' etc
* Parse a subsequence which represents a month, like '12', 'Feb' etc.
*/
int parseMonth(CharArray input, int from, int to) {
if (to - from <= 2) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ static synchronized void register(String re, RuleHandler handler) {
}

private boolean preferMonthFirst = false;
private boolean optimizeForReuseSimilarFormatted = false;

private final List<String> rules = new ArrayList<>();
private final Set<String> standardRules = new HashSet<>();
private final Map<String, RuleHandler> customizedRuleMap = new HashMap<>();
Expand All @@ -158,6 +160,18 @@ static synchronized void register(String re, RuleHandler handler) {
this.customizedRuleMap.putAll(DateParserBuilder.CUSTOMIZED_RULE_MAP);
}

/**
 * Enables or disables the optimization for parsers that will parse many date strings
 * which all share the same format, e.g. the timestamp column of a large CSV file.
 *
 * @param optimizeForReuseSimilarFormatted {@code true} to create a parser optimized for parsing
 *                                         many date strings in the same format
 * @return this builder, to allow call chaining
 */
public DateParserBuilder optimizeForReuseSimilarFormatted(boolean optimizeForReuseSimilarFormatted) {
    this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted;
    return this;
}

/**
* Mark this parser prefer mm/dd or not.
*
Expand Down Expand Up @@ -204,7 +218,7 @@ public DateParserBuilder addRule(String rule, RuleHandler handler) {
* @return DateParser
*/
public DateParser build() {
return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst);
return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst, optimizeForReuseSimilarFormatted);
}

}
30 changes: 30 additions & 0 deletions src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@

import java.time.*;
import java.util.Date;
import java.util.Random;
import java.util.TimeZone;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
* @author sulin
* @since 2019-09-14 16:48:50
Expand Down Expand Up @@ -127,4 +130,31 @@ public void testTimestamp() {
assert date.getTime() == Long.valueOf(timestamp);
}

/**
 * Checks that a parser built with {@code optimizeForReuseSimilarFormatted(true)} produces the same
 * dates as a regular parser, both for a large batch of identically formatted inputs and for a
 * subsequent input in a different format.
 */
@Test
public void testOptimizeForReuseSimilarFormatted() {
    // Fixed seed so the generated inputs are reproducible between runs.
    Random random = new Random(123456789L);
    String[] inputs = new String[500000];
    for (int i = 0; i < inputs.length; i++) {
        inputs[i] = String.format("2020-0%d-1%d 00:%d%d:00 UTC",
                random.nextInt(8) + 1,
                random.nextInt(8) + 1,
                random.nextInt(5),
                random.nextInt(9));
    }
    DateParser regular = DateParser.newBuilder().build();
    DateParser optimized = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build();

    for (String input : inputs) {
        // The message is supplied lazily so it is only built when an assertion actually fails.
        assertEquals(regular.parseDate(input), optimized.parseDate(input),
                () -> "Parsers disagree for input: " + input);
    }

    // Now check if the parser can still deal with a date in a different format
    String inputInDifferentFormat = String.format("1%d/0%d/2020 00:%d%d:00 UTC",
            random.nextInt(8) + 1,
            random.nextInt(8) + 1,
            random.nextInt(5),
            random.nextInt(9));
    assertEquals(regular.parseDate(inputInDifferentFormat), optimized.parseDate(inputInDifferentFormat),
            () -> "Parsers disagree for input: " + inputInDifferentFormat);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package com.github.sisyphsu.dateparser.benchmark;

import com.github.sisyphsu.dateparser.DateParser;
import org.openjdk.jmh.annotations.*;

import java.util.Random;
import java.util.concurrent.TimeUnit;

/**
 * JMH benchmark comparing a regular {@link DateParser} with one built using
 * {@code optimizeForReuseSimilarFormatted(true)} on a large batch of identically formatted
 * date strings.
 */
@Warmup(iterations = 2, time = 2)
@BenchmarkMode(Mode.AverageTime)
@Fork(2)
@Measurement(iterations = 3, time = 3)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class OptimizeForReuseSimilarFormattedBenchmark {
    /** Fixed seed so every fork benchmarks exactly the same inputs. */
    private static final long SEED = 123456789L;
    /** Number of date strings parsed per benchmark invocation. */
    private static final int TEXT_COUNT = 500000;

    /** Pre-generated inputs, all in the format {@code yyyy-MM-dd HH:mm:ss UTC}. */
    private static final String[] TEXTS;

    static {
        Random random = new Random(SEED);
        TEXTS = new String[TEXT_COUNT];
        for (int i = 0; i < TEXTS.length; i++) {
            TEXTS[i] = String.format("2020-0%d-1%d 00:%d%d:00 UTC",
                    random.nextInt(8) + 1,
                    random.nextInt(8) + 1,
                    random.nextInt(5),
                    random.nextInt(9));
        }
    }

    /** Baseline: parses all inputs with a parser that always uses the full rule set. */
    @Benchmark
    public void regularParser() {
        // A fresh parser per invocation, so builder/parser construction is part of the measured time.
        DateParser parser = DateParser.newBuilder().build();
        for (String text : TEXTS) {
            parser.parseDate(text);
        }
    }

    /** Parses all inputs with a parser built with the reuse optimization enabled. */
    @Benchmark
    public void optimizedForReuseParser() {
        // A fresh parser per invocation, so the optimized parser starts without any cached state.
        DateParser parser = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build();
        for (String text : TEXTS) {
            parser.parseDate(text);
        }
    }
}