Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@
hs_err_pid*
.idea
*.iml

# Build output folder (Maven `target` directory, also used by IntelliJ for Maven projects)
target
100 changes: 77 additions & 23 deletions src/main/java/com/github/sisyphsu/dateparser/DateParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,33 @@
*/
public final class DateParser {

private static final int MAXIMUM_NUMBER_OF_ERRORS = 10;

private final ReMatcher matcher;
private final DateBuilder dt = new DateBuilder();

private ReMatcher limitedRulesMatcher = null;
private final boolean optimizeForReuseSimilarFormatted;
private int encounteredErrorsCounter = 0;

private final List<String> rules;
private final Set<String> standardRules;
private final Map<String, RuleHandler> customizedRuleMap;

private String input;
private boolean preferMonthFirst;

DateParser(List<String> rules, Set<String> stdRules, Map<String, RuleHandler> cstRules, boolean preferMonthFirst) {
DateParser(List<String> rules, Set<String> stdRules, Map<String, RuleHandler> cstRules, boolean preferMonthFirst, boolean optimizeForReuseSimilarFormatted) {
this.rules = rules;
this.standardRules = stdRules;
this.customizedRuleMap = cstRules;
this.preferMonthFirst = preferMonthFirst;
this.matcher = new ReMatcher(this.rules.toArray(new String[0]));
this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted;
}

/**
* Create an new DateParserBuilder which could be used for initialize DateParser.
* Create a new DateParserBuilder which can be used to initialize a DateParser.
*
* @return DateParserBuilder instance
*/
Expand Down Expand Up @@ -111,20 +118,67 @@ public OffsetDateTime parseOffsetDateTime(String str) {
* Execute datetime's parsing
*/
private void parse(final CharArray input) {
    // Entry point of the parsing. When the parser was built with optimizeForReuseSimilarFormatted,
    // it first tries a matcher that only knows the rules needed by an earlier input; the full
    // matcher is always the fallback. Each matcher is reset by the code that actually uses it,
    // so no reset is needed here.
    final boolean optimizing = optimizeForReuseSimilarFormatted
            && encounteredErrorsCounter < MAXIMUM_NUMBER_OF_ERRORS;
    if (optimizing) {
        if (limitedRulesMatcher == null) {
            // No reduced matcher yet: try to derive one from this input.
            if (!learnLimitedRules(input)) {
                // The input could not be fully tokenized; setting the counter to its maximum
                // switches the optimization off for all following calls.
                encounteredErrorsCounter = MAXIMUM_NUMBER_OF_ERRORS;
            }
        } else {
            // See if we can parse the input using the matcher which uses only a subset of the rules
            try {
                parse(input, limitedRulesMatcher);
                return;
            } catch (DateTimeParseException ignored) {
                // Deliberately not propagated: the reduced rule set did not fit this input.
                // Drop the partially filled builder state, count the miss and retry below
                // with the complete rule set.
                dt.reset();
                encounteredErrorsCounter++;
            }
        }
    }
    parse(input, matcher);
}

/**
 * Walk over the input with the full matcher and remember every rule that was hit, then store a
 * matcher built from only those rules in {@code limitedRulesMatcher} for the next calls.
 *
 * @param input the text to tokenize
 * @return {@code true} if the whole input was matched and the reduced matcher was created,
 *         {@code false} if an empty match occurred or the input was not consumed completely
 */
private boolean learnLimitedRules(final CharArray input) {
    matcher.reset(input);
    final List<String> usedRules = new ArrayList<>();
    int offset = 0;
    int oldEnd = -1;
    while (matcher.find(offset)) {
        if (oldEnd == matcher.end()) {
            // Empty match: the matcher made no progress since the previous hit.
            return false;
        }
        // Standard and customized rules are collected alike; only the rule text matters here.
        usedRules.add(matcher.re());
        offset = matcher.end();
        oldEnd = offset;
    }
    if (offset != input.length()) {
        // Some trailing part of the input was not matched by any rule.
        return false;
    }
    // At this point the input could be tokenized completely, so the relevant rules are known.
    limitedRulesMatcher = new ReMatcher(usedRules.toArray(new String[0]));
    return true;
}

private void parse(final CharArray input, ReMatcher m) throws DateTimeParseException{
m.reset(input);
int offset = 0;
int oldEnd = -1;
while (matcher.find(offset)) {
if (oldEnd == matcher.end()) {
while (m.find(offset)) {
if (oldEnd == m.end()) {
throw error(offset, "empty matching at " + offset);
}
if (standardRules.contains(matcher.re())) {
this.parseStandard(input, offset);
if (standardRules.contains(m.re())) {
this.parseStandard(input, offset, m);
} else {
RuleHandler handler = customizedRuleMap.get(matcher.re());
handler.handle(input, matcher, dt);
RuleHandler handler = customizedRuleMap.get(m.re());
handler.handle(input, m, dt);
}
offset = matcher.end();
offset = m.end();
oldEnd = offset;
}
if (offset != input.length()) {
Expand All @@ -135,13 +189,13 @@ private void parse(final CharArray input) {
/**
* Parse datetime use standard rules.
*/
void parseStandard(CharArray input, int offset) {
for (int index = 1; index <= matcher.groupCount(); index++) {
final String groupName = matcher.groupName(index);
final int startOff = matcher.start(index);
final int endOff = matcher.end(index);
void parseStandard(CharArray input, int offset, ReMatcher m) {
for (int index = 1; index <= m.groupCount(); index++) {
final String groupName = m.groupName(index);
final int startOff = m.start(index);
final int endOff = m.end(index);
if (groupName == null) {
throw error(offset, "Hit invalid standard rule: " + matcher.re());
throw error(offset, "Hit invalid standard rule: " + m.re());
}
if (startOff == -1 && endOff == -1) {
continue;
Expand Down Expand Up @@ -226,13 +280,13 @@ void parseStandard(CharArray input, int offset) {
dt.ns = parseNum(input, endOff - 9, endOff);
break;
default:
throw error(offset, "Hit invalid standard rule: " + matcher.re());
throw error(offset, "Hit invalid standard rule: " + m.re());
}
}
}

/**
* Parse an subsequence which represent dd/mm or mm/dd, it should be more smart for different locales.
* Parse a subsequence which represents dd/mm or mm/dd; it should be smarter about different locales.
*/
void parseDayOrMonth(CharArray input, int from, int to) {
char next = input.data[from + 1];
Expand All @@ -257,7 +311,7 @@ void parseDayOrMonth(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which represent year, like '2019', '19' etc
* Parse a subsequence which represents a year, like '2019', '19' etc
*/
int parseYear(CharArray input, int from, int to) {
switch (to - from) {
Expand All @@ -274,7 +328,7 @@ int parseYear(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which represent the offset of timezone, like '+0800', '+08', '+8:00', '+08:00' etc
* Parse a subsequence which represents the offset of a timezone, like '+0800', '+08', '+8:00', '+08:00' etc
*/
int parseZoneOffset(CharArray input, int from, int to) {
boolean neg = input.data[from] == '-';
Expand All @@ -301,7 +355,7 @@ int parseZoneOffset(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which suffix second, like '.2000', '.3186369', '.257000000' etc
* Parse a subsequence which is the fractional suffix of the seconds, like '.2000', '.3186369', '.257000000' etc
* It should be treated as ms/us/ns.
*/
int parseNano(CharArray input, int from, int to) {
Expand All @@ -314,7 +368,7 @@ int parseNano(CharArray input, int from, int to) {
}

/**
* Parse an subsequence which represent week, like 'Monday', 'mon' etc
* Parse a subsequence which represents a day of the week, like 'Monday', 'mon' etc
*/
int parseWeek(CharArray input, int from) {
switch (input.data[from]) {
Expand Down Expand Up @@ -345,7 +399,7 @@ int parseWeek(CharArray input, int from) {
}

/**
* Parse an subsequence which represent month, like '12', 'Feb' etc
* Parse a subsequence which represents a month, like '12', 'Feb' etc
*/
int parseMonth(CharArray input, int from, int to) {
if (to - from <= 2) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ static synchronized void register(String re, RuleHandler handler) {
}

private boolean preferMonthFirst = false;
private boolean optimizeForReuseSimilarFormatted = false;

private final List<String> rules = new ArrayList<>();
private final Set<String> standardRules = new HashSet<>();
private final Map<String, RuleHandler> customizedRuleMap = new HashMap<>();
Expand All @@ -158,6 +160,18 @@ static synchronized void register(String re, RuleHandler handler) {
this.customizedRuleMap.putAll(DateParserBuilder.CUSTOMIZED_RULE_MAP);
}

/**
 * Switch the "similar format" optimization of the built parser on or off.
 * <p>
 * Enable it when one parser instance will be fed many date strings that all share the same format,
 * for example the timestamp column of a large CSV file.
 *
 * @param optimizeForReuseSimilarFormatted {@code true} to build a parser optimized for parsing many
 *                                         date strings in the same format
 * @return this builder
 */
public DateParserBuilder optimizeForReuseSimilarFormatted(boolean optimizeForReuseSimilarFormatted) {
    this.optimizeForReuseSimilarFormatted = optimizeForReuseSimilarFormatted;
    return this;
}

/**
* Mark this parser prefer mm/dd or not.
*
Expand Down Expand Up @@ -204,7 +218,7 @@ public DateParserBuilder addRule(String rule, RuleHandler handler) {
* @return DateParser
*/
public DateParser build() {
return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst);
return new DateParser(rules, standardRules, customizedRuleMap, preferMonthFirst, optimizeForReuseSimilarFormatted);
}

}
22 changes: 22 additions & 0 deletions src/test/java/com/github/sisyphsu/dateparser/DateBuilderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@

import java.time.*;
import java.util.Date;
import java.util.Random;
import java.util.TimeZone;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
* @author sulin
* @since 2019-09-14 16:48:50
Expand Down Expand Up @@ -127,4 +130,23 @@ public void testTimestamp() {
assert date.getTime() == Long.valueOf(timestamp);
}

/**
 * A parser built with optimizeForReuseSimilarFormatted(true) must return the same dates as a
 * regular parser when both are fed a long series of inputs in one format.
 */
@Test
public void testOptimizeForReuseSimilarFormatted() {
    // Fixed seed keeps the generated inputs identical on every run.
    Random random = new Random(123456789L);
    String[] inputs = new String[500000];
    for (int i = 0; i < inputs.length; i++) {
        // Month 01-08, day 11-18, minutes 00-48; every string has the same layout.
        inputs[i] = String.format("2020-0%d-1%d 00:%d%d:00 UTC",
                random.nextInt(8) + 1,
                random.nextInt(8) + 1,
                random.nextInt(5),
                random.nextInt(9));
    }
    DateParser regular = DateParser.newBuilder().build();
    DateParser optimized = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build();

    for (String input : inputs) {
        // The input is passed as the failure message so a mismatch names the offending string.
        assertEquals(regular.parseDate(input), optimized.parseDate(input), input);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package com.github.sisyphsu.dateparser.benchmark;

import com.github.sisyphsu.dateparser.DateParser;
import org.openjdk.jmh.annotations.*;

import java.util.Random;
import java.util.concurrent.TimeUnit;

/**
 * Compares a regular {@link DateParser} with one built with
 * {@code optimizeForReuseSimilarFormatted(true)} on a large batch of inputs that share one format.
 */
@Warmup(iterations = 2, time = 2)
@BenchmarkMode(Mode.AverageTime)
@Fork(2)
@Measurement(iterations = 3, time = 3)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class OptimizeForReuseSimilarFormattedBenchmark {

    /** Fixed seed so every fork and iteration works on the same inputs. */
    private static final long SEED = 123456789L;

    /** Number of date strings parsed per benchmark invocation. */
    private static final int TEXT_COUNT = 500000;

    private static final String[] TEXTS = generateTexts();

    /**
     * Build the benchmark inputs: pseudo-random timestamps that all use the same layout
     * (month 01-08, day 11-18, minutes 00-48).
     *
     * @return the generated date strings
     */
    private static String[] generateTexts() {
        Random random = new Random(SEED);
        String[] texts = new String[TEXT_COUNT];
        for (int i = 0; i < texts.length; i++) {
            texts[i] = String.format("2020-0%d-1%d 00:%d%d:00 UTC",
                    random.nextInt(8) + 1,
                    random.nextInt(8) + 1,
                    random.nextInt(5),
                    random.nextInt(9));
        }
        return texts;
    }

    /** Parse all inputs with a parser built with default settings. */
    @Benchmark
    public void regularParser() {
        DateParser parser = DateParser.newBuilder().build();
        for (String text : TEXTS) {
            parser.parseDate(text);
        }
    }

    /** Parse all inputs with a parser built with the similar-format optimization enabled. */
    @Benchmark
    public void optimizedForReuseParser() {
        DateParser parser = DateParser.newBuilder().optimizeForReuseSimilarFormatted(true).build();
        for (String text : TEXTS) {
            parser.parseDate(text);
        }
    }
}