Skip to content

Commit d6a708c

Browse files
committed
Add analysis support for the month, dayOfYear, dayOfMonth, and year fields so that date info does not have to be re-parsed later. Also move temporal analysis into a separate class.
1 parent 4a60ffe commit d6a708c

File tree

8 files changed

+654
-389
lines changed

8 files changed

+654
-389
lines changed

schemas-analyze/src/main/java/org/cedar/schemas/analyze/Analyzers.java

Lines changed: 2 additions & 292 deletions
Original file line numberDiff line numberDiff line change
@@ -5,39 +5,15 @@
55
import org.slf4j.Logger;
66
import org.slf4j.LoggerFactory;
77

8-
import java.time.*;
9-
import java.time.format.DateTimeFormatter;
10-
import java.time.format.DateTimeFormatterBuilder;
11-
import java.time.format.DateTimeParseException;
12-
import java.time.format.ResolverStyle;
13-
import java.time.temporal.ChronoField;
14-
import java.time.temporal.ChronoUnit;
15-
import java.time.temporal.TemporalAccessor;
16-
import java.time.temporal.TemporalQueries;
178
import java.util.ArrayList;
189
import java.util.List;
1910

2011
import static org.cedar.schemas.avro.psi.TimeRangeDescriptor.*;
12+
import org.cedar.schemas.analyze.DateInfo;
2113

2214
public class Analyzers {
2315
private static final Logger log = LoggerFactory.getLogger(Analyzers.class);
2416

25-
public static final DateTimeFormatter PARSE_DATE_FORMATTER = new DateTimeFormatterBuilder()
26-
.appendOptional(DateTimeFormatter.ISO_ZONED_DATE_TIME) // e.g. 2010-12-30T00:00:00Z
27-
.appendOptional(DateTimeFormatter.ISO_LOCAL_DATE_TIME) // e.g. 2010-12-30T00:00:00
28-
.appendOptional(DateTimeFormatter.ISO_LOCAL_DATE) // e.g. 2010-12-30
29-
.appendOptional(new DateTimeFormatterBuilder()
30-
.appendValue(ChronoField.YEAR) // e.g. -200
31-
.optionalStart()
32-
.appendPattern("-MM") // e.g. -200-10
33-
.optionalEnd()
34-
.optionalStart()
35-
.appendPattern("-dd") // e.g. -200-01-01
36-
.optionalEnd()
37-
.toFormatter())
38-
.toFormatter()
39-
.withResolverStyle(ResolverStyle.STRICT);
40-
4117
public static ParsedRecord addAnalysis(ParsedRecord record) {
4218
if (record == null) {
4319
return null; // pass through
@@ -65,7 +41,7 @@ public static Analysis analyze(Discovery discovery) {
6541
}
6642
return Analysis.newBuilder()
6743
.setIdentification(analyzeIdentifiers(discovery))
68-
.setTemporalBounding(analyzeTemporalBounding(discovery))
44+
.setTemporalBounding(Temporal.analyzeBounding(discovery))
6945
.setSpatialBounding(analyzeSpatialBounding(discovery))
7046
.setTitles(analyzeTitles(discovery))
7147
.setDescription(analyzeDescription(discovery))
@@ -95,41 +71,6 @@ public static IdentificationAnalysis analyzeIdentifiers(Discovery metadata) {
9571
.build();
9672
}
9773

98-
public static TemporalBoundingAnalysis analyzeTemporalBounding(Discovery metadata) {
99-
TemporalBoundingAnalysis.Builder builder = TemporalBoundingAnalysis.newBuilder();
100-
101-
if (metadata != null && metadata.getTemporalBounding() != null) {
102-
// Gather info
103-
DateInfo beginInfo = new DateInfo(metadata.getTemporalBounding().getBeginDate(), true);
104-
DateInfo endInfo = new DateInfo(metadata.getTemporalBounding().getEndDate(), false);
105-
DateInfo instantInfo = new DateInfo(metadata.getTemporalBounding().getInstant(), true);
106-
TimeRangeDescriptor rangeDescriptor = rangeDescriptor(beginInfo, endInfo, instantInfo);
107-
108-
// Build
109-
builder.setBeginDescriptor(beginInfo.descriptor);
110-
builder.setBeginPrecision(beginInfo.precision);
111-
builder.setBeginIndexable(beginInfo.indexable);
112-
builder.setBeginZoneSpecified(beginInfo.zoneSpecified);
113-
builder.setBeginUtcDateTimeString(beginInfo.utcDateTimeString);
114-
115-
builder.setEndDescriptor(endInfo.descriptor);
116-
builder.setEndPrecision(endInfo.precision);
117-
builder.setEndIndexable(endInfo.indexable);
118-
builder.setEndZoneSpecified(endInfo.zoneSpecified);
119-
builder.setEndUtcDateTimeString(endInfo.utcDateTimeString);
120-
121-
builder.setInstantDescriptor(instantInfo.descriptor);
122-
builder.setInstantPrecision(instantInfo.precision);
123-
builder.setInstantIndexable(instantInfo.indexable);
124-
builder.setInstantZoneSpecified(instantInfo.zoneSpecified);
125-
builder.setInstantUtcDateTimeString(instantInfo.utcDateTimeString);
126-
127-
builder.setRangeDescriptor(rangeDescriptor);
128-
}
129-
130-
return builder.build();
131-
}
132-
13374
public static SpatialBoundingAnalysis analyzeSpatialBounding(Discovery metadata) {
13475
SpatialBoundingAnalysis.Builder builder = SpatialBoundingAnalysis.newBuilder();
13576
if (metadata != null) {
@@ -204,235 +145,4 @@ public StringInfo(String input) {
204145
}
205146
}
206147

207-
static class DateInfo implements Comparable<DateInfo> {
208-
public final ValidDescriptor descriptor;
209-
public final String precision;
210-
public final boolean indexable;
211-
public final String zoneSpecified;
212-
public final String utcDateTimeString;
213-
214-
public DateInfo(String dateString, boolean start) {
215-
if (dateString == null || dateString.length() == 0) {
216-
descriptor = ValidDescriptor.UNDEFINED;
217-
precision = null;
218-
indexable = true;
219-
zoneSpecified = null;
220-
utcDateTimeString = null;
221-
return;
222-
}
223-
224-
Long longDate = parseLong(dateString);
225-
TemporalAccessor parsedDate = parseDate(dateString);
226-
if (longDate != null && !indexable(longDate)) {
227-
descriptor = ValidDescriptor.VALID;
228-
precision = precision(longDate);
229-
indexable = indexable(longDate);
230-
zoneSpecified = timezone(longDate);
231-
utcDateTimeString = utcDateTimeString(longDate, start);
232-
}
233-
else if (parsedDate != null) {
234-
descriptor = ValidDescriptor.VALID;
235-
precision = precision(parsedDate);
236-
indexable = indexable(parsedDate);
237-
zoneSpecified = timezone(parsedDate);
238-
utcDateTimeString = utcDateTimeString(parsedDate, start);
239-
}
240-
else {
241-
descriptor = ValidDescriptor.INVALID;
242-
precision = null;
243-
indexable = false;
244-
zoneSpecified = null;
245-
utcDateTimeString = null;
246-
}
247-
}
248-
249-
private static Long parseLong(String number) {
250-
try {
251-
return Long.parseLong(number);
252-
} catch (Exception e) {
253-
return null;
254-
}
255-
}
256-
257-
private static TemporalAccessor parseDate(String date) {
258-
try {
259-
return PARSE_DATE_FORMATTER.parseBest(
260-
date,
261-
ZonedDateTime::from,
262-
LocalDateTime::from,
263-
LocalDate::from,
264-
YearMonth::from,
265-
Year::from);
266-
} catch (Exception e) {
267-
return null;
268-
}
269-
}
270-
271-
@Override
272-
public int compareTo(DateInfo o) {
273-
boolean thisIndexable = this.indexable;
274-
boolean oIndexable = o.indexable;
275-
boolean thisIsYears = this.precision.equals(ChronoUnit.YEARS.toString());
276-
boolean oIsYears = o.precision.equals(ChronoUnit.YEARS.toString());
277-
278-
if (thisIndexable && oIndexable) {
279-
// Compare actual dates with UTC string
280-
ZonedDateTime thisDate = ZonedDateTime.parse(this.utcDateTimeString);
281-
ZonedDateTime oDate = ZonedDateTime.parse(o.utcDateTimeString);
282-
if (thisDate.isEqual(oDate)) {
283-
return 0;
284-
} else {
285-
return thisDate.isBefore(oDate) ? -1 : 1;
286-
}
287-
}
288-
else if ((thisIsYears && oIsYears) || (thisIsYears && oIndexable) || (thisIndexable && oIsYears)) {
289-
// Compare years only as longs; parse both as string objects since both may not be just a long.
290-
// Watch out for negative years...
291-
String thisYearText = this.utcDateTimeString.substring(0, this.utcDateTimeString.indexOf('-', 1));
292-
String oYearText = o.utcDateTimeString.substring(0, o.utcDateTimeString.indexOf('-', 1));
293-
Long thisYear = Long.parseLong(thisYearText);
294-
Long oYear = Long.parseLong(oYearText);
295-
if (thisYear == oYear) {
296-
return 0;
297-
} else {
298-
return thisYear < oYear ? -1 : 1;
299-
}
300-
}
301-
else {
302-
// One or both have an INVALID search format that is not just due to a paleo year
303-
throw new DateTimeException("One or both dates being compared have an INVALID format.");
304-
}
305-
}
306-
}
307-
308-
static boolean indexable(Long year) {
309-
// Year must be in the range [-292_275_055, 292_278_994] in order to be parsed as a date by ES (Joda time magic number). However,
310-
// this number is a bit arbitrary, and prone to change when ES switches to the Java time library (minimum supported year
311-
// being -999,999,999). We will limit the year ourselves instead to -100,000,000 -- since this is a fairly safe bet for
312-
// supportability across many date libraries if the utcDateTime ends up used as is by a downstream app.
313-
return year >= -100_000_000L;
314-
}
315-
316-
static boolean indexable(TemporalAccessor date) {
317-
return true; // if it's a parsable accessor, it's indexable
318-
}
319-
320-
static String precision(Long year) {
321-
return ChronoUnit.YEARS.toString();
322-
}
323-
324-
static String precision(TemporalAccessor date) {
325-
if (date == null) {
326-
return null;
327-
}
328-
return date.query(TemporalQueries.precision()).toString();
329-
}
330-
331-
static String timezone(Object date) {
332-
return date instanceof ZonedDateTime ? ((ZonedDateTime) date).getOffset().toString() : null;
333-
}
334-
335-
static String utcDateTimeString(TemporalAccessor parsedDate, boolean start) {
336-
if (parsedDate == null) {
337-
return null;
338-
}
339-
340-
if (parsedDate instanceof Year) {
341-
LocalDateTime yearDate = start ?
342-
((Year) parsedDate).atMonth(1).atDay(1).atStartOfDay() :
343-
((Year) parsedDate).atMonth(12).atEndOfMonth().atTime(23, 59, 59, 999000000);
344-
return DateTimeFormatter.ISO_ZONED_DATE_TIME.format(yearDate.atZone(ZoneOffset.UTC));
345-
}
346-
if (parsedDate instanceof YearMonth) {
347-
LocalDateTime yearMonthDate = start ?
348-
((YearMonth) parsedDate).atDay(1).atStartOfDay() :
349-
((YearMonth) parsedDate).atEndOfMonth().atTime(23, 59, 59, 999000000);
350-
return DateTimeFormatter.ISO_ZONED_DATE_TIME.format((yearMonthDate.atZone(ZoneOffset.UTC)));
351-
}
352-
if (parsedDate instanceof LocalDate) {
353-
LocalDateTime localDate = start ?
354-
((LocalDate) parsedDate).atStartOfDay() :
355-
((LocalDate) parsedDate).atTime(23, 59, 59, 999000000);
356-
return DateTimeFormatter.ISO_ZONED_DATE_TIME.format(localDate.atZone(ZoneOffset.UTC));
357-
}
358-
if (parsedDate instanceof LocalDateTime) {
359-
return DateTimeFormatter.ISO_ZONED_DATE_TIME.format(((LocalDateTime) parsedDate).atZone(ZoneOffset.UTC));
360-
}
361-
if (parsedDate instanceof ZonedDateTime) {
362-
return DateTimeFormatter.ISO_ZONED_DATE_TIME.format(((ZonedDateTime) parsedDate).withZoneSameInstant(ZoneOffset.UTC));
363-
}
364-
365-
return null;
366-
}
367-
368-
static String utcDateTimeString(Long year, boolean start) {
369-
return start ? year.toString() + "-01-01T00:00:00Z" : year.toString() + "-12-31T23:59:59.999Z";
370-
}
371-
372-
static TimeRangeDescriptor rangeDescriptor(DateInfo beginInfo, DateInfo endInfo, DateInfo instantInfo) {
373-
ValidDescriptor begin = beginInfo.descriptor;
374-
ValidDescriptor end = endInfo.descriptor;
375-
ValidDescriptor instant = instantInfo.descriptor;
376-
377-
// A time range cannot be described because an error exists in one or more of the dates:
378-
if(begin == ValidDescriptor.INVALID ||
379-
end == ValidDescriptor.INVALID ||
380-
instant == ValidDescriptor.INVALID) {
381-
return NOT_APPLICABLE;
382-
}
383-
384-
// Dates are all undefined so range is undefined:
385-
if (begin == ValidDescriptor.UNDEFINED &&
386-
end == ValidDescriptor.UNDEFINED &&
387-
instant == ValidDescriptor.UNDEFINED) {
388-
return UNDEFINED;
389-
}
390-
391-
// If begin is valid but end is undefined, this indicates an ongoing range:
392-
if (begin == ValidDescriptor.VALID &&
393-
end == ValidDescriptor.UNDEFINED &&
394-
instant == ValidDescriptor.UNDEFINED) {
395-
return ONGOING;
396-
}
397-
398-
// Valid instant is straightforward:
399-
if (begin == ValidDescriptor.UNDEFINED &&
400-
end == ValidDescriptor.UNDEFINED &&
401-
instant == ValidDescriptor.VALID) {
402-
return INSTANT;
403-
}
404-
405-
// Dates describe more than one valid range descriptor, which is ambiguous:
406-
if ( ( begin == ValidDescriptor.VALID && end == ValidDescriptor.VALID && instant == ValidDescriptor.VALID ) ||
407-
( begin == ValidDescriptor.VALID && end == ValidDescriptor.UNDEFINED && instant == ValidDescriptor.VALID ) ) {
408-
return AMBIGUOUS;
409-
}
410-
411-
// Begin and end dates are independently valid but based on how they compare to each other can describe very
412-
// different range types:
413-
if (begin == ValidDescriptor.VALID &&
414-
end == ValidDescriptor.VALID &&
415-
instant == ValidDescriptor.UNDEFINED) {
416-
try {
417-
int comparator = beginInfo.compareTo(endInfo);
418-
TimeRangeDescriptor descriptor;
419-
switch (comparator) {
420-
case -1: descriptor = BOUNDED;
421-
break;
422-
case 0: descriptor = INSTANT;
423-
break;
424-
case 1: descriptor = BACKWARDS;
425-
break;
426-
default: descriptor = INVALID;
427-
break;
428-
}
429-
return descriptor;
430-
} catch(DateTimeException e) {
431-
return INVALID;
432-
}
433-
}
434-
435-
// Covers undefined begin date with valid end date which is meaningless, regardless of presence of an instant date
436-
return INVALID;
437-
}
438148
}

0 commit comments

Comments
 (0)