|
5 | 5 | import org.slf4j.Logger; |
6 | 6 | import org.slf4j.LoggerFactory; |
7 | 7 |
|
8 | | -import java.time.*; |
9 | | -import java.time.format.DateTimeFormatter; |
10 | | -import java.time.format.DateTimeFormatterBuilder; |
11 | | -import java.time.format.DateTimeParseException; |
12 | | -import java.time.format.ResolverStyle; |
13 | | -import java.time.temporal.ChronoField; |
14 | | -import java.time.temporal.ChronoUnit; |
15 | | -import java.time.temporal.TemporalAccessor; |
16 | | -import java.time.temporal.TemporalQueries; |
17 | 8 | import java.util.ArrayList; |
18 | 9 | import java.util.List; |
19 | 10 |
|
20 | 11 | import static org.cedar.schemas.avro.psi.TimeRangeDescriptor.*; |
| 12 | +import org.cedar.schemas.analyze.DateInfo; |
21 | 13 |
|
22 | 14 | public class Analyzers { |
23 | 15 | private static final Logger log = LoggerFactory.getLogger(Analyzers.class); |
24 | 16 |
|
25 | | - public static final DateTimeFormatter PARSE_DATE_FORMATTER = new DateTimeFormatterBuilder() |
26 | | - .appendOptional(DateTimeFormatter.ISO_ZONED_DATE_TIME) // e.g. 2010-12-30T00:00:00Z |
27 | | - .appendOptional(DateTimeFormatter.ISO_LOCAL_DATE_TIME) // e.g. 2010-12-30T00:00:00 |
28 | | - .appendOptional(DateTimeFormatter.ISO_LOCAL_DATE) // e.g. 2010-12-30 |
29 | | - .appendOptional(new DateTimeFormatterBuilder() |
30 | | - .appendValue(ChronoField.YEAR) // e.g. -200 |
31 | | - .optionalStart() |
32 | | - .appendPattern("-MM") // e.g. -200-10 |
33 | | - .optionalEnd() |
34 | | - .optionalStart() |
35 | | - .appendPattern("-dd") // e.g. -200-01-01 |
36 | | - .optionalEnd() |
37 | | - .toFormatter()) |
38 | | - .toFormatter() |
39 | | - .withResolverStyle(ResolverStyle.STRICT); |
40 | | - |
41 | 17 | public static ParsedRecord addAnalysis(ParsedRecord record) { |
42 | 18 | if (record == null) { |
43 | 19 | return null; // pass through |
@@ -65,7 +41,7 @@ public static Analysis analyze(Discovery discovery) { |
65 | 41 | } |
66 | 42 | return Analysis.newBuilder() |
67 | 43 | .setIdentification(analyzeIdentifiers(discovery)) |
68 | | - .setTemporalBounding(analyzeTemporalBounding(discovery)) |
| 44 | + .setTemporalBounding(Temporal.analyzeBounding(discovery)) |
69 | 45 | .setSpatialBounding(analyzeSpatialBounding(discovery)) |
70 | 46 | .setTitles(analyzeTitles(discovery)) |
71 | 47 | .setDescription(analyzeDescription(discovery)) |
@@ -95,41 +71,6 @@ public static IdentificationAnalysis analyzeIdentifiers(Discovery metadata) { |
95 | 71 | .build(); |
96 | 72 | } |
97 | 73 |
|
98 | | - public static TemporalBoundingAnalysis analyzeTemporalBounding(Discovery metadata) { |
99 | | - TemporalBoundingAnalysis.Builder builder = TemporalBoundingAnalysis.newBuilder(); |
100 | | - |
101 | | - if (metadata != null && metadata.getTemporalBounding() != null) { |
102 | | - // Gather info |
103 | | - DateInfo beginInfo = new DateInfo(metadata.getTemporalBounding().getBeginDate(), true); |
104 | | - DateInfo endInfo = new DateInfo(metadata.getTemporalBounding().getEndDate(), false); |
105 | | - DateInfo instantInfo = new DateInfo(metadata.getTemporalBounding().getInstant(), true); |
106 | | - TimeRangeDescriptor rangeDescriptor = rangeDescriptor(beginInfo, endInfo, instantInfo); |
107 | | - |
108 | | - // Build |
109 | | - builder.setBeginDescriptor(beginInfo.descriptor); |
110 | | - builder.setBeginPrecision(beginInfo.precision); |
111 | | - builder.setBeginIndexable(beginInfo.indexable); |
112 | | - builder.setBeginZoneSpecified(beginInfo.zoneSpecified); |
113 | | - builder.setBeginUtcDateTimeString(beginInfo.utcDateTimeString); |
114 | | - |
115 | | - builder.setEndDescriptor(endInfo.descriptor); |
116 | | - builder.setEndPrecision(endInfo.precision); |
117 | | - builder.setEndIndexable(endInfo.indexable); |
118 | | - builder.setEndZoneSpecified(endInfo.zoneSpecified); |
119 | | - builder.setEndUtcDateTimeString(endInfo.utcDateTimeString); |
120 | | - |
121 | | - builder.setInstantDescriptor(instantInfo.descriptor); |
122 | | - builder.setInstantPrecision(instantInfo.precision); |
123 | | - builder.setInstantIndexable(instantInfo.indexable); |
124 | | - builder.setInstantZoneSpecified(instantInfo.zoneSpecified); |
125 | | - builder.setInstantUtcDateTimeString(instantInfo.utcDateTimeString); |
126 | | - |
127 | | - builder.setRangeDescriptor(rangeDescriptor); |
128 | | - } |
129 | | - |
130 | | - return builder.build(); |
131 | | - } |
132 | | - |
133 | 74 | public static SpatialBoundingAnalysis analyzeSpatialBounding(Discovery metadata) { |
134 | 75 | SpatialBoundingAnalysis.Builder builder = SpatialBoundingAnalysis.newBuilder(); |
135 | 76 | if (metadata != null) { |
@@ -204,235 +145,4 @@ public StringInfo(String input) { |
204 | 145 | } |
205 | 146 | } |
206 | 147 |
|
207 | | - static class DateInfo implements Comparable<DateInfo> { |
208 | | - public final ValidDescriptor descriptor; |
209 | | - public final String precision; |
210 | | - public final boolean indexable; |
211 | | - public final String zoneSpecified; |
212 | | - public final String utcDateTimeString; |
213 | | - |
214 | | - public DateInfo(String dateString, boolean start) { |
215 | | - if (dateString == null || dateString.length() == 0) { |
216 | | - descriptor = ValidDescriptor.UNDEFINED; |
217 | | - precision = null; |
218 | | - indexable = true; |
219 | | - zoneSpecified = null; |
220 | | - utcDateTimeString = null; |
221 | | - return; |
222 | | - } |
223 | | - |
224 | | - Long longDate = parseLong(dateString); |
225 | | - TemporalAccessor parsedDate = parseDate(dateString); |
226 | | - if (longDate != null && !indexable(longDate)) { |
227 | | - descriptor = ValidDescriptor.VALID; |
228 | | - precision = precision(longDate); |
229 | | - indexable = indexable(longDate); |
230 | | - zoneSpecified = timezone(longDate); |
231 | | - utcDateTimeString = utcDateTimeString(longDate, start); |
232 | | - } |
233 | | - else if (parsedDate != null) { |
234 | | - descriptor = ValidDescriptor.VALID; |
235 | | - precision = precision(parsedDate); |
236 | | - indexable = indexable(parsedDate); |
237 | | - zoneSpecified = timezone(parsedDate); |
238 | | - utcDateTimeString = utcDateTimeString(parsedDate, start); |
239 | | - } |
240 | | - else { |
241 | | - descriptor = ValidDescriptor.INVALID; |
242 | | - precision = null; |
243 | | - indexable = false; |
244 | | - zoneSpecified = null; |
245 | | - utcDateTimeString = null; |
246 | | - } |
247 | | - } |
248 | | - |
249 | | - private static Long parseLong(String number) { |
250 | | - try { |
251 | | - return Long.parseLong(number); |
252 | | - } catch (Exception e) { |
253 | | - return null; |
254 | | - } |
255 | | - } |
256 | | - |
257 | | - private static TemporalAccessor parseDate(String date) { |
258 | | - try { |
259 | | - return PARSE_DATE_FORMATTER.parseBest( |
260 | | - date, |
261 | | - ZonedDateTime::from, |
262 | | - LocalDateTime::from, |
263 | | - LocalDate::from, |
264 | | - YearMonth::from, |
265 | | - Year::from); |
266 | | - } catch (Exception e) { |
267 | | - return null; |
268 | | - } |
269 | | - } |
270 | | - |
271 | | - @Override |
272 | | - public int compareTo(DateInfo o) { |
273 | | - boolean thisIndexable = this.indexable; |
274 | | - boolean oIndexable = o.indexable; |
275 | | - boolean thisIsYears = this.precision.equals(ChronoUnit.YEARS.toString()); |
276 | | - boolean oIsYears = o.precision.equals(ChronoUnit.YEARS.toString()); |
277 | | - |
278 | | - if (thisIndexable && oIndexable) { |
279 | | - // Compare actual dates with UTC string |
280 | | - ZonedDateTime thisDate = ZonedDateTime.parse(this.utcDateTimeString); |
281 | | - ZonedDateTime oDate = ZonedDateTime.parse(o.utcDateTimeString); |
282 | | - if (thisDate.isEqual(oDate)) { |
283 | | - return 0; |
284 | | - } else { |
285 | | - return thisDate.isBefore(oDate) ? -1 : 1; |
286 | | - } |
287 | | - } |
288 | | - else if ((thisIsYears && oIsYears) || (thisIsYears && oIndexable) || (thisIndexable && oIsYears)) { |
289 | | - // Compare years only as longs; parse both as string objects since both may not be just a long. |
290 | | - // Watch out for negative years... |
291 | | - String thisYearText = this.utcDateTimeString.substring(0, this.utcDateTimeString.indexOf('-', 1)); |
292 | | - String oYearText = o.utcDateTimeString.substring(0, o.utcDateTimeString.indexOf('-', 1)); |
293 | | - Long thisYear = Long.parseLong(thisYearText); |
294 | | - Long oYear = Long.parseLong(oYearText); |
295 | | - if (thisYear == oYear) { |
296 | | - return 0; |
297 | | - } else { |
298 | | - return thisYear < oYear ? -1 : 1; |
299 | | - } |
300 | | - } |
301 | | - else { |
302 | | - // One or both has an INVALID search format that is not just due to a paleo year |
303 | | - throw new DateTimeException("One or both dates being compared have an INVALID format."); |
304 | | - } |
305 | | - } |
306 | | - } |
307 | | - |
308 | | - static boolean indexable(Long year) { |
309 | | - // Year must be in the range [-292_275_055, 292_278_994] in order to be parsed as a date by ES (Joda time magic number). However, |
310 | | - // this number is a bit arbitrary, and prone to change when ES switches to the Java time library (minimum supported year |
311 | | - // being -999,999,999). We will limit the year ourselves instead to -100,000,000 -- since this is a fairly safe bet for |
312 | | - // supportability across many date libraries if the utcDateTime ends up used as is by a downstream app. |
313 | | - return year >= -100_000_000L; |
314 | | - } |
315 | | - |
316 | | - static boolean indexable(TemporalAccessor date) { |
317 | | - return true; // if it's a parsable accessor, it's indexable |
318 | | - } |
319 | | - |
320 | | - static String precision(Long year) { |
321 | | - return ChronoUnit.YEARS.toString(); |
322 | | - } |
323 | | - |
324 | | - static String precision(TemporalAccessor date) { |
325 | | - if (date == null) { |
326 | | - return null; |
327 | | - } |
328 | | - return date.query(TemporalQueries.precision()).toString(); |
329 | | - } |
330 | | - |
331 | | - static String timezone(Object date) { |
332 | | - return date instanceof ZonedDateTime ? ((ZonedDateTime) date).getOffset().toString() : null; |
333 | | - } |
334 | | - |
335 | | - static String utcDateTimeString(TemporalAccessor parsedDate, boolean start) { |
336 | | - if (parsedDate == null) { |
337 | | - return null; |
338 | | - } |
339 | | - |
340 | | - if (parsedDate instanceof Year) { |
341 | | - LocalDateTime yearDate = start ? |
342 | | - ((Year) parsedDate).atMonth(1).atDay(1).atStartOfDay() : |
343 | | - ((Year) parsedDate).atMonth(12).atEndOfMonth().atTime(23, 59, 59, 999000000); |
344 | | - return DateTimeFormatter.ISO_ZONED_DATE_TIME.format(yearDate.atZone(ZoneOffset.UTC)); |
345 | | - } |
346 | | - if (parsedDate instanceof YearMonth) { |
347 | | - LocalDateTime yearMonthDate = start ? |
348 | | - ((YearMonth) parsedDate).atDay(1).atStartOfDay() : |
349 | | - ((YearMonth) parsedDate).atEndOfMonth().atTime(23, 59, 59, 999000000); |
350 | | - return DateTimeFormatter.ISO_ZONED_DATE_TIME.format((yearMonthDate.atZone(ZoneOffset.UTC))); |
351 | | - } |
352 | | - if (parsedDate instanceof LocalDate) { |
353 | | - LocalDateTime localDate = start ? |
354 | | - ((LocalDate) parsedDate).atStartOfDay() : |
355 | | - ((LocalDate) parsedDate).atTime(23, 59, 59, 999000000); |
356 | | - return DateTimeFormatter.ISO_ZONED_DATE_TIME.format(localDate.atZone(ZoneOffset.UTC)); |
357 | | - } |
358 | | - if (parsedDate instanceof LocalDateTime) { |
359 | | - return DateTimeFormatter.ISO_ZONED_DATE_TIME.format(((LocalDateTime) parsedDate).atZone(ZoneOffset.UTC)); |
360 | | - } |
361 | | - if (parsedDate instanceof ZonedDateTime) { |
362 | | - return DateTimeFormatter.ISO_ZONED_DATE_TIME.format(((ZonedDateTime) parsedDate).withZoneSameInstant(ZoneOffset.UTC)); |
363 | | - } |
364 | | - |
365 | | - return null; |
366 | | - } |
367 | | - |
368 | | - static String utcDateTimeString(Long year, boolean start) { |
369 | | - return start ? year.toString() + "-01-01T00:00:00Z" : year.toString() + "-12-31T23:59:59.999Z"; |
370 | | - } |
371 | | - |
372 | | - static TimeRangeDescriptor rangeDescriptor(DateInfo beginInfo, DateInfo endInfo, DateInfo instantInfo) { |
373 | | - ValidDescriptor begin = beginInfo.descriptor; |
374 | | - ValidDescriptor end = endInfo.descriptor; |
375 | | - ValidDescriptor instant = instantInfo.descriptor; |
376 | | - |
377 | | - // A time range cannot be described as an error exists with one or more dates: |
378 | | - if(begin == ValidDescriptor.INVALID || |
379 | | - end == ValidDescriptor.INVALID || |
380 | | - instant == ValidDescriptor.INVALID) { |
381 | | - return NOT_APPLICABLE; |
382 | | - } |
383 | | - |
384 | | - // Dates are all undefined so range is undefined: |
385 | | - if (begin == ValidDescriptor.UNDEFINED && |
386 | | - end == ValidDescriptor.UNDEFINED && |
387 | | - instant == ValidDescriptor.UNDEFINED) { |
388 | | - return UNDEFINED; |
389 | | - } |
390 | | - |
391 | | - // If begin is valid but end is undefined, this indicates an ongoing range: |
392 | | - if (begin == ValidDescriptor.VALID && |
393 | | - end == ValidDescriptor.UNDEFINED && |
394 | | - instant == ValidDescriptor.UNDEFINED) { |
395 | | - return ONGOING; |
396 | | - } |
397 | | - |
398 | | - // Valid instant is straightforward: |
399 | | - if (begin == ValidDescriptor.UNDEFINED && |
400 | | - end == ValidDescriptor.UNDEFINED && |
401 | | - instant == ValidDescriptor.VALID) { |
402 | | - return INSTANT; |
403 | | - } |
404 | | - |
405 | | - // Dates describe more than one valid range descriptor, which is ambiguous: |
406 | | - if ( ( begin == ValidDescriptor.VALID && end == ValidDescriptor.VALID && instant == ValidDescriptor.VALID ) || |
407 | | - ( begin == ValidDescriptor.VALID && end == ValidDescriptor.UNDEFINED && instant == ValidDescriptor.VALID ) ) { |
408 | | - return AMBIGUOUS; |
409 | | - } |
410 | | - |
411 | | - // Begin and end dates are independently valid but based on how they compare to each other can describe very |
412 | | - // different range types: |
413 | | - if (begin == ValidDescriptor.VALID && |
414 | | - end == ValidDescriptor.VALID && |
415 | | - instant == ValidDescriptor.UNDEFINED) { |
416 | | - try { |
417 | | - int comparator = beginInfo.compareTo(endInfo); |
418 | | - TimeRangeDescriptor descriptor; |
419 | | - switch (comparator) { |
420 | | - case -1: descriptor = BOUNDED; |
421 | | - break; |
422 | | - case 0: descriptor = INSTANT; |
423 | | - break; |
424 | | - case 1: descriptor = BACKWARDS; |
425 | | - break; |
426 | | - default: descriptor = INVALID; |
427 | | - break; |
428 | | - } |
429 | | - return descriptor; |
430 | | - } catch(DateTimeException e) { |
431 | | - return INVALID; |
432 | | - } |
433 | | - } |
434 | | - |
435 | | - // Covers undefined begin date with valid end date which is meaningless, regardless of presence of an instant date |
436 | | - return INVALID; |
437 | | - } |
438 | 148 | } |
0 commit comments