88from opensextant .FlexPat import PatternExtractor , RegexPatternManager , PatternMatch
99
1010
11- # History - 2024 may - MCU ported from XCoord Java
12- #
13- #
14- # TODO: error "precision" scales
15- # text formatting of normalized coordinate
16- # complete testing
17-
18- class XCoord (PatternExtractor ):
19- """
20- NOTE: a port of XCoord java (org.opensextant.extractors.xcoord, in Xponents-Core)
21- """
22-
23- def __init__ (self , cfg = "geocoord_patterns_py.cfg" , debug = False ):
24- """
25- :param cfg: patterns config file.
26- """
27- PatternExtractor .__init__ (self , RegexPatternManager (cfg , debug = debug , testing = debug ))
28-
29-
3011class ResolutionUncertainty :
3112 UNKNOWN = 100000
3213 REGIONAL = 50000
@@ -55,6 +36,26 @@ class Specificity:
5536 None : 1
5637}
5738
39+ default_specificity = Specificity .SUBDEG
40+
41+
# History - 2024 May - MCU ported from XCoord Java
43+ #
class XCoord(PatternExtractor):
    """
    Pattern-based extractor configured from a coordinate patterns file.

    NOTE: a port of XCoord java (org.opensextant.extractors.xcoord, in Xponents-Core)
    """

    def __init__(self, cfg="geocoord_patterns_py.cfg", debug=False, specificity=Specificity.SUBDEG):
        """
        :param cfg: patterns config file.
        :param debug: forwarded to RegexPatternManager as both ``debug`` and ``testing``.
        :param specificity: the abstract level of resolution for validating coordinates,
            e.g., DEGREE, SUB-DEGREE, etc.  Use the Specificity enumeration.
        """
        global default_specificity
        pattern_manager = RegexPatternManager(cfg, debug=debug, testing=debug)
        PatternExtractor.__init__(self, pattern_manager)
        # The threshold is kept in the module-level ``default_specificity``, not on the
        # instance, so the most recently constructed XCoord sets it for the whole module.
        default_specificity = specificity
58+
5859
5960def hemisphere_factor (sym : str ) -> int :
6061 if sym :
@@ -79,12 +80,14 @@ def is_blank(txt: str):
7980 return False
8081 return txt == '' or txt .strip () == ''
8182
82- def strip (txt :str ):
83+
def strip(txt: str):
    """
    Trim leading and trailing whitespace from ``txt``.

    :param txt: string to trim, or None.
    :return: the trimmed string; ``False`` (not ``None`` or ``""``) when ``txt`` is None.
    """
    # None is not a string -- the caller has to decide what a missing value means.
    return False if txt is None else txt.strip()
8789
90+
8891class Hemisphere :
8992 def __init__ (self , axis , slots = None ):
9093 self .axis = axis
@@ -178,7 +181,7 @@ def has_subsec(self):
178181 return self .specificity == Specificity .SUBSECOND
179182
180183 def has_symbols (self ):
181- return len (self .symbols ) > 1
184+ return len (self .symbols ) > 0
182185
183186 def normalize (self ):
184187 """
@@ -327,11 +330,11 @@ def __init__(self, *args, **kwargs):
327330 PatternMatch .__init__ (self , * args , ** kwargs )
328331 self .case = PatternMatch .UPPER_CASE
329332 self .geodetic = None
330- self .coordinate = None
331- self .parsing_err = None
332- self .lat_ordinate = None
333- self .lon_ordinate = None
334- self .filter = None
333+ self .coordinate : Coordinate = None
334+ self .parsing_err : str = None
335+ self .lat_ordinate : DMSOrdinate = None
336+ self .lon_ordinate : DMSOrdinate = None
337+ self .filter : GeocoordFilter = None
335338 self .pattern_family = self .pattern_id .split ("-" , 1 )[0 ]
336339
337340 def __str__ (self ):
@@ -358,6 +361,22 @@ def _make_coordinate(self):
358361 self .coordinate = Coordinate (None , lat = LL .lat , lon = LL .lon )
359362 # These are parsed by UTM and MGRS libraries, so coordinate is assumed valid.
360363
364+ def filter_by_resolution (self ):
365+ """ Check specificity of ordinates -- as parsed, if LAT or LON has the minimum level of detail
366+
367+ 40N -- could be "40 North"
368+ +40.0000 -- also "40 North", but precision is specified to 4sigfig.
369+ +40:00:00 -- well, could also be an hour marker ~ 40 hours
370+
371+ :return: TRUE if coordinate is specific and resolution is high enough.
372+ """
373+ if not self .lat_ordinate or not self .lon_ordinate :
374+ # If unset, we'll simply filter OUT
375+ return False
376+ lat_valid = self .lat_ordinate .specificity >= default_specificity
377+ lon_valid = self .lon_ordinate .specificity >= default_specificity
378+ return lat_valid and lon_valid
379+
361380
362381class GeocoordFilter :
363382 def filter_out (self , m : GeocoordMatch ) -> tuple :
@@ -369,7 +388,9 @@ def __init__(self):
369388 GeocoordFilter .__init__ (self )
370389 self .date_formats = ["DDMMMYYYY" , "DMMMYYHHmm" , "DDMMMYYHHmm" , "DDMMMYY" , "DMMMYY" , "HHZZZYYYY" ]
371390 self .sequences = ["1234" , "123456" , "12345678" , "1234567890" ]
372- self .stop_terms = { "PER" , "SEC" , "UTC" , "GMT" , "GAL" }
391+ self .stop_terms = {"PER" , "SEC" , "UTC" , "GMT" , "GAL" , "USC" , "CAN" ,
392+ "JAN" , "FEB" , "MAR" , "APR" , "MAY" , "JUN" ,
393+ "JUL" , "AUG" , "SEP" , "OCT" , "NOV" , "DEC" }
373394 self .today = arrow .utcnow ()
374395 self .YEAR = self .today .date ().year
375396 self .YY = self .YEAR - 2000
@@ -392,6 +413,8 @@ def filter_out(self, mgrs: GeocoordMatch) -> tuple:
392413
393414 if not (mgrs .text .isupper () and len (mgrs .text .replace (" " , "" )) > 6 ):
394415 return True , "lexical"
416+ if "\t " in mgrs .text or "\n " in mgrs .text :
417+ return True , "format-ws"
395418 for term in self .stop_terms :
396419 if term in mgrs .textnorm :
397420 return True , "measure"
@@ -430,6 +453,11 @@ def filter_out(self, dms: GeocoordMatch) -> tuple:
        Easy filter -- if punctuation matches, this is an easy pattern to ignore.
431454 :return: True if filtered out, false positive.
432455 """
456+ if dms .is_valid :
457+ if not dms .filter_by_resolution ():
458+ # Not valid -- or at least not meeting users level of specificity.
459+ return True
460+
433461 if dms .is_valid :
434462 if dms .text [0 ].isalpha ():
435463 return False , None
@@ -581,16 +609,18 @@ def validate(self):
581609 55.60, 80.11 -- not valid
582610 N55.60, W80.11 -- valid
583611 +55.60, -80.11 -- valid
612+ S20 E33 -- not valid, by default looking for sub-degree resolution.
584613
585614 Validate also if the coordinate is a valid range for Lat/Lon.
586615 """
587616 if not self .is_valid :
588617 return
618+
589619 lath = self .lat_ordinate .hemi
590620 lonh = self .lon_ordinate .hemi
591621 valid_hemi = lath and lonh and lath .is_alpha () and lonh .is_alpha ()
592622 valid_sym = self .lat_ordinate .has_symbols () or self .lon_ordinate .has_symbols ()
593- self .is_valid = valid_hemi or valid_sym
623+ self .is_valid = ( valid_hemi or valid_sym ) and self . filter_by_resolution ()
594624
595625 self .filtered_out = not self .is_valid
596626
0 commit comments