Skip to content

Commit 05b7687

Browse files
committed
Allow caller to forcibly set locale -- parse all dates as one locale.
1 parent 4010766 commit 05b7687

File tree

4 files changed

+92
-43
lines changed

4 files changed

+92
-43
lines changed

src/main/python/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
MANIFEST
12
*.pyc
23
chardet
34
pysolr.py

src/main/python/opensextant/extractors/xtemporal.py

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
NO_DAY = -5
2323

2424
log = logger_config("INFO", pkg=__name__)
25-
25+
_default_locale = None
2626

2727
def format_date(d):
2828
if isinstance(d, arrow.Arrow):
@@ -92,26 +92,37 @@ def normalize_month_num(slots: dict):
9292
return INVALID_DATE
9393

9494

95-
def test_european_locale(slots: dict):
95+
def test_european_locale(slots: dict, locale=None):
9696
"""
9797
9898
:param slots:
9999
:return: day, month
100100
"""
101-
if "DM1" in slots and "DM2" in slots:
102-
# Matched as MDY
103-
# But we test if DMY is valid based on values.
104-
try:
105-
day = int(slots["DM1"])
106-
mon = int(slots["DM2"])
101+
if not ("DM1" in slots and "DM2" in slots):
102+
return None, None
103+
104+
# Matched as MDY
105+
# But we test if DMY is valid based on values.
106+
try:
107+
day = int(slots["DM1"])
108+
mon = int(slots["DM2"])
109+
# First pass -- if LOCALE == "euro", then assume pattern matches DAY/MON/YYYY
110+
if locale and locale == "euro":
111+
if mon <= 12 and day <= 31:
112+
return day, mon
113+
else:
114+
return -1, -1
115+
else:
116+
# Otherwise -- this is a test and we're guessing. Only return
117+
# a date if the date pattern appears to be unambiguous; 03/05 could be Mar-5th or May-3rd, for example.
107118
if day > 12 and mon <= 12:
108119
# Valid match 31/12/... new year's eve.
109120
return day, mon
110121
if day > 12 and mon > 12:
111122
# Invalid date match for this pattern, e.g., 13/13/, or 30/13/...
112123
return -1, -1
113-
except:
114-
pass
124+
except:
125+
pass
115126
return None, None
116127

117128

@@ -217,11 +228,15 @@ def normalize_time(slots):
217228

218229

219230
class XTemporal(PatternExtractor):
220-
def __init__(self, cfg="datetime_patterns_py.cfg", debug=False):
231+
def __init__(self, cfg="datetime_patterns_py.cfg", debug=False, locale=None):
221232
"""
222233
:param cfg: patterns config file.
223234
"""
224235
PatternExtractor.__init__(self, RegexPatternManager(cfg, debug=debug, testing=debug))
236+
if locale:
237+
global _default_locale
238+
_default_locale = locale.lower()
239+
225240
if debug:
226241
log.setLevel("DEBUG")
227242

@@ -237,6 +252,20 @@ class Resolution:
237252

238253

239254
class DateTimeMatch(PatternMatch):
255+
"""
256+
DateTimeMatch puts out a matched date with attributes:
257+
258+
datenorm -- ISO yyyy-mm-dd date
259+
epoch -- seconds from 1970-01-01
260+
resolution -- D, M, h, m, s
261+
locale -- "north-am" or "euro".
262+
263+
If locale is set using XTemporal(locale='euro')
264+
matching Euro-style dates will be forced as such throughout the document.
265+
When locale is not set, the default is to only use euro locale for dates
266+
that are not ambiguous, e.g., 30/05/1977.
267+
Ambiguous dates (with no default locale used) are parsed as "north-am".
268+
"""
240269
def __init__(self, *args, **kwargs):
241270
PatternMatch.__init__(self, *args, **kwargs)
242271
self.case = PatternMatch.LOWER_CASE
@@ -273,7 +302,7 @@ def normalize(self):
273302
# resolution = Resolution.YEAR
274303
day, month = None, None
275304
if self.pattern_id in {"MDY-01", "MDY-02"}:
276-
day, month = test_european_locale(slots) # Uses DM slots only
305+
day, month = test_european_locale(slots, _default_locale) # Uses DM slots only
277306
if day and day < 0:
278307
return False
279308
if day and month:

src/main/python/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
setup(
1616
name='opensextant',
17-
version='1.6.7',
17+
version='1.6.8',
1818

1919
description='OpenSextant APIs and Utilities',
2020
long_description=long_description,
Lines changed: 49 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,59 @@
11
import copy
22
import os
3+
import unittest
34

45
from opensextant.extractors.xtemporal import XTemporal
56
from opensextant.utility import get_csv_writer, ensure_dirs
67

78

8-
def run_test():
9-
print("Run Default Tests")
10-
datex = XTemporal(debug=True)
11-
test_results = datex.default_tests()
12-
13-
print("Save Test Results")
14-
libdir = os.path.dirname(os.path.abspath(__file__))
15-
output = os.path.abspath(os.path.join(libdir, "..", "..", "results", "xtemporal-tests.csv"))
16-
ensure_dirs(output)
17-
print("... output file at ", output)
18-
19-
with open(output, "w", encoding="UTF-8") as fh:
20-
header = ["TEST", "TEXT", "RESULT", "MATCH_TEXT", "MATCH_ATTRS"]
21-
csvout = get_csv_writer(fh, header)
22-
csvout.writeheader()
23-
for result in test_results:
24-
25-
baserow = {
26-
"TEST": result["TEST"],
27-
"TEXT": result["TEXT"],
28-
"RESULT": result["PASS"],
29-
"MATCH_TEXT": "",
30-
"MATCH_ATTRS": ""
31-
}
32-
for m in result["MATCHES"]:
33-
row = copy.copy(baserow)
34-
row["MATCH_TEXT"] = m.text
35-
row["MATCH_ATTRS"] = repr(m.attrs)
36-
csvout.writerow(row)
9+
class TestXTemporal(unittest.TestCase):
10+
def test_xtemp_euro(self):
11+
datex = XTemporal(debug=True, locale="Euro")
12+
13+
euro_tests = [
14+
("text text 04/05/2025", "2025-05-04"),
15+
("text text 30/05/2025", "2025-05-30"),
16+
("text text 12/05/2025", "2025-05-12"),
17+
("text text 12/12/2025", "2025-12-12"),
18+
("text text 05/12/2025", "2025-12-05")
19+
]
20+
for tst, expected in euro_tests:
21+
for date_match in datex.extract(tst):
22+
result = date_match.attrs.get("datenorm")
23+
print(date_match.text, result, "Expected", expected)
24+
self.assertEqual(expected, result)
25+
26+
def run_test(self):
27+
print("Run Default Tests")
28+
29+
datex = XTemporal(debug=True)
30+
test_results = datex.default_tests()
31+
32+
print("Save Test Results")
33+
libdir = os.path.dirname(os.path.abspath(__file__))
34+
output = os.path.abspath(os.path.join(libdir, "..", "..", "results", "xtemporal-tests.csv"))
35+
ensure_dirs(output)
36+
print("... output file at ", output)
37+
38+
with open(output, "w", encoding="UTF-8") as fh:
39+
header = ["TEST", "TEXT", "RESULT", "MATCH_TEXT", "MATCH_ATTRS"]
40+
csvout = get_csv_writer(fh, header)
41+
csvout.writeheader()
42+
for result in test_results:
43+
44+
baserow = {
45+
"TEST": result["TEST"],
46+
"TEXT": result["TEXT"],
47+
"RESULT": result["PASS"],
48+
"MATCH_TEXT": "",
49+
"MATCH_ATTRS": ""
50+
}
51+
for m in result["MATCHES"]:
52+
row = copy.copy(baserow)
53+
row["MATCH_TEXT"] = m.text
54+
row["MATCH_ATTRS"] = repr(m.attrs)
55+
csvout.writerow(row)
3756

3857

3958
if __name__ == "__main__":
40-
run_test()
59+
unittest.main()

0 commit comments

Comments
 (0)