Skip to content

Commit eb6d677

Browse files
authored
Merge pull request #202 from forslund/feature/sv-extract-duration
Add extract_duration for Swedish
2 parents 2cc837c + bb6239c commit eb6d677

File tree

2 files changed

+181
-5
lines changed

2 files changed

+181
-5
lines changed

lingua_franca/lang/parse_sv.py

Lines changed: 141 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,147 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515
#
16-
from datetime import datetime
16+
from datetime import datetime, timedelta
1717
from dateutil.relativedelta import relativedelta
18+
1819
from lingua_franca.time import now_local
19-
from .parse_common import is_numeric, look_for_fractions, Normalizer
20+
21+
from .parse_common import (is_numeric, look_for_fractions, Normalizer,
22+
tokenize, Token)
23+
24+
25+
def _find_numbers_in_text(tokens):
    """Pair every token with the number it produced, in reverse text order.

    Each token yields a (number, token) tuple. Tokens for which
    extract_number_sv() gives no truthy value are paired with None.

    The result is reversed relative to the input so that a duration parser
    walking the list meets the "size" word (minutes/hours/etc.) before the
    numbers that belong to it.

    Two compound words get an extra, synthetic size token (index -1) that
    does not exist in the input:
        * 'kvart', when it was recognised as a number, is given the
          size 'timmar'.
        * 'halvtimme' / 'halvtimma' becomes the number 30 with the
          size 'minuter'.

    Args:
        tokens: Tokens to parse

    Returns:
        list of (number, token) tuples, last input token first
    """
    # Collect in reading order and flip once at the end; this gives the
    # same ordering as inserting every new entry at the front.
    forward = []
    for token in tokens:
        word = token.word
        value = extract_number_sv(word)
        if value:
            forward.append((value, token))
            # Special case for quarter of an hour: the size word must end
            # up in front of the number once the list is reversed.
            if word == 'kvart':
                forward.append((None, Token('timmar', index=-1)))
        elif word in ('halvtimme', 'halvtimma'):
            forward.append((30, token))
            forward.append((None, Token('minuter', index=-1)))
        else:
            forward.append((None, token))

    forward.reverse()
    return forward
55+
56+
57+
def _combine_adjacent_numbers(number_map):
58+
"""Combine adjacent numbers through multiplication.
59+
60+
Walks through a number map and joins adjasent numbers to handle cases
61+
such as "en halvtimme" (one half hour).
62+
63+
Returns:
64+
(list): simplified number_map
65+
"""
66+
simplified = []
67+
skip = False
68+
for i in range(len(number_map) - 1):
69+
if skip:
70+
skip = False
71+
continue
72+
if number_map[i][0] and number_map[i + 1][0]:
73+
combined_number = number_map[i][0] * number_map[i + 1][0]
74+
combined_tokens = (number_map[i][1], number_map[i + 1][1])
75+
simplified.append((combined_number, combined_tokens))
76+
skip = True
77+
else:
78+
simplified.append((number_map[i][0], (number_map[i][1],)))
79+
80+
if not skip:
81+
simplified.append((number_map[-1][0], (number_map[-1][1],)))
82+
return simplified
83+
84+
85+
def extract_duration_sv(text):
    """Extract a duration from a Swedish phrase.

    The function handles durations from seconds up to days.

    Convert things like:
        "5 minuter"
        "om 2 och en halv timme"
        "två timmar och en kvart"
    into a timedelta.

    The words used in the duration will be consumed, and
    the remainder returned.

    As an example, "starta en 9 minuters timer" would return
    (timedelta(minutes=9), "starta timer").

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str) or None:
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing, joined by single spaces.
                    None (not a tuple) is returned if no duration is found.
    """
    tokens = tokenize(text)
    number_tok_map = _find_numbers_in_text(tokens)
    if not number_tok_map:
        # Nothing to parse (e.g. empty text), so there is no duration.
        return None
    # Combine adjacent numbers
    simplified = _combine_adjacent_numbers(number_tok_map)

    states = {
        'days': 0,
        'hours': 0,
        'minutes': 0,
        'seconds': 0
    }

    # Parser state, mapping words that should set the parser to collect
    # numbers to a specific time "size"
    state_words = {
        'days': ('dygn', 'dag', 'dagar', 'dags'),
        'hours': ('timmar', 'timme', 'timma', 'timmes', 'timmas'),
        'minutes': ('minuter', 'minuters', 'minut', 'minuts'),
        'seconds': ('sekunder', 'sekunders', 'sekund', 'sekunds')
    }
    # The trailing comma matters: ('och') is just the string 'och', and
    # `word in 'och'` would be a substring test matching 'o', 'ch', etc.
    binding_words = ('och',)

    consumed = []
    state = None
    valid = False

    # The map is in reverse text order, so a size word is seen first and
    # the numbers that follow it are added to that size.
    for num, toks in simplified:
        if state and num:
            states[state] += num
            consumed.extend(toks)
            valid = True  # If a state field got set this is valid duration
        elif num is None:
            word = toks[0].word
            for unit, unit_words in state_words.items():
                if word in unit_words:
                    state = unit
                    consumed.extend(toks)
                    break
            else:
                # Not a size word. A binding word ("och") keeps the
                # current size active; any other word ends it.
                if word not in binding_words:
                    state = None

    if not valid:
        return None

    remainder = ' '.join(t.word for t in tokens if t not in consumed)
    return timedelta(**states), remainder
20157

21158

22159
def extract_number_sv(text, short_scale=True, ordinals=False):
@@ -29,8 +166,8 @@ def extract_number_sv(text, short_scale=True, ordinals=False):
29166
(int) or (float): The value of extracted number
30167
"""
31168
# TODO: short_scale and ordinals don't do anything here.
32-
# The parameters are present in the function signature for API compatibility
33-
# reasons.
169+
# The parameters are present in the function signature for API
170+
# compatibility reasons.
34171
text = text.lower()
35172
aWords = text.split()
36173
and_pass = False

test/test_parse_sv.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,12 @@
1414
# limitations under the License.
1515
#
1616
import unittest
17-
from datetime import datetime, time
17+
from datetime import datetime, time, timedelta
1818

1919
from lingua_franca import load_language, unload_language
2020
from lingua_franca.parse import extract_datetime
2121
from lingua_franca.parse import extract_number
22+
from lingua_franca.parse import extract_duration
2223
from lingua_franca.parse import normalize
2324

2425

@@ -127,6 +128,44 @@ def test_numbers(self):
127128
lang='sv-se'),
128129
"det är 18 19 20 test")
129130

131+
class TestExtractDuration(unittest.TestCase):
    """Tests for Swedish duration extraction via extract_duration()."""

    def test_valid_extract_duration(self):
        """Duration in sentence."""
        # (phrase, expected duration, expected remaining text)
        cases = [
            ("5 minuter", timedelta(seconds=300), ''),
            ("om 2 och en halv timme",
             timedelta(hours=2, minutes=30), "om och"),
            ("starta en 9 minuters timer",
             timedelta(minutes=9), "starta timer"),
            # Extraction of things like "kvart" and "halvtimme"
            ("i en kvart", timedelta(minutes=15), "i"),
            ("hämta mig om två timmar och en kvart",
             timedelta(hours=2, minutes=15), "hämta mig om och"),
            ("om en halvtimme", timedelta(minutes=30), "om"),
        ]
        for phrase, expected_duration, expected_remainder in cases:
            td, remains = extract_duration(phrase, lang='sv-se')
            self.assertEqual(td, expected_duration)
            self.assertEqual(remains, expected_remainder)

    def test_invalid_extract_duration(self):
        """No duration in sentence."""
        for phrase in ("vad är en myrslok", "svaret är 42"):
            res = extract_duration(phrase, lang='sv-se')
            self.assertEqual(res, None)
168+
130169

131170
if __name__ == "__main__":
132171
unittest.main()

0 commit comments

Comments
 (0)