Skip to content

Commit d032edb

Browse files
committed
Add isegment function with chunking to avoid recursion limit
1 parent 6f85f90 commit d032edb

File tree

2 files changed

+31
-4
lines changed

2 files changed

+31
-4
lines changed

docs/api.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ WordSegment API Reference
2222

2323
Score a `word` in the context of the previous word, `prev`.
2424

25+
.. py:function:: isegment(text)
26+
:module: wordsegment
27+
28+
Return iterator of words that is the best segmentation of `text`.
29+
2530
.. py:function:: segment(text)
2631
:module: wordsegment
2732

wordsegment.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,8 @@ def score(word, prev=None):
9292

9393
return score(word)
9494

95-
def segment(text):
96-
"Return a list of words that is the best segmenation of `text`."
95+
def isegment(text):
96+
"Return iterator of words that is the best segmentation of `text`."
9797

9898
memo = dict()
9999

@@ -116,9 +116,31 @@ def candidates():
116116

117117
return max(candidates())
118118

119-
_, result_words = search(clean(text))
119+
# Avoid recursion limit issues by dividing text into chunks, segmenting
120+
# those chunks and combining the results together. Chunks may divide words
121+
# in the middle so prefix chunks with the last five words of the previous
122+
# result.
123+
124+
clean_text = clean(text)
125+
size = 250
126+
prefix = ''
127+
128+
for offset in range(0, len(clean_text), size):
129+
chunk = clean_text[offset:(offset + size)]
130+
_, chunk_words = search(prefix + chunk)
131+
prefix = ''.join(chunk_words[-5:])
132+
del chunk_words[-5:]
133+
for word in chunk_words:
134+
yield word
120135

121-
return result_words
136+
_, prefix_words = search(prefix)
137+
138+
for word in prefix_words:
139+
yield word
140+
141+
def segment(text):
142+
"Return a list of words that is the best segmentation of `text`."
143+
return list(isegment(text))
122144

123145
def main(args=()):
124146
"""Command-line entry-point. Parses `args` into in-file and out-file then

0 commit comments

Comments
 (0)