Skip to content

Commit 37f6ba8

Browse files
authored
Merge pull request #1054 from PyThaiNLP/add-calculate_ngram_counts
Add pythainlp.lm.calculate_ngram_counts
2 parents 2252dee + ca9446d commit 37f6ba8

File tree

4 files changed

+46
-4
lines changed

4 files changed

+46
-4
lines changed

docs/api/lm.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@ pythainlp.lm
66
Modules
77
-------
88

9+
.. autofunction:: calculate_ngram_counts
910
.. autofunction:: remove_repeated_ngrams

pythainlp/lm/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
# SPDX-FileType: SOURCE
44
# SPDX-License-Identifier: Apache-2.0
55

6-
__all__ = ["remove_repeated_ngrams"]
6+
__all__ = [
7+
"calculate_ngram_counts",
8+
"remove_repeated_ngrams"
9+
]
710

8-
from pythainlp.lm.text_util import remove_repeated_ngrams
11+
from pythainlp.lm.text_util import calculate_ngram_counts, remove_repeated_ngrams

pythainlp/lm/text_util.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,32 @@
44
# SPDX-License-Identifier: Apache-2.0
55
# ruff: noqa: C901
66

7-
from typing import List
7+
from typing import List, Tuple, Dict
8+
9+
10+
def calculate_ngram_counts(
    list_words: List[str],
    n_min: int = 2,
    n_max: int = 4,
) -> Dict[Tuple[str, ...], int]:
    """
    Count every contiguous n-gram of the given sizes in a list of words.

    All window sizes from ``n_min`` to ``n_max`` (inclusive) are counted
    into a single dictionary. Sizes larger than ``len(list_words)``
    contribute nothing, so an empty or too-short list yields ``{}``.
    Sizes smaller than 1 are ignored, because a window of zero or
    negative width does not describe an n-gram.

    :param List[str] list_words: List of words (tokens) to scan.
    :param int n_min: The minimum n-gram size (default: 2).
    :param int n_max: The maximum n-gram size (default: 4).

    :return: A dictionary where keys are n-grams (tuples of ``n`` words)
        and values are the number of times each n-gram occurs.
    :rtype: Dict[Tuple[str, ...], int]

    :Example:
    ::

        calculate_ngram_counts(["a", "b", "a", "b"], n_min=2, n_max=2)
        # output: {('a', 'b'): 2, ('b', 'a'): 1}
    """
    ngram_counts: Dict[Tuple[str, ...], int] = {}
    total_words = len(list_words)

    # Clamp the lower bound to 1: with n == 0 every position would yield the
    # empty tuple, and a negative n would slice with a negative stop index
    # and produce spans that are not n-grams at all.
    for n in range(max(n_min, 1), n_max + 1):
        # When n > total_words this range is empty, so nothing is counted.
        for start in range(total_words - n + 1):
            ngram = tuple(list_words[start:start + n])
            ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1

    return ngram_counts
833

934

1035
def remove_repeated_ngrams(string_list: List[str], n: int = 2) -> List[str]:

tests/core/test_lm.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,23 @@
55

66
import unittest
77

8-
from pythainlp.lm import remove_repeated_ngrams
8+
from pythainlp.lm import calculate_ngram_counts, remove_repeated_ngrams
99

1010

1111
class LMTestCase(unittest.TestCase):
12+
def test_calculate_ngram_counts(self):
13+
self.assertEqual(
14+
calculate_ngram_counts(['1', '2', '3', '4']),
15+
{
16+
('1', '2'): 1,
17+
('2', '3'): 1,
18+
('3', '4'): 1,
19+
('1', '2', '3'): 1,
20+
('2', '3', '4'): 1,
21+
('1', '2', '3', '4'): 1
22+
}
23+
)
24+
1225
def test_remove_repeated_ngrams(self):
1326
texts = ['เอา', 'เอา', 'แบบ', 'แบบ', 'แบบ', 'ไหน']
1427
self.assertEqual(

0 commit comments

Comments
 (0)