Skip to content

Commit 1d3e1e6

Browse files
committed
Add pythainlp.llm.remove_repeated_ngrams
1 parent 46fe81f commit 1d3e1e6

File tree

4 files changed

+89
-0
lines changed

4 files changed

+89
-0
lines changed

docs/api/llm.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
.. currentmodule:: pythainlp.llm
2+
3+
pythainlp.llm
4+
=============
5+
6+
Modules
7+
-------
8+
9+
.. autofunction:: remove_repeated_ngrams

pythainlp/llm/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3+
# SPDX-FileType: SOURCE
4+
# SPDX-License-Identifier: Apache-2.0
5+
6+
__all__ = ["remove_repeated_ngrams"]
7+
8+
from pythainlp.llm.text_util import remove_repeated_ngrams

pythainlp/llm/text_util.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3+
# SPDX-FileType: SOURCE
4+
# SPDX-License-Identifier: Apache-2.0
5+
# ruff: noqa: C901
6+
7+
from typing import List
8+
9+
10+
def _extend_with_overlap(
    output_list: List[str], tokens: List[str], max_overlap: int
) -> None:
    """
    Append ``tokens`` to ``output_list`` without repeating shared tokens.

    The longest prefix of ``tokens`` (at most ``max_overlap`` items) that
    ``output_list`` already ends with is skipped; the remaining tokens are
    appended in place.

    :param List[str] output_list: list being built; modified in place
    :param List[str] tokens: tokens to merge onto the end of the output
    :param int max_overlap: upper bound on the number of shared tokens
    """
    overlap = min(max_overlap, len(tokens), len(output_list))
    # Shrink until the end of the output equals the start of the tokens.
    # The loop stops at zero, so the ambiguous slice [-0:] is never taken.
    while overlap and output_list[-overlap:] != tokens[:overlap]:
        overlap -= 1
    output_list.extend(tokens[overlap:])


def remove_repeated_ngrams(string_list: List[str], n: int = 2) -> List[str]:
    """
    Remove repeated n-grams

    The list is scanned with a sliding window of ``n`` tokens. A window
    whose n-gram has already been seen is skipped. A new n-gram is merged
    onto the output: tokens that the output already ends with (up to
    ``n - 1`` of them) are not emitted again. The last ``n - 1`` tokens,
    which cannot start a full n-gram, are merged once in the same way.

    If ``string_list`` is empty or ``n`` is not positive, ``string_list``
    itself is returned. If it holds fewer than ``n`` tokens, no n-gram
    exists and a copy of the tokens is returned unchanged.

    :param List[str] string_list: List of strings (tokens)
    :param int n: n-gram size
    :return: List of strings with repeated n-grams removed
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.llm import remove_repeated_ngrams

        remove_repeated_ngrams(['เอา', 'เอา', 'แบบ', 'ไหน'], n=1)
        # output: ['เอา', 'แบบ', 'ไหน']
    """
    if not string_list or n <= 0:
        return string_list

    seen_ngrams = set()
    output_list: List[str] = []
    # Number of positions where a complete n-gram starts (0 if too short).
    window_count = max(len(string_list) - n + 1, 0)

    for start in range(window_count):
        window = list(string_list[start:start + n])
        key = tuple(window)
        if key in seen_ngrams:
            continue
        seen_ngrams.add(key)
        # Consecutive windows share n - 1 tokens; emit only the new part.
        _extend_with_overlap(output_list, window, n - 1)

    # Handle the trailing tokens exactly once. Comparing them as a unit
    # against the end of the output avoids re-emitting tokens that the
    # last window already contributed.
    tail = list(string_list[window_count:])
    _extend_with_overlap(output_list, tail, len(tail))

    return output_list
51+

tests/core/test_llm.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3+
# SPDX-FileType: SOURCE
4+
# SPDX-License-Identifier: Apache-2.0
5+
6+
import unittest
7+
8+
from pythainlp.llm import remove_repeated_ngrams
9+
10+
11+
class LlmTestCase(unittest.TestCase):
    """Tests for the helpers exported by pythainlp.llm."""

    def test_remove_repeated_ngrams(self):
        """Check remove_repeated_ngrams for n-gram sizes 1 and 2."""
        tokens = ['เอา', 'เอา', 'แบบ', 'แบบ', 'แบบ', 'ไหน']
        # Each entry pairs an n-gram size with the expected cleaned list.
        expected_by_size = (
            (1, ['เอา', 'แบบ', 'ไหน']),
            (2, ['เอา', 'เอา', 'แบบ', 'แบบ', 'ไหน']),
        )
        for size, expected in expected_by_size:
            self.assertEqual(
                remove_repeated_ngrams(tokens, n=size),
                expected,
            )

0 commit comments

Comments
 (0)