Skip to content

Commit c4de24c

Browse files
authored
Merge pull request #1059 from PyThaiNLP/wannaphong/add-lcs
Add longest common subsequence algorithm
2 parents ef0e01d + 4c5c948 commit c4de24c

File tree

4 files changed

+86
-1
lines changed

4 files changed

+86
-1
lines changed

docs/api/util.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,11 @@ Modules
283283

284284
The `Trie` class is a data structure for efficient dictionary operations. It's a valuable resource for managing and searching word lists and dictionaries in a structured and efficient manner.
285285

286+
.. autofunction:: longest_common_subsequence
287+
:noindex:
288+
289+
The `longest_common_subsequence` function finds the longest common subsequence between two strings.
290+
286291
.. autofunction:: pythainlp.util.morse.morse_encode
287292
:noindex:
288293

pythainlp/util/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# -*- coding: utf-8 -*-
1+
# -*- coding: utf-8 -*-
22
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
33
# SPDX-FileType: SOURCE
44
# SPDX-License-Identifier: Apache-2.0
@@ -26,6 +26,7 @@
2626
"is_native_thai",
2727
"isthai",
2828
"isthaichar",
29+
"longest_common_subsequence",
2930
"nectec_to_ipa",
3031
"normalize",
3132
"now_reign_year",
@@ -92,6 +93,7 @@
9293
thai_to_eng,
9394
)
9495
from pythainlp.util.keywords import find_keyword, rank
96+
from pythainlp.util.lcs import longest_common_subsequence
9597
from pythainlp.util.normalize import (
9698
maiyamok,
9799
normalize,

pythainlp/util/lcs.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
3+
# SPDX-FileType: SOURCE
4+
# SPDX-License-Identifier: Apache-2.0
5+
6+
def longest_common_subsequence(str1: str, str2: str) -> str:
    """
    Find the longest common subsequence between two strings.

    A subsequence keeps the relative order of characters but does not
    need to be contiguous. Characters are compared one Python character
    at a time. When several subsequences share the maximum length,
    exactly one of them is returned.

    :param str str1: The first string.
    :param str str2: The second string.
    :return: The longest common subsequence, or an empty string if the
        two strings have no character in common.
    :rtype: str

    :Example:
    ::

        from pythainlp.util.lcs import longest_common_subsequence

        print(longest_common_subsequence("ABCBDAB", "BDCAB"))
        # output: "BDAB"
    """
    m = len(str1)
    n = len(str2)

    # Nothing can be shared with an empty string, so skip the table.
    if m == 0 or n == 0:
        return ""

    # dp[i][j] is the LCS length of str1[:i] and str2[:j].
    # Row 0 and column 0 stay at their initial value of 0.
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i, char1 in enumerate(str1, start=1):
        prev_row = dp[i - 1]
        row = dp[i]
        for j, char2 in enumerate(str2, start=1):
            if char1 == char2:
                row[j] = prev_row[j - 1] + 1
            else:
                row[j] = max(prev_row[j], row[j - 1])

    # Walk back from the bottom-right corner of the table, collecting the
    # matched characters. They are found last-to-first, hence the reverse.
    matched = []
    i = m
    j = n
    while i > 0 and j > 0:
        if str1[i - 1] == str2[j - 1]:
            # A shared character is part of the LCS.
            matched.append(str1[i - 1])
            i -= 1
            j -= 1
        elif dp[i - 1][j] > dp[i][j - 1]:
            # Strictly larger value above: drop a character of str1.
            i -= 1
        else:
            # Otherwise (including ties): drop a character of str2.
            j -= 1

    matched.reverse()
    return "".join(matched)

tests/core/test_util.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
ipa_to_rtgs,
3333
isthai,
3434
isthaichar,
35+
longest_common_subsequence,
3536
nectec_to_ipa,
3637
normalize,
3738
now_reign_year,
@@ -842,3 +843,13 @@ def test_th_zodiac(self):
842843

843844
# def test_abbreviation_to_full_text(self):
844845
# self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list))
846+
847+
def test_longest_common_subsequence(self):
    # Each case is (first string, second string, expected subsequence).
    # The last four cases cover inputs with no shared character and
    # inputs where one or both strings are empty.
    cases = (
        ("ABCBDAB", "BDCAB", "BDAB"),
        ("AGGTAB", "GXTXAYB", "GTAB"),
        ("ABCDGH", "AEDFHR", "ADH"),
        ("ABC", "AC", "AC"),
        ("ABC", "DEF", ""),
        ("", "ABC", ""),
        ("ABC", "", ""),
        ("", "", ""),
    )
    for first, second, expected in cases:
        self.assertEqual(longest_common_subsequence(first, second), expected)

0 commit comments

Comments
 (0)