Skip to content

Commit c4068a5

Browse files
authored
Merge pull request #4 from wannaphong/copilot/fork-newmm-tokenizer-thainlp
Implement newmm tokenizer with PyThaiNLP-compatible API
2 parents 7ba6fbf + 23d375e commit c4068a5

File tree

14 files changed

+63087
-17
lines changed

14 files changed

+63087
-17
lines changed

.github/workflows/dotnetcore.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ jobs:
88
runs-on: ubuntu-latest
99

1010
steps:
11-
- uses: actions/checkout@v2
12-
- name: Setup .NET Core
13-
uses: actions/setup-dotnet@v1
11+
- uses: actions/checkout@v4
12+
- name: Setup .NET
13+
uses: actions/setup-dotnet@v4
1414
with:
15-
dotnet-version: 3.1.101
15+
dotnet-version: '8.0.x'
1616
- name: Install dependencies
1717
run: dotnet restore
1818
- name: Build

README.md

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,101 @@
11
# thainlp.net
22
Thai NLP in .NET
3+
4+
## Features
5+
6+
### Word Tokenization
7+
- **newmm** - Dictionary-based maximal matching word segmentation constrained by Thai Character Cluster (TCC) boundaries
8+
- API similar to PyThaiNLP for easy migration from Python
9+
10+
### Subword Tokenization
11+
- **TCC** (Thai Character Cluster) tokenization for breaking text into character clusters
12+
13+
## Installation
14+
15+
Build the project:
16+
```bash
17+
dotnet build
18+
```
19+
20+
## Usage
21+
22+
### Word Tokenization (newmm)
23+
24+
Basic usage:
25+
```csharp
26+
using Thainlp;
27+
28+
// Simple tokenization
29+
var tokens = WordTokenizer.Tokenize("ประเทศไทยมีอากาศดี");
30+
// Output: ["ประเทศ", "ไทย", "มี", "อากาศ", "ดี"]
31+
32+
// With more options
33+
var tokensWithOptions = WordTokenizer.WordTokenize(
34+
text: "โอเคบ่พวกเรารักภาษาบ้านเกิด",
35+
engine: "newmm",
36+
keepWhitespace: true
37+
);
38+
// Output: ["โอเค", "บ่", "พวกเรา", "รัก", "ภาษา", "บ้านเกิด"]
39+
```
40+
41+
### Custom Dictionary
42+
43+
```csharp
44+
using Thainlp;
45+
using System.Collections.Generic;
46+
47+
// Create custom dictionary
48+
var customWords = new List<string> { "ชินโซ", "อาเบะ" };
49+
var customDict = new Trie(customWords);
50+
51+
// Use with tokenizer
52+
var tokens = WordTokenizer.WordTokenize(
53+
"ชินโซ อาเบะ เกิด 21 กันยายน",
54+
customDict: customDict
55+
);
56+
```
57+
58+
### TCC (Thai Character Cluster) Tokenization
59+
60+
```csharp
61+
using Thainlp;
62+
63+
// Tokenize into character clusters
64+
var clusters = TCC.Segment("ประเทศไทย");
65+
// Output: ["ป", "ระ", "เท", "ศ", "ไ", "ท", "ย"]
66+
67+
// Get cluster positions
68+
var positions = TCC.GetPositions("ประเทศไทย");
69+
```
70+
71+
### Legacy Subword API
72+
73+
```csharp
74+
using Thainlp;
75+
76+
// Original TCC implementation
77+
var clusters = Subword.tcc("ประเทศไทย");
78+
var positions = Subword.tcc_pos("ประเทศไทย");
79+
```
80+
81+
## API Compatibility with PyThaiNLP
82+
83+
This library provides an API similar to PyThaiNLP:
84+
85+
| PyThaiNLP | thainlp.net |
86+
|-----------|-------------|
87+
| `word_tokenize(text)` | `WordTokenizer.WordTokenize(text)` |
88+
| `word_tokenize(text, engine="newmm")` | `WordTokenizer.WordTokenize(text, engine: "newmm")` |
89+
| `word_tokenize(text, custom_dict=trie)` | `WordTokenizer.WordTokenize(text, customDict: trie)` |
90+
| `word_tokenize(text, keep_whitespace=False)` | `WordTokenizer.WordTokenize(text, keepWhitespace: false)` |
91+
92+
## Testing
93+
94+
Run the test suite:
95+
```bash
96+
dotnet test
97+
```
98+
99+
## License
100+
101+
See LICENSE file for details.

ThaiNLPTest/NewMMTest.cs

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
using Microsoft.VisualStudio.TestTools.UnitTesting;
2+
using System;
3+
using System.Collections.Generic;
4+
using System.Linq;
5+
using Thainlp;
6+
7+
namespace ThaiNLPTest
8+
{
9+
[TestClass]
10+
public class NewMMTest
11+
{
12+
[TestMethod]
13+
public void TestNewMMBasic()
14+
{
15+
// Test basic Thai text tokenization
16+
string text = "ประเทศไทย";
17+
var tokens = WordTokenizer.WordTokenize(text);
18+
19+
Assert.IsNotNull(tokens);
20+
Assert.IsTrue(tokens.Count >= 2);
21+
Assert.IsTrue(tokens.Contains("ประเทศ"));
22+
Assert.IsTrue(tokens.Contains("ไทย"));
23+
}
24+
25+
[TestMethod]
26+
public void TestNewMMLongText()
27+
{
28+
// Test longer sentence
29+
string text = "ประเทศไทยมีอากาศดี";
30+
var tokens = WordTokenizer.WordTokenize(text);
31+
32+
Assert.IsNotNull(tokens);
33+
Assert.IsTrue(tokens.Count >= 5);
34+
CollectionAssert.Contains(tokens, "ประเทศ");
35+
CollectionAssert.Contains(tokens, "ไทย");
36+
CollectionAssert.Contains(tokens, "มี");
37+
CollectionAssert.Contains(tokens, "อากาศ");
38+
CollectionAssert.Contains(tokens, "ดี");
39+
}
40+
41+
[TestMethod]
42+
public void TestNewMMComplexText()
43+
{
44+
// Test from PyThaiNLP example
45+
string text = "โอเคบ่พวกเรารักภาษาบ้านเกิด";
46+
var tokens = WordTokenizer.WordTokenize(text);
47+
48+
Assert.IsNotNull(tokens);
49+
CollectionAssert.Contains(tokens, "โอเค");
50+
CollectionAssert.Contains(tokens, "บ่");
51+
CollectionAssert.Contains(tokens, "พวกเรา");
52+
CollectionAssert.Contains(tokens, "รัก");
53+
CollectionAssert.Contains(tokens, "ภาษา");
54+
CollectionAssert.Contains(tokens, "บ้านเกิด");
55+
}
56+
57+
[TestMethod]
58+
public void TestNewMMEmptyString()
59+
{
60+
// Test empty string
61+
string text = "";
62+
var tokens = WordTokenizer.WordTokenize(text);
63+
64+
Assert.IsNotNull(tokens);
65+
Assert.AreEqual(0, tokens.Count);
66+
}
67+
68+
[TestMethod]
69+
public void TestNewMMNullString()
70+
{
71+
// Test null string
72+
string text = null;
73+
var tokens = WordTokenizer.WordTokenize(text);
74+
75+
Assert.IsNotNull(tokens);
76+
Assert.AreEqual(0, tokens.Count);
77+
}
78+
79+
[TestMethod]
80+
public void TestNewMMKeepWhitespace()
81+
{
82+
// Test keeping whitespace
83+
string text = "วรรณกรรม ภาพวาด";
84+
var tokensWithWhitespace = WordTokenizer.WordTokenize(text, keepWhitespace: true);
85+
var tokensWithoutWhitespace = WordTokenizer.WordTokenize(text, keepWhitespace: false);
86+
87+
Assert.IsTrue(tokensWithWhitespace.Count > tokensWithoutWhitespace.Count);
88+
Assert.IsFalse(tokensWithoutWhitespace.Any(t => string.IsNullOrWhiteSpace(t)));
89+
}
90+
91+
[TestMethod]
92+
public void TestNewMMConvenienceMethod()
93+
{
94+
// Test the convenience Tokenize method
95+
string text = "ประเทศไทย";
96+
var tokens = WordTokenizer.Tokenize(text);
97+
98+
Assert.IsNotNull(tokens);
99+
Assert.IsTrue(tokens.Count >= 2);
100+
}
101+
102+
[TestMethod]
103+
public void TestNewMMEngine()
104+
{
105+
// Test explicit engine parameter
106+
string text = "ประเทศไทย";
107+
var tokens = WordTokenizer.WordTokenize(text, engine: "newmm");
108+
109+
Assert.IsNotNull(tokens);
110+
Assert.IsTrue(tokens.Count >= 2);
111+
}
112+
113+
[TestMethod]
114+
[ExpectedException(typeof(ArgumentException))]
115+
public void TestNewMMInvalidEngine()
116+
{
117+
// Test invalid engine parameter
118+
string text = "ประเทศไทย";
119+
WordTokenizer.WordTokenize(text, engine: "invalid");
120+
}
121+
}
122+
}

ThaiNLPTest/TCCTest.cs

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
using Microsoft.VisualStudio.TestTools.UnitTesting;
2+
using System;
3+
using Thainlp;
4+
5+
namespace ThaiNLPTest
6+
{
7+
[TestClass]
8+
public class TCCTest
9+
{
10+
[TestMethod]
11+
public void TestTCCBasic()
12+
{
13+
// Test basic TCC tokenization
14+
string text = "ประเทศไทย";
15+
var tokens = TCC.Segment(text);
16+
17+
Assert.IsNotNull(tokens);
18+
Assert.IsTrue(tokens.Length > 0);
19+
}
20+
21+
[TestMethod]
22+
public void TestTCCConsistency()
23+
{
24+
// Test TCC tokenization is consistent
25+
string text = "ประเทศไทย";
26+
var tccResult1 = TCC.Segment(text);
27+
var tccResult2 = TCC.Segment(text);
28+
29+
CollectionAssert.AreEqual(tccResult1, tccResult2);
30+
}
31+
32+
[TestMethod]
33+
public void TestTCCPositions()
34+
{
35+
// Test TCC position detection
36+
string text = "ประเทศไทย";
37+
var positions = TCC.GetPositions(text);
38+
39+
Assert.IsNotNull(positions);
40+
Assert.IsTrue(positions.Count > 0);
41+
Assert.IsTrue(positions.Contains(text.Length));
42+
}
43+
44+
[TestMethod]
45+
public void TestTCCEmpty()
46+
{
47+
// Test TCC with empty string
48+
var clusters = TCC.Segment("");
49+
Assert.IsNotNull(clusters);
50+
Assert.AreEqual(0, clusters.Length);
51+
}
52+
53+
[TestMethod]
54+
public void TestTCCPositionCalculation()
55+
{
56+
// Test that TCC positions are correctly calculated
57+
string text = "กรุงเทพ";
58+
var positions = TCC.GetPositions(text);
59+
60+
Assert.IsNotNull(positions);
61+
Assert.IsTrue(positions.Count > 0);
62+
// The returned positions should always include the text length (checked with Contains, not as the final element)
63+
Assert.IsTrue(positions.Contains(text.Length));
64+
}
65+
}
66+
}

ThaiNLPTest/ThaiNLPTest.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22

33
<PropertyGroup>
4-
<TargetFramework>netcoreapp3.1</TargetFramework>
4+
<TargetFramework>net8.0</TargetFramework>
55

66
<IsPackable>false</IsPackable>
77
</PropertyGroup>

0 commit comments

Comments
 (0)