Skip to content

Commit ae8e000

Browse files
authored
Merge pull request #1 from wannaphong/copilot/port-newmm-to-c-api
Port newmm tokenizer from PyThaiNLP to C
2 parents 1a812a2 + dcba1f6 commit ae8e000

File tree

13 files changed

+1434
-1
lines changed

13 files changed

+1434
-1
lines changed

.github/workflows/test.yml

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
# CI workflow: build the CThaiNLP library and examples on Linux (gcc)
# and macOS (clang), run the unit-test suite, then smoke-test the
# example binary with Thai and non-Thai input.
name: Build and Test

on:
  push:
    branches: [ main, dev, copilot/** ]
  pull_request:
    branches: [ main, dev ]

jobs:
  build-and-test:
    name: Build and Test on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      # Let every OS finish even if one fails, so both toolchains report.
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest]
        include:
          - os: ubuntu-latest
            cc: gcc
          - os: macos-latest
            cc: clang

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Display system information
        run: |
          echo "OS: ${{ matrix.os }}"
          echo "Compiler: ${{ matrix.cc }}"
          ${{ matrix.cc }} --version
          make --version

      - name: Build project
        # CC is exported so the Makefile picks the matrix compiler.
        run: |
          make clean
          make all
        env:
          CC: ${{ matrix.cc }}

      - name: Run tests
        run: make test

      - name: Test basic example
        # With a dictionary argument, and with the built-in default
        # (last invocation: non-Thai text, no dictionary).
        run: |
          ./build/example_basic "ฉันไปโรงเรียน" data/thai_words.txt
          ./build/example_basic "วันนี้อากาศดีมาก" data/thai_words.txt
          ./build/example_basic "hello world 123"

      - name: Upload build artifacts
        # Only keep artifacts when something went wrong, for debugging.
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: build-artifacts-${{ matrix.os }}
          path: |
            build/
            lib/
          retention-days: 5

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,7 @@ dkms.conf
5353

5454
# debug information files
5555
*.dwo
56+
57+
# Build directories
58+
build/
59+
lib/

Makefile

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
# Makefile for CThaiNLP
#
# Builds the static library lib/libcthainlp.a plus the example and
# test binaries under build/.  Standard targets: all, test, clean.

# Honor CC from the environment or command line (the CI matrix exports
# CC=gcc / CC=clang per OS).  A plain `CC = gcc` would override the
# environment and silently ignore the CI's compiler choice, so only
# fall back to gcc when make supplies its built-in default (`cc`).
ifeq ($(origin CC),default)
CC := gcc
endif
CFLAGS := -Wall -Wextra -O2 -I./include
AR := ar
ARFLAGS := rcs

# Directories
SRC_DIR := src
INCLUDE_DIR := include
BUILD_DIR := build
EXAMPLES_DIR := examples
LIB_DIR := lib

# Source files
SOURCES := $(SRC_DIR)/trie.c $(SRC_DIR)/tcc.c $(SRC_DIR)/newmm.c
OBJECTS := $(BUILD_DIR)/trie.o $(BUILD_DIR)/tcc.o $(BUILD_DIR)/newmm.o

# Static library
LIBRARY := $(LIB_DIR)/libcthainlp.a

# Example / test programs
EXAMPLE_BASIC := $(BUILD_DIR)/example_basic
TEST_NEWMM := $(BUILD_DIR)/test_newmm

# Remove a half-written target when its recipe fails, so it is not
# mistaken for an up-to-date file on the next run.
.DELETE_ON_ERROR:

# Default target
all: $(LIBRARY) $(EXAMPLE_BASIC) $(TEST_NEWMM)

# Kept for backward compatibility (`make dirs` still works); the
# directories are now created on demand via the order-only
# prerequisites below, so `all` no longer needs to depend on it.
dirs: | $(BUILD_DIR) $(LIB_DIR)

$(BUILD_DIR) $(LIB_DIR):
	mkdir -p $@

# Object files.  The build directory is an order-only prerequisite
# (after `|`): it must exist, but its mtime — which changes whenever
# any file inside it changes — must not trigger rebuilds.
$(BUILD_DIR)/trie.o: $(SRC_DIR)/trie.c $(SRC_DIR)/trie.h | $(BUILD_DIR)
	$(CC) $(CFLAGS) -c $< -o $@

$(BUILD_DIR)/tcc.o: $(SRC_DIR)/tcc.c $(SRC_DIR)/tcc.h | $(BUILD_DIR)
	$(CC) $(CFLAGS) -c $< -o $@

$(BUILD_DIR)/newmm.o: $(SRC_DIR)/newmm.c $(SRC_DIR)/trie.h $(SRC_DIR)/tcc.h $(INCLUDE_DIR)/newmm.h | $(BUILD_DIR)
	$(CC) $(CFLAGS) -c $< -o $@

# Static library
$(LIBRARY): $(OBJECTS) | $(LIB_DIR)
	$(AR) $(ARFLAGS) $@ $^

# Example program
$(EXAMPLE_BASIC): $(EXAMPLES_DIR)/example_basic.c $(LIBRARY) | $(BUILD_DIR)
	$(CC) $(CFLAGS) $< -L$(LIB_DIR) -lcthainlp -o $@

# Test program
$(TEST_NEWMM): tests/test_newmm.c $(LIBRARY) | $(BUILD_DIR)
	$(CC) $(CFLAGS) $< -L$(LIB_DIR) -lcthainlp -o $@

# Build and run the unit-test suite
test: $(TEST_NEWMM)
	./$(TEST_NEWMM)

# Remove all build products ($(RM) = rm -f)
clean:
	$(RM) -r $(BUILD_DIR) $(LIB_DIR)

.PHONY: all dirs clean test

README.md

Lines changed: 210 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,210 @@
1-
# CThaiNLP
1+
# CThaiNLP
2+
3+
![Build and Test](https://github.com/wannaphong/CThaiNLP/actions/workflows/test.yml/badge.svg)
4+
5+
C implementation of Thai Natural Language Processing tools, ported from [PyThaiNLP](https://github.com/PyThaiNLP/pythainlp).
6+
7+
## Features
8+
9+
- **newmm**: Dictionary-based maximal matching word segmentation constrained by Thai Character Cluster (TCC) boundaries
10+
- Similar API to PyThaiNLP for easy migration from Python to C
11+
- UTF-8 support
12+
- Efficient Trie data structure for dictionary lookup
13+
- Handles mixed Thai/English/numeric content
14+
15+
## Building
16+
17+
### Prerequisites
18+
19+
- GCC or compatible C compiler
20+
- Make
21+
22+
### Compilation
23+
24+
```bash
25+
make
26+
```
27+
28+
This will create:
29+
- Static library: `lib/libcthainlp.a`
30+
- Example program: `build/example_basic`
31+
32+
## Usage
33+
34+
### Basic Example
35+
36+
```c
37+
#include "newmm.h"
38+
39+
int main() {
40+
const char* text = "ฉันไปโรงเรียน";
41+
int token_count;
42+
43+
// Segment text (with NULL for dict_path to use default dictionary)
44+
char** tokens = newmm_segment(text, NULL, &token_count);
45+
46+
// Print tokens
47+
for (int i = 0; i < token_count; i++) {
48+
printf("%s\n", tokens[i]);
49+
}
50+
51+
// Free memory
52+
newmm_free_result(tokens, token_count);
53+
54+
return 0;
55+
}
56+
```
57+
58+
### Compile Your Program
59+
60+
```bash
61+
gcc your_program.c -I./include -L./lib -lcthainlp -o your_program
62+
```
63+
64+
### Running Examples
65+
66+
Basic example with default dictionary:
67+
```bash
68+
./build/example_basic "ฉันไปโรงเรียน"
69+
```
70+
71+
With custom dictionary:
72+
```bash
73+
./build/example_basic "ฉันไปโรงเรียน" data/thai_words.txt
74+
```
75+
76+
### Running Tests
77+
78+
Run the test suite:
79+
```bash
80+
make test
81+
```
82+
83+
This will compile and run all unit tests to verify the tokenizer is working correctly.
84+
85+
## API Reference
86+
87+
### Functions
88+
89+
#### `char** newmm_segment(const char* text, const char* dict_path, int* token_count)`
90+
91+
Segment Thai text into words using the newmm algorithm.
92+
93+
**Parameters:**
94+
- `text`: Input text to segment (UTF-8 encoded)
95+
- `dict_path`: Path to dictionary file (one word per line, UTF-8). Use `NULL` for default dictionary
96+
- `token_count`: Output parameter - receives the number of tokens found
97+
98+
**Returns:**
99+
- Array of strings (tokens), or `NULL` on error
100+
- Caller must free the result using `newmm_free_result()`
101+
102+
**Example:**
103+
```c
104+
int count;
105+
char** tokens = newmm_segment("ฉันไปโรงเรียน", "dict.txt", &count);
106+
```
107+
108+
#### `void newmm_free_result(char** tokens, int token_count)`
109+
110+
Free memory allocated by `newmm_segment()`.
111+
112+
**Parameters:**
113+
- `tokens`: Array of tokens returned by `newmm_segment()`
114+
- `token_count`: Number of tokens in the array
115+
116+
**Example:**
117+
```c
118+
newmm_free_result(tokens, count);
119+
```
120+
121+
## Dictionary Format
122+
123+
Dictionary files should contain one word per line in UTF-8 encoding:
124+
125+
```
126+
ฉัน
127+
ไป
128+
โรงเรียน
129+
วันนี้
130+
อากาศ
131+
ดี
132+
มาก
133+
```
134+
135+
A sample dictionary is provided in `data/thai_words.txt`.
136+
137+
## Comparison with PyThaiNLP
138+
139+
The API is designed to be similar to PyThaiNLP's `segment()` function:
140+
141+
**PyThaiNLP (Python):**
142+
```python
143+
from pythainlp.tokenize import word_tokenize
144+
145+
text = "ฉันไปโรงเรียน"
146+
tokens = word_tokenize(text, engine="newmm")
147+
print(tokens) # ['ฉัน', 'ไป', 'โรงเรียน']
148+
```
149+
150+
**CThaiNLP (C):**
151+
```c
152+
const char* text = "ฉันไปโรงเรียน";
153+
int token_count;
154+
char** tokens = newmm_segment(text, NULL, &token_count);
155+
// tokens = ['ฉัน', 'ไป', 'โรงเรียน']
156+
newmm_free_result(tokens, token_count);
157+
```
158+
159+
## Algorithm
160+
161+
The newmm (New Maximum Matching) algorithm:
162+
163+
1. **Trie-based Dictionary Lookup**: Uses a trie data structure for efficient prefix matching
164+
2. **Thai Character Cluster (TCC) Boundaries**: Respects Thai character cluster rules for valid word boundaries
165+
3. **Maximal Matching**: Finds the longest dictionary word that matches at each position
166+
4. **Fallback Handling**: Handles non-dictionary words and non-Thai characters (Latin, digits, etc.)
167+
168+
## Project Structure
169+
170+
```
171+
CThaiNLP/
172+
├── include/
173+
│ └── newmm.h # Public API header
174+
├── src/
175+
│ ├── newmm.c # Main newmm implementation
176+
│ ├── trie.c # Trie data structure
177+
│ ├── trie.h # Trie header
178+
│ ├── tcc.c # Thai Character Cluster
179+
│ └── tcc.h # TCC header
180+
├── examples/
181+
│ └── example_basic.c # Basic usage example
182+
├── tests/
183+
│ └── test_newmm.c # Test suite
184+
├── data/
185+
│ └── thai_words.txt # Sample dictionary
186+
├── Makefile # Build configuration
187+
└── README.md # This file
188+
```
189+
190+
## Credits
191+
192+
- Original PyThaiNLP implementation: [PyThaiNLP Project](https://github.com/PyThaiNLP/pythainlp)
193+
- newmm algorithm: Based on work by Korakot Chaovavanich
194+
- TCC rules: Theeramunkong et al. 2000
195+
196+
## License
197+
198+
Apache License 2.0 (following PyThaiNLP's license)
199+
200+
## Contributing
201+
202+
Contributions are welcome! Please feel free to submit issues or pull requests.
203+
204+
## Future Enhancements
205+
206+
- [ ] Add more tokenization engines (attacut, deepcut, etc.)
207+
- [ ] Improve performance with optimized data structures
208+
- [ ] Add part-of-speech tagging
209+
- [ ] Add named entity recognition
210+
- [ ] Provide Python bindings (PyPI package)

0 commit comments

Comments
 (0)