Merged
58 changes: 58 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,58 @@
name: Build and Test

on:
  push:
    branches: [ main, dev, copilot/** ]
  pull_request:
    branches: [ main, dev ]

jobs:
  build-and-test:
    name: Build and Test on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest]
        include:
          - os: ubuntu-latest
            cc: gcc
          - os: macos-latest
            cc: clang

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Display system information
        run: |
          echo "OS: ${{ matrix.os }}"
          echo "Compiler: ${{ matrix.cc }}"
          ${{ matrix.cc }} --version
          make --version

      - name: Build project
        run: |
          make clean
          make all
        env:
          CC: ${{ matrix.cc }}

      - name: Run tests
        run: make test

      - name: Test basic example
        run: |
          ./build/example_basic "ฉันไปโรงเรียน" data/thai_words.txt
          ./build/example_basic "วันนี้อากาศดีมาก" data/thai_words.txt
          ./build/example_basic "hello world 123"

      - name: Upload build artifacts
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: build-artifacts-${{ matrix.os }}
          path: |
            build/
            lib/
          retention-days: 5
4 changes: 4 additions & 0 deletions .gitignore
@@ -53,3 +53,7 @@ dkms.conf

# debug information files
*.dwo

# Build directories
build/
lib/
63 changes: 63 additions & 0 deletions Makefile
@@ -0,0 +1,63 @@
# Makefile for CThaiNLP

CC = gcc
CFLAGS = -Wall -Wextra -O2 -I./include
AR = ar
ARFLAGS = rcs

# Directories
SRC_DIR = src
INCLUDE_DIR = include
BUILD_DIR = build
EXAMPLES_DIR = examples
LIB_DIR = lib

# Source files
SOURCES = $(SRC_DIR)/trie.c $(SRC_DIR)/tcc.c $(SRC_DIR)/newmm.c
OBJECTS = $(BUILD_DIR)/trie.o $(BUILD_DIR)/tcc.o $(BUILD_DIR)/newmm.o

# Library
LIBRARY = $(LIB_DIR)/libcthainlp.a

# Example programs
EXAMPLE_BASIC = $(BUILD_DIR)/example_basic
TEST_NEWMM = $(BUILD_DIR)/test_newmm

# Default target
all: dirs $(LIBRARY) $(EXAMPLE_BASIC) $(TEST_NEWMM)

# Create directories
dirs:
	@mkdir -p $(BUILD_DIR) $(LIB_DIR)

# Build object files
$(BUILD_DIR)/trie.o: $(SRC_DIR)/trie.c $(SRC_DIR)/trie.h
	$(CC) $(CFLAGS) -c $< -o $@

$(BUILD_DIR)/tcc.o: $(SRC_DIR)/tcc.c $(SRC_DIR)/tcc.h
	$(CC) $(CFLAGS) -c $< -o $@

$(BUILD_DIR)/newmm.o: $(SRC_DIR)/newmm.c $(SRC_DIR)/trie.h $(SRC_DIR)/tcc.h $(INCLUDE_DIR)/newmm.h
	$(CC) $(CFLAGS) -c $< -o $@

# Build library
$(LIBRARY): $(OBJECTS)
	$(AR) $(ARFLAGS) $@ $^

# Build example programs
$(EXAMPLE_BASIC): $(EXAMPLES_DIR)/example_basic.c $(LIBRARY)
	$(CC) $(CFLAGS) $< -L$(LIB_DIR) -lcthainlp -o $@

# Build test programs
$(TEST_NEWMM): tests/test_newmm.c $(LIBRARY)
	$(CC) $(CFLAGS) $< -L$(LIB_DIR) -lcthainlp -o $@

# Test target
test: $(TEST_NEWMM)
	./$(TEST_NEWMM)

# Clean
clean:
	rm -rf $(BUILD_DIR) $(LIB_DIR)

.PHONY: all dirs clean test
211 changes: 210 additions & 1 deletion README.md
@@ -1 +1,210 @@
# CThaiNLP

![Build and Test](https://github.com/wannaphong/CThaiNLP/actions/workflows/test.yml/badge.svg)

C implementation of Thai Natural Language Processing tools, ported from [PyThaiNLP](https://github.com/PyThaiNLP/pythainlp).

## Features

- **newmm**: Dictionary-based maximal matching word segmentation constrained by Thai Character Cluster (TCC) boundaries
- Similar API to PyThaiNLP for easy migration from Python to C
- UTF-8 support
- Efficient Trie data structure for dictionary lookup
- Handles mixed Thai/English/numeric content

## Building

### Prerequisites

- GCC or compatible C compiler
- Make

### Compilation

```bash
make
```

This will create:
- Static library: `lib/libcthainlp.a`
- Example program: `build/example_basic`

## Usage

### Basic Example

```c
#include <stdio.h>

#include "newmm.h"

int main(void) {
    const char* text = "ฉันไปโรงเรียน";
    int token_count;

    // Segment text (pass NULL as dict_path to use the default dictionary)
    char** tokens = newmm_segment(text, NULL, &token_count);
    if (tokens == NULL) {
        fprintf(stderr, "segmentation failed\n");
        return 1;
    }

    // Print one token per line
    for (int i = 0; i < token_count; i++) {
        printf("%s\n", tokens[i]);
    }

    // Free memory allocated by newmm_segment()
    newmm_free_result(tokens, token_count);

    return 0;
}
```

### Compile Your Program

```bash
gcc your_program.c -I./include -L./lib -lcthainlp -o your_program
```

### Running Examples

Basic example with default dictionary:
```bash
./build/example_basic "ฉันไปโรงเรียน"
```

With custom dictionary:
```bash
./build/example_basic "ฉันไปโรงเรียน" data/thai_words.txt
```

### Running Tests

Run the test suite:
```bash
make test
```

This will compile and run all unit tests to verify the tokenizer is working correctly.

## API Reference

### Functions

#### `char** newmm_segment(const char* text, const char* dict_path, int* token_count)`

Segment Thai text into words using the newmm algorithm.

**Parameters:**
- `text`: Input text to segment (UTF-8 encoded)
- `dict_path`: Path to dictionary file (one word per line, UTF-8). Use `NULL` for default dictionary
- `token_count`: Output parameter - receives the number of tokens found

**Returns:**
- Array of strings (tokens), or `NULL` on error
- Caller must free the result using `newmm_free_result()`

**Example:**
```c
int count;
char** tokens = newmm_segment("ฉันไปโรงเรียน", "dict.txt", &count);
```

#### `void newmm_free_result(char** tokens, int token_count)`

Free memory allocated by `newmm_segment()`.

**Parameters:**
- `tokens`: Array of tokens returned by `newmm_segment()`
- `token_count`: Number of tokens in the array

**Example:**
```c
newmm_free_result(tokens, count);
```

## Dictionary Format

Dictionary files should contain one word per line in UTF-8 encoding:

```
ฉัน
ไป
โรงเรียน
วันนี้
อากาศ
ดี
มาก
```

A sample dictionary is provided in `data/thai_words.txt`.

## Comparison with PyThaiNLP

The API is designed to be similar to PyThaiNLP's `segment()` function:

**PyThaiNLP (Python):**
```python
from pythainlp.tokenize import word_tokenize

text = "ฉันไปโรงเรียน"
tokens = word_tokenize(text, engine="newmm")
print(tokens) # ['ฉัน', 'ไป', 'โรงเรียน']
```

**CThaiNLP (C):**
```c
const char* text = "ฉันไปโรงเรียน";
int token_count;
char** tokens = newmm_segment(text, NULL, &token_count);
// tokens = ['ฉัน', 'ไป', 'โรงเรียน']
newmm_free_result(tokens, token_count);
```

## Algorithm

The newmm (New Maximum Matching) algorithm:

1. **Trie-based Dictionary Lookup**: Uses a trie data structure for efficient prefix matching
2. **Thai Character Cluster (TCC) Boundaries**: Respects Thai character cluster rules for valid word boundaries
3. **Maximal Matching**: Finds the longest dictionary word that matches at each position
4. **Fallback Handling**: Handles non-dictionary words and non-Thai characters (Latin, digits, etc.)
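The core of steps 1 and 3 can be sketched as a byte-level trie with a longest-match query. This is a simplified illustration, not the library's actual implementation: it omits the TCC boundary check and the fallback handling described above.

```c
#include <stdlib.h>

/* Byte-level trie node: one child slot per possible byte value.
 * UTF-8 words work unchanged, since they are just byte sequences. */
typedef struct TrieNode {
    struct TrieNode *child[256];
    int is_word;                 /* nonzero if a dictionary word ends here */
} TrieNode;

static TrieNode *trie_new(void) {
    return calloc(1, sizeof(TrieNode));
}

static void trie_insert(TrieNode *root, const char *word) {
    for (const unsigned char *p = (const unsigned char *)word; *p; p++) {
        if (!root->child[*p])
            root->child[*p] = trie_new();
        root = root->child[*p];
    }
    root->is_word = 1;
}

/* Length in bytes of the longest dictionary word that is a prefix of text,
 * or 0 if no dictionary word matches at this position. */
static size_t trie_longest_match(const TrieNode *root, const char *text) {
    size_t best = 0, len = 0;
    for (const unsigned char *p = (const unsigned char *)text; *p; p++) {
        root = root->child[*p];
        if (!root) break;
        len++;
        if (root->is_word) best = len;
    }
    return best;
}
```

A segmenter then walks the input, taking the longest match at each position (subject to TCC boundaries) and falling back to character- or cluster-level chunks when `trie_longest_match` returns 0.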

## Project Structure

```
CThaiNLP/
├── include/
│   └── newmm.h           # Public API header
├── src/
│   ├── newmm.c           # Main newmm implementation
│   ├── trie.c            # Trie data structure
│   ├── trie.h            # Trie header
│   ├── tcc.c             # Thai Character Cluster
│   └── tcc.h             # TCC header
├── examples/
│   └── example_basic.c   # Basic usage example
├── tests/
│   └── test_newmm.c      # Test suite
├── data/
│   └── thai_words.txt    # Sample dictionary
├── Makefile              # Build configuration
└── README.md             # This file
```

## Credits

- Original PyThaiNLP implementation: [PyThaiNLP Project](https://github.com/PyThaiNLP/pythainlp)
- newmm algorithm: Based on work by Korakot Chaovavanich
- TCC rules: Theeramunkong et al. 2000

## License

Apache License 2.0 (following PyThaiNLP's license)

## Contributing

Contributions are welcome! Please feel free to submit issues or pull requests.

## Future Enhancements

- [ ] Add more tokenization engines (attacut, deepcut, etc.)
- [ ] Improve performance with optimized data structures
- [ ] Add part-of-speech tagging
- [ ] Add named entity recognition
- [ ] Provide Python bindings (PyPI package)