# Makefile for CThaiNLP
#
# Targets:
#   all (default) - static library, example program and test program
#   test          - build (if needed) and run the test suite
#   clean         - remove the build/ and lib/ directories
#
# The compiler defaults to gcc and can be selected with `CC=clang make`
# or `make CC=clang`.

# CC is a built-in make variable, so a plain `CC = gcc` would override a CC
# supplied through the environment (which is how the CI workflow passes its
# compiler matrix). Install our default only when nobody chose a compiler.
ifeq ($(origin CC),default)
CC = gcc
endif
CFLAGS = -Wall -Wextra -O2 -I./include
AR = ar
ARFLAGS = rcs

# Directories
SRC_DIR = src
INCLUDE_DIR = include
BUILD_DIR = build
EXAMPLES_DIR = examples
LIB_DIR = lib

# Source files
SOURCES = $(SRC_DIR)/trie.c $(SRC_DIR)/tcc.c $(SRC_DIR)/newmm.c
OBJECTS = $(BUILD_DIR)/trie.o $(BUILD_DIR)/tcc.o $(BUILD_DIR)/newmm.o

# Library
LIBRARY = $(LIB_DIR)/libcthainlp.a

# Example programs
EXAMPLE_BASIC = $(BUILD_DIR)/example_basic
TEST_NEWMM = $(BUILD_DIR)/test_newmm

# Default target
all: dirs $(LIBRARY) $(EXAMPLE_BASIC) $(TEST_NEWMM)

# Create output directories (kept as a named target for existing callers)
dirs: $(BUILD_DIR) $(LIB_DIR)

$(BUILD_DIR) $(LIB_DIR):
	@mkdir -p $@

# Build object files.
# The output directory is an order-only prerequisite (after the `|`): it is
# created before the recipe runs from any entry point (`make test`, `make -j`,
# a single object), but its timestamp never forces a rebuild.
$(BUILD_DIR)/trie.o: $(SRC_DIR)/trie.c $(SRC_DIR)/trie.h | $(BUILD_DIR)
	$(CC) $(CFLAGS) -c $< -o $@

$(BUILD_DIR)/tcc.o: $(SRC_DIR)/tcc.c $(SRC_DIR)/tcc.h | $(BUILD_DIR)
	$(CC) $(CFLAGS) -c $< -o $@

$(BUILD_DIR)/newmm.o: $(SRC_DIR)/newmm.c $(SRC_DIR)/trie.h $(SRC_DIR)/tcc.h $(INCLUDE_DIR)/newmm.h | $(BUILD_DIR)
	$(CC) $(CFLAGS) -c $< -o $@

# Build library ($^ does not include order-only prerequisites)
$(LIBRARY): $(OBJECTS) | $(LIB_DIR)
	$(AR) $(ARFLAGS) $@ $^

# Build example programs
$(EXAMPLE_BASIC): $(EXAMPLES_DIR)/example_basic.c $(LIBRARY) | $(BUILD_DIR)
	$(CC) $(CFLAGS) $< -L$(LIB_DIR) -lcthainlp -o $@

# Build test programs
$(TEST_NEWMM): tests/test_newmm.c $(LIBRARY) | $(BUILD_DIR)
	$(CC) $(CFLAGS) $< -L$(LIB_DIR) -lcthainlp -o $@

# Test target (run from the repository root: the tests open data/thai_words.txt)
test: $(TEST_NEWMM)
	./$(TEST_NEWMM)

# Clean
clean:
	rm -rf $(BUILD_DIR) $(LIB_DIR)

.PHONY: all dirs clean test
+ +## Features + +- **newmm**: Dictionary-based maximal matching word segmentation constrained by Thai Character Cluster (TCC) boundaries +- Similar API to PyThaiNLP for easy migration from Python to C +- UTF-8 support +- Efficient Trie data structure for dictionary lookup +- Handles mixed Thai/English/numeric content + +## Building + +### Prerequisites + +- GCC or compatible C compiler +- Make + +### Compilation + +```bash +make +``` + +This will create: +- Static library: `lib/libcthainlp.a` +- Example program: `build/example_basic` + +## Usage + +### Basic Example + +```c +#include "newmm.h" + +int main() { + const char* text = "ฉันไปโรงเรียน"; + int token_count; + + // Segment text (with NULL for dict_path to use default dictionary) + char** tokens = newmm_segment(text, NULL, &token_count); + + // Print tokens + for (int i = 0; i < token_count; i++) { + printf("%s\n", tokens[i]); + } + + // Free memory + newmm_free_result(tokens, token_count); + + return 0; +} +``` + +### Compile Your Program + +```bash +gcc your_program.c -I./include -L./lib -lcthainlp -o your_program +``` + +### Running Examples + +Basic example with default dictionary: +```bash +./build/example_basic "ฉันไปโรงเรียน" +``` + +With custom dictionary: +```bash +./build/example_basic "ฉันไปโรงเรียน" data/thai_words.txt +``` + +### Running Tests + +Run the test suite: +```bash +make test +``` + +This will compile and run all unit tests to verify the tokenizer is working correctly. + +## API Reference + +### Functions + +#### `char** newmm_segment(const char* text, const char* dict_path, int* token_count)` + +Segment Thai text into words using the newmm algorithm. + +**Parameters:** +- `text`: Input text to segment (UTF-8 encoded) +- `dict_path`: Path to dictionary file (one word per line, UTF-8). 
Use `NULL` for default dictionary +- `token_count`: Output parameter - receives the number of tokens found + +**Returns:** +- Array of strings (tokens), or `NULL` on error or when `text` is empty (in both cases `token_count` is set to 0) +- Caller must free the result using `newmm_free_result()` + +**Example:** +```c +int count; +char** tokens = newmm_segment("ฉันไปโรงเรียน", "dict.txt", &count); +``` + +#### `void newmm_free_result(char** tokens, int token_count)` + +Free memory allocated by `newmm_segment()`. + +**Parameters:** +- `tokens`: Array of tokens returned by `newmm_segment()` +- `token_count`: Number of tokens in the array + +**Example:** +```c +newmm_free_result(tokens, count); +``` + +## Dictionary Format + +Dictionary files should contain one word per line in UTF-8 encoding: + +``` +ฉัน +ไป +โรงเรียน +วันนี้ +อากาศ +ดี +มาก +``` + +A sample dictionary is provided in `data/thai_words.txt`. + +## Comparison with PyThaiNLP + +The API is designed to be similar to PyThaiNLP's `word_tokenize()` function: + +**PyThaiNLP (Python):** +```python +from pythainlp.tokenize import word_tokenize + +text = "ฉันไปโรงเรียน" +tokens = word_tokenize(text, engine="newmm") +print(tokens) # ['ฉัน', 'ไป', 'โรงเรียน'] +``` + +**CThaiNLP (C):** +```c +const char* text = "ฉันไปโรงเรียน"; +int token_count; +char** tokens = newmm_segment(text, "data/thai_words.txt", &token_count); +// tokens = ['ฉัน', 'ไป', 'โรงเรียน'] +newmm_free_result(tokens, token_count); +``` + +## Algorithm + +The newmm (New Maximum Matching) algorithm: + +1. **Trie-based Dictionary Lookup**: Uses a trie data structure for efficient prefix matching +2. **Thai Character Cluster (TCC) Boundaries**: Respects Thai character cluster rules for valid word boundaries +3. **Maximal Matching**: Finds the longest dictionary word that matches at each position +4. **Fallback Handling**: Handles non-dictionary words and non-Thai characters (Latin, digits, etc.) 
+ +## Project Structure + +``` +CThaiNLP/ +├── include/ +│ └── newmm.h # Public API header +├── src/ +│ ├── newmm.c # Main newmm implementation +│ ├── trie.c # Trie data structure +│ ├── trie.h # Trie header +│ ├── tcc.c # Thai Character Cluster +│ └── tcc.h # TCC header +├── examples/ +│ └── example_basic.c # Basic usage example +├── tests/ +│ └── test_newmm.c # Test suite +├── data/ +│ └── thai_words.txt # Sample dictionary +├── Makefile # Build configuration +└── README.md # This file +``` + +## Credits + +- Original PyThaiNLP implementation: [PyThaiNLP Project](https://github.com/PyThaiNLP/pythainlp) +- newmm algorithm: Based on work by Korakot Chaovavanich +- TCC rules: Theeramunkong et al. 2000 + +## License + +Apache License 2.0 (following PyThaiNLP's license) + +## Contributing + +Contributions are welcome! Please feel free to submit issues or pull requests. + +## Future Enhancements + +- [ ] Add more tokenization engines (attacut, deepcut, etc.) +- [ ] Improve performance with optimized data structures +- [ ] Add part-of-speech tagging +- [ ] Add named entity recognition +- [ ] Provide Python bindings (PyPI package) \ No newline at end of file diff --git a/data/thai_words.txt b/data/thai_words.txt new file mode 100644 index 0000000..b802e61 --- /dev/null +++ b/data/thai_words.txt @@ -0,0 +1,84 @@ +กา +ก็ +กัน +การ +กิน +ขอ +ของ +ของ +คน +คือ +ครั้ง +ครับ +คะ +ความ +งาน +จะ +จัด +จาก +ฉัน +ช่วย +ซึ่ง +ดี +ตาม +ตัว +ตาม +ถ้า +ถึง +ทุก +ที่ +ธรรมชาติ +นะ +นัก +นั้น +นี้ +ใน +บาท +ผล +ผู้ +พร้อม +ภาพ +มา +มาก +มี +ยัง +ระบบ +รับ +รัฐบาล +วัน +วันนี้ +ว่า +สิ่ง +สุด +หรือ +หลาย +ห้อง +ให้ +อยาก +อยู่ +อาจ +เขา +เข้า +เคย +เด็ก +เดือน +เนื่องจาก +เมื่อ +เป็น +เพื่อ +เรา +เราะ +เล่น +เอง +แต่ +แบบ +แล้ว +โดย +โครงการ +โรงเรียน +ใช้ +ให้ +ได้ +ไป +ไม่ +ไว้ diff --git a/examples/example_basic.c b/examples/example_basic.c new file mode 100644 index 0000000..f04678e --- /dev/null +++ b/examples/example_basic.c @@ -0,0 +1,56 @@ +/** + * @file example_basic.c + * 
/**
 * @brief Command-line demo of the newmm tokenizer.
 *
 * Usage: example_basic [text] [dict_path]
 *   - text:      string to segment; a built-in Thai sample is used when absent
 *   - dict_path: dictionary file; NULL (the library default) when absent
 *
 * Prints each token on its own line, then the whole result as a
 * PyThaiNLP-style list. Returns 1 when segmentation yields no result.
 */
int main(int argc, char* argv[]) {
    const char* input = (argc > 1) ? argv[1] : "ฉันไปโรงเรียน";
    const char* dictionary = (argc > 2) ? argv[2] : NULL;

    printf("Input text: %s\n", input);
    printf("Segmenting...\n");

    int n_tokens;
    char** words = newmm_segment(input, dictionary, &n_tokens);
    if (words == NULL) {
        fprintf(stderr, "Error: Failed to segment text\n");
        return 1;
    }

    /* One token per line, with its index */
    printf("Found %d tokens:\n", n_tokens);
    int idx = 0;
    while (idx < n_tokens) {
        printf(" [%d] %s\n", idx, words[idx]);
        idx++;
    }

    /* Same tokens as a quoted, comma-separated list (like PyThaiNLP) */
    printf("\nOutput: [");
    for (idx = 0; idx < n_tokens; idx++) {
        if (idx > 0) printf(", ");
        printf("'%s'", words[idx]);
    }
    printf("]\n");

    /* The library owns the allocation scheme, so it must do the freeing */
    newmm_free_result(words, n_tokens);

    return 0;
}
+ * GitHub: https://github.com/PyThaiNLP/pythainlp + * + * @author CThaiNLP + * @date 2026 + */ + +#ifndef NEWMM_H +#define NEWMM_H + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Segment Thai text into words using newmm algorithm + * + * @param text Input Thai text to be segmented (UTF-8 encoded) + * @param dict_path Path to dictionary file (one word per line, UTF-8 encoded) + * If NULL, uses a default minimal dictionary + * @param token_count Output parameter for number of tokens found + * @return Array of strings (tokens), caller must free using newmm_free_result() + * Returns NULL on error + */ +char** newmm_segment(const char* text, const char* dict_path, int* token_count); + +/** + * @brief Free memory allocated by newmm_segment + * + * @param tokens Array of tokens returned by newmm_segment() + * @param token_count Number of tokens in the array + */ +void newmm_free_result(char** tokens, int token_count); + +#ifdef __cplusplus +} +#endif + +#endif /* NEWMM_H */ diff --git a/src/newmm.c b/src/newmm.c new file mode 100644 index 0000000..e1af166 --- /dev/null +++ b/src/newmm.c @@ -0,0 +1,250 @@ +/** + * @file newmm.c + * @brief New Maximum Matching word segmentation implementation + */ + +#include "../include/newmm.h" +#include "trie.h" +#include "tcc.h" +#include +#include +#include +#include +#include + +#define MAX_GRAPH_SIZE 50 +#define MAX_TOKENS 10000 + +/* Graph structure for BFS */ +typedef struct { + int* edges; + int* edge_counts; + int* capacities; + int size; +} Graph; + +/* Helper: Check if position is in the valid positions set */ +static bool is_valid_pos(int pos, int* valid_pos, int num_valid) { + for (int i = 0; i < num_valid; i++) { + if (valid_pos[i] == pos) return true; + if (valid_pos[i] > pos) return false; + } + return false; +} + +/* Helper: Check if character is non-Thai */ +static bool is_non_thai_char(int codepoint) { + /* Latin letters, digits, spaces */ + if ((codepoint >= 'a' && codepoint <= 'z') || + (codepoint >= 
'A' && codepoint <= 'Z') || + (codepoint >= '0' && codepoint <= '9') || + codepoint == ' ' || codepoint == '\t' || + codepoint == '\r' || codepoint == '\n') { + return true; + } + /* Thai range */ + if (codepoint >= 0x0E00 && codepoint <= 0x0E7F) { + return false; + } + /* Other non-Thai */ + return true; +} + +/* UTF-8 helper */ +static int utf8_char_len(unsigned char c) { + if ((c & 0x80) == 0) return 1; + if ((c & 0xE0) == 0xC0) return 2; + if ((c & 0xF0) == 0xE0) return 3; + if ((c & 0xF8) == 0xF0) return 4; + return 1; +} + +static int get_utf8_codepoint(const char* str, int* byte_len) { + unsigned char c = (unsigned char)str[0]; + int len = utf8_char_len(c); + int codepoint = 0; + + if (len == 1) { + codepoint = c; + } else if (len == 2) { + codepoint = ((c & 0x1F) << 6) | (str[1] & 0x3F); + } else if (len == 3) { + codepoint = ((c & 0x0F) << 12) | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F); + } else if (len == 4) { + codepoint = ((c & 0x07) << 18) | ((str[1] & 0x3F) << 12) | + ((str[2] & 0x3F) << 6) | (str[3] & 0x3F); + } + + *byte_len = len; + return codepoint; +} + +/* Helper: Extract substring */ +static char* substring(const char* text, int start, int end) { + int len = end - start; + char* result = (char*)malloc(len + 1); + if (result) { + memcpy(result, text + start, len); + result[len] = '\0'; + } + return result; +} + +/* Simplified newmm segmentation */ +static int segment_text(const char* text, Trie* trie, char*** tokens) { + int text_len = strlen(text); + if (text_len == 0) return 0; + + /* Get valid TCC positions */ + int* valid_pos; + int num_valid = tcc_pos(text, &valid_pos); + if (num_valid == 0) { + free(valid_pos); + return 0; + } + + /* Allocate token array */ + *tokens = (char**)malloc(MAX_TOKENS * sizeof(char*)); + if (!*tokens) { + free(valid_pos); + return 0; + } + + int token_count = 0; + int pos = 0; + + while (pos < text_len) { + /* Try to find longest matching word from dictionary */ + char** prefixes; + int* lengths; + int 
num_prefixes = trie_prefixes(trie, text + pos, &prefixes, &lengths); + + int best_len = 0; + int best_end_pos = pos; + + /* Find longest valid prefix */ + for (int i = 0; i < num_prefixes; i++) { + int end_pos = pos + lengths[i]; + if (is_valid_pos(end_pos, valid_pos, num_valid) && lengths[i] > best_len) { + best_len = lengths[i]; + best_end_pos = end_pos; + } + } + + /* Free prefix results */ + for (int i = 0; i < num_prefixes; i++) { + free(prefixes[i]); + } + free(prefixes); + free(lengths); + + /* If found a dictionary word, use it */ + if (best_len > 0) { + (*tokens)[token_count++] = substring(text, pos, best_end_pos); + pos = best_end_pos; + } else { + /* Handle non-dictionary word */ + /* Check if it's a non-Thai sequence */ + int byte_len; + int cp = get_utf8_codepoint(text + pos, &byte_len); + + if (is_non_thai_char(cp)) { + /* Skip all consecutive non-Thai characters of same type */ + int end = pos + byte_len; + bool is_space = (cp == ' ' || cp == '\t'); + bool is_alpha = ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z')); + bool is_digit = (cp >= '0' && cp <= '9'); + + while (end < text_len) { + int next_cp = get_utf8_codepoint(text + end, &byte_len); + bool match = false; + + if (is_space && (next_cp == ' ' || next_cp == '\t')) match = true; + else if (is_alpha && ((next_cp >= 'a' && next_cp <= 'z') || (next_cp >= 'A' && next_cp <= 'Z'))) match = true; + else if (is_digit && ((next_cp >= '0' && next_cp <= '9') || next_cp == '.' 
|| next_cp == ',')) match = true; + + if (!match) break; + end += byte_len; + } + + (*tokens)[token_count++] = substring(text, pos, end); + pos = end; + } else { + /* Thai character not in dictionary - advance to next TCC boundary */ + /* Find next valid TCC boundary after current position */ + int next_pos = text_len; /* Default to end of text */ + for (int i = 0; i < num_valid; i++) { + if (valid_pos[i] > pos) { + next_pos = valid_pos[i]; + break; + } + } + + (*tokens)[token_count++] = substring(text, pos, next_pos); + pos = next_pos; + } + } + + if (token_count >= MAX_TOKENS - 1) break; + } + + free(valid_pos); + return token_count; +} + +/* Default minimal Thai dictionary */ +static const char* default_words[] = { + "ไป", "มา", "ใน", "ที่", "และ", "หรือ", "คือ", "เป็น", "มี", "ได้", + "จะ", "ไม่", "ของ", "กับ", "ก็", "ให้", "ถ้า", "แล้ว", "เมื่อ", "ซึ่ง", + "นี้", "นั้น", "อยู่", "เพื่อ", "การ", "ความ", "จาก", "โดย", "อย่าง", "ถึง", + "ว่า", "เอง", "ทุก", "แต่", "ตาม", "นัก", "ยัง", "ผล", "ผู้", "คน", + "วัน", "ปี", "เดือน", "ครั้ง", "ตัว", "คน", "สิ่ง", "งาน", "ข้อ", "รับ", + NULL +}; + +char** newmm_segment(const char* text, const char* dict_path, int* token_count) { + if (!text || !token_count) return NULL; + + *token_count = 0; + + /* Empty text */ + if (!text[0]) return NULL; + + /* Create trie */ + Trie* trie = trie_create(); + if (!trie) return NULL; + + /* Load dictionary */ + if (dict_path) { + if (trie_load_dict(trie, dict_path) < 0) { + /* Failed to load, use default */ + for (int i = 0; default_words[i] != NULL; i++) { + trie_add(trie, default_words[i]); + } + } + } else { + /* Use default dictionary */ + for (int i = 0; default_words[i] != NULL; i++) { + trie_add(trie, default_words[i]); + } + } + + /* Segment text */ + char** tokens = NULL; + int count = segment_text(text, trie, &tokens); + + /* Cleanup */ + trie_free(trie); + + *token_count = count; + return tokens; +} + +void newmm_free_result(char** tokens, int token_count) { + if (!tokens) 
/*
 * tcc.c -- simplified Thai Character Cluster (TCC) detection.
 *
 * Based on rules proposed by Theeramunkong et al. 2000
 * and improved rules used in PyThaiNLP's newmm
 */

/* Thai Unicode ranges */
#define THAI_START 0x0E00
#define THAI_END 0x0E7F

/* Thai character classes */
#define is_thai_consonant(c) ((c) >= 0x0E01 && (c) <= 0x0E2E)
#define is_thai_vowel_above(c) ((c) >= 0x0E34 && (c) <= 0x0E37)
#define is_thai_vowel_below(c) ((c) == 0x0E38 || (c) == 0x0E39)
#define is_thai_tone(c) ((c) >= 0x0E48 && (c) <= 0x0E4B)
#define is_thai_sign(c) ((c) == 0x0E4C || (c) == 0x0E4D || (c) == 0x0E4E)
#define is_thai_vowel_follow(c) ((c) >= 0x0E30 && (c) <= 0x0E33)
#define is_thai_vowel_lead(c) ((c) >= 0x0E40 && (c) <= 0x0E44)

/* UTF-8 helper: sequence length announced by a lead byte (1 if invalid) */
static int utf8_char_len(unsigned char c) {
    if ((c & 0x80) == 0) return 1;
    if ((c & 0xE0) == 0xC0) return 2;
    if ((c & 0xF0) == 0xE0) return 3;
    if ((c & 0xF8) == 0xF0) return 4;
    return 1;
}

/**
 * @brief Decode the UTF-8 character at @p str.
 *
 * A sequence whose continuation bytes are missing or malformed is decoded as
 * the single raw lead byte. Because a NUL terminator is never a continuation
 * byte, this function never reads past the end of a NUL-terminated string and
 * never reports a length that steps over the terminator.
 *
 * @param str      Pointer to the first byte of the character
 * @param byte_len Receives the number of bytes consumed (always >= 1)
 * @return Decoded code point
 */
static int get_utf8_codepoint(const char* str, int* byte_len) {
    const unsigned char* s = (const unsigned char*)str;
    int len = utf8_char_len(s[0]);
    int codepoint = s[0];

    for (int i = 1; i < len; i++) {
        if ((s[i] & 0xC0) != 0x80) {
            *byte_len = 1;
            return s[0];
        }
    }

    if (len == 2) {
        codepoint = ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
    } else if (len == 3) {
        codepoint = ((s[0] & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
    } else if (len == 4) {
        codepoint = ((s[0] & 0x07) << 18) | ((s[1] & 0x3F) << 12) |
                    ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
    }

    *byte_len = len;
    return codepoint;
}

/**
 * @brief Byte length of the character cluster starting at @p text.
 *
 * Simplified rules:
 *   - leading vowel + consonant [+ consonant] + {tone | sign | vowel above/below}
 *   - consonant [+ consonant] + {tone | sign | vowel above/below | following vowel}
 *   - anything else (including a leading vowel not followed by a consonant)
 *     is a one-character cluster
 *
 * @param text Pointer to a non-NUL byte inside a NUL-terminated string
 * @return Cluster length in bytes (always >= 1)
 */
static int get_tcc_length(const char* text) {
    int byte_len;
    int cp = get_utf8_codepoint(text, &byte_len);
    const char* ptr = text + byte_len;
    int allow_follow_vowel;

    if (is_thai_vowel_lead(cp)) {
        /* Leading vowel: must be followed by a consonant to form a cluster */
        int next_len;
        int next_cp;
        if (!*ptr) return byte_len;
        next_cp = get_utf8_codepoint(ptr, &next_len);
        if (!is_thai_consonant(next_cp)) return byte_len;
        ptr += next_len;
        allow_follow_vowel = 0;
    } else if (is_thai_consonant(cp)) {
        allow_follow_vowel = 1;
    } else {
        /* Single character (non-Thai or standalone) */
        return byte_len;
    }

    /* Optional: one additional consonant */
    if (*ptr) {
        int next_len;
        int next_cp = get_utf8_codepoint(ptr, &next_len);
        if (is_thai_consonant(next_cp)) {
            ptr += next_len;
        }
    }

    /* Optional: any number of tone marks, signs and dependent vowels */
    while (*ptr) {
        int next_len;
        int next_cp = get_utf8_codepoint(ptr, &next_len);
        int is_mark = is_thai_tone(next_cp) || is_thai_sign(next_cp) ||
                      is_thai_vowel_above(next_cp) || is_thai_vowel_below(next_cp) ||
                      (allow_follow_vowel && is_thai_vowel_follow(next_cp));
        if (!is_mark) break;
        ptr += next_len;
    }

    return (int)(ptr - text);
}

/**
 * @brief Get valid Thai Character Cluster breaking positions.
 *
 * @param text      Input text (UTF-8, NUL-terminated)
 * @param positions Receives a malloc'd, strictly ascending array with the
 *                  byte offset of the end of every cluster (the last entry is
 *                  the text length). Set to NULL whenever 0 is returned, so
 *                  the caller may free() it unconditionally.
 * @return Number of positions; 0 for NULL/empty input or allocation failure
 */
int tcc_pos(const char* text, int** positions) {
    if (!positions) return 0;
    *positions = NULL;
    if (!text || !text[0]) return 0;

    /* Allocate initial array */
    int capacity = 100;
    int* out = (int*)malloc((size_t)capacity * sizeof(int));
    if (!out) return 0;

    int count = 0;
    int byte_pos = 0;
    const char* ptr = text;

    while (*ptr) {
        int cluster_len = get_tcc_length(ptr);

        if (count >= capacity) {
            capacity *= 2;
            int* grown = (int*)realloc(out, (size_t)capacity * sizeof(int));
            if (!grown) {
                /* *positions is still NULL: no dangling pointer escapes */
                free(out);
                return 0;
            }
            out = grown;
        }

        byte_pos += cluster_len;
        ptr += cluster_len;
        out[count++] = byte_pos;
    }

    *positions = out;
    return count;
}
& 0x07) << 18) | ((str[1] & 0x3F) << 12) | + ((str[2] & 0x3F) << 6) | (str[3] & 0x3F); + } + + *byte_len = len; + return codepoint; +} + +/* Create a new trie node */ +static TrieNode* trie_node_create(void) { + TrieNode* node = (TrieNode*)calloc(1, sizeof(TrieNode)); + if (!node) return NULL; + + node->is_end = false; + node->children = NULL; + node->child_chars = NULL; + node->num_children = 0; + node->capacity = 0; + + return node; +} + +/* Free a trie node and all its children */ +static void trie_node_free(TrieNode* node) { + if (!node) return; + + for (int i = 0; i < node->num_children; i++) { + trie_node_free(node->children[i]); + } + + free(node->children); + free(node->child_chars); + free(node); +} + +/* Find child node by codepoint */ +static TrieNode* trie_node_get_child(TrieNode* node, int codepoint) { + for (int i = 0; i < node->num_children; i++) { + if (node->child_chars[i] == codepoint) { + return node->children[i]; + } + } + return NULL; +} + +/* Add child node */ +static TrieNode* trie_node_add_child(TrieNode* node, int codepoint) { + /* Check if need to expand capacity */ + if (node->num_children >= node->capacity) { + int new_capacity = node->capacity == 0 ? 
INITIAL_CAPACITY : node->capacity * 2; + + /* Allocate both arrays before updating pointers */ + TrieNode** new_children = (TrieNode**)realloc(node->children, + new_capacity * sizeof(TrieNode*)); + if (!new_children) return NULL; + + int* new_chars = (int*)realloc(node->child_chars, new_capacity * sizeof(int)); + if (!new_chars) { + /* new_children was allocated but new_chars failed */ + /* Since realloc succeeded for new_children, the old pointer is invalid */ + /* We must use the new_children pointer, even though we can't proceed */ + node->children = new_children; + /* Alternatively, we could try to restore by reallocating to old size */ + /* But that could also fail, so we just update and return NULL */ + return NULL; + } + + /* Both allocations succeeded, update pointers */ + node->children = new_children; + node->child_chars = new_chars; + node->capacity = new_capacity; + } + + /* Create new child */ + TrieNode* child = trie_node_create(); + if (!child) return NULL; + + node->children[node->num_children] = child; + node->child_chars[node->num_children] = codepoint; + node->num_children++; + + return child; +} + +Trie* trie_create(void) { + Trie* trie = (Trie*)malloc(sizeof(Trie)); + if (!trie) return NULL; + + trie->root = trie_node_create(); + if (!trie->root) { + free(trie); + return NULL; + } + + trie->num_words = 0; + return trie; +} + +void trie_add(Trie* trie, const char* word) { + if (!trie || !word || !word[0]) return; + + /* Trim leading/trailing whitespace */ + while (*word == ' ' || *word == '\t' || *word == '\r' || *word == '\n') { + word++; + } + if (!*word) return; + + int len = strlen(word); + while (len > 0 && (word[len-1] == ' ' || word[len-1] == '\t' || + word[len-1] == '\r' || word[len-1] == '\n')) { + len--; + } + if (len == 0) return; + + TrieNode* current = trie->root; + const char* ptr = word; + const char* end = word + len; + + while (ptr < end) { + int byte_len; + int codepoint = get_utf8_codepoint(ptr, &byte_len); + + TrieNode* child 
= trie_node_get_child(current, codepoint); + if (!child) { + child = trie_node_add_child(current, codepoint); + if (!child) return; /* Out of memory */ + } + + current = child; + ptr += byte_len; + } + + if (!current->is_end) { + current->is_end = true; + trie->num_words++; + } +} + +int trie_load_dict(Trie* trie, const char* dict_path) { + if (!trie || !dict_path) return -1; + + FILE* fp = fopen(dict_path, "r"); + if (!fp) return -1; + + char buffer[1024]; + int count = 0; + + while (fgets(buffer, sizeof(buffer), fp)) { + /* Remove newline */ + int len = strlen(buffer); + if (len > 0 && buffer[len-1] == '\n') { + buffer[len-1] = '\0'; + len--; + } + if (len > 0 && buffer[len-1] == '\r') { + buffer[len-1] = '\0'; + len--; + } + + if (len > 0) { + trie_add(trie, buffer); + count++; + } + } + + fclose(fp); + return count; +} + +int trie_prefixes(Trie* trie, const char* text, char*** prefixes, int** lengths) { + if (!trie || !text || !prefixes || !lengths) return 0; + + int max_prefixes = 100; /* Initial allocation */ + *prefixes = (char**)malloc(max_prefixes * sizeof(char*)); + *lengths = (int*)malloc(max_prefixes * sizeof(int)); + if (!*prefixes || !*lengths) { + /* Clean up any successful allocation */ + if (*prefixes) free(*prefixes); + if (*lengths) free(*lengths); + *prefixes = NULL; + *lengths = NULL; + return 0; + } + + int count = 0; + TrieNode* current = trie->root; + const char* ptr = text; + int byte_pos = 0; + + while (*ptr) { + int byte_len; + int codepoint = get_utf8_codepoint(ptr, &byte_len); + + TrieNode* child = trie_node_get_child(current, codepoint); + if (!child) break; + + byte_pos += byte_len; + + if (child->is_end) { + /* Need to expand arrays? 
*/ + if (count >= max_prefixes) { + max_prefixes *= 2; + char** new_prefixes = (char**)realloc(*prefixes, max_prefixes * sizeof(char*)); + int* new_lengths = (int*)realloc(*lengths, max_prefixes * sizeof(int)); + if (!new_prefixes || !new_lengths) break; + *prefixes = new_prefixes; + *lengths = new_lengths; + } + + /* Copy prefix */ + (*prefixes)[count] = (char*)malloc(byte_pos + 1); + if ((*prefixes)[count]) { + memcpy((*prefixes)[count], text, byte_pos); + (*prefixes)[count][byte_pos] = '\0'; + (*lengths)[count] = byte_pos; + count++; + } + } + + current = child; + ptr += byte_len; + } + + return count; +} + +void trie_free(Trie* trie) { + if (!trie) return; + + trie_node_free(trie->root); + free(trie); +} diff --git a/src/trie.h b/src/trie.h new file mode 100644 index 0000000..8c30534 --- /dev/null +++ b/src/trie.h @@ -0,0 +1,57 @@ +/** + * @file trie.h + * @brief Trie data structure for efficient dictionary lookup + * + * Internal header for trie implementation. + */ + +#ifndef TRIE_H +#define TRIE_H + +#include + +typedef struct TrieNode { + bool is_end; + struct TrieNode** children; + int* child_chars; /* UTF-8 code points of children */ + int num_children; + int capacity; +} TrieNode; + +typedef struct Trie { + TrieNode* root; + int num_words; +} Trie; + +/** + * @brief Create a new empty trie + */ +Trie* trie_create(void); + +/** + * @brief Add a word to the trie + */ +void trie_add(Trie* trie, const char* word); + +/** + * @brief Load words from a dictionary file + */ +int trie_load_dict(Trie* trie, const char* dict_path); + +/** + * @brief Get all possible word prefixes from text + * + * @param trie The trie structure + * @param text Input text (UTF-8) + * @param prefixes Output array of prefix strings (caller must free) + * @param lengths Output array of prefix byte lengths + * @return Number of prefixes found + */ +int trie_prefixes(Trie* trie, const char* text, char*** prefixes, int** lengths); + +/** + * @brief Free trie memory + */ +void 
/*
 * test_newmm.c -- test program for the newmm tokenizer.
 *
 * Each case segments a text, renders the tokens as a PyThaiNLP-style list
 * ("['a', 'b']") and compares that rendering with the expected string.
 */

/* One table-driven case; all table cases share the same dictionary file. */
typedef struct {
    const char* text;
    const char* expected;
    const char* description;
} TestCase;

static int test_count = 0;
static int test_passed = 0;

#define INITIAL_OUTPUT_SIZE 1024
#define TOKEN_OVERHEAD 4 /* For "'', " around each token */

/**
 * @brief Run one segmentation case and record the outcome.
 *
 * @param text        Text to segment
 * @param dict_path   Dictionary file, or NULL for the built-in dictionary
 * @param expected    Expected rendering of the token list
 * @param description Human-readable case name
 */
void run_test(const char* text, const char* dict_path, const char* expected, const char* description) {
    test_count++;
    printf("\n[Test %d] %s\n", test_count, description);
    printf("Input: %s\n", text);

    int token_count;
    char** tokens = newmm_segment(text, dict_path, &token_count);

    if (!tokens) {
        if (token_count != 0) {
            printf("❌ FAIL: Segmentation failed\n");
            return;
        }
        /* NULL with zero tokens is how an empty result is reported */
        printf("Output: []\n");
        printf("Expected: %s\n", expected);
        if (strcmp(expected, "[]") != 0) {
            printf("❌ FAIL\n");
        } else {
            printf("✓ PASS\n");
            test_passed++;
        }
        return;
    }

    /* Render the tokens into a growable buffer */
    size_t capacity = INITIAL_OUTPUT_SIZE;
    char* rendered = (char*)malloc(capacity);
    if (!rendered) {
        printf("❌ FAIL: Memory allocation failed\n");
        newmm_free_result(tokens, token_count);
        return;
    }

    size_t used = 0;
    rendered[used++] = '[';
    rendered[used] = '\0';

    for (int i = 0; i < token_count; i++) {
        size_t tok_len = strlen(tokens[i]);
        size_t needed = used + tok_len + TOKEN_OVERHEAD + 1; /* +1 for null */

        if (needed >= capacity) {
            capacity = needed * 2;
            char* bigger = (char*)realloc(rendered, capacity);
            if (!bigger) {
                printf("❌ FAIL: Memory reallocation failed\n");
                free(rendered);
                newmm_free_result(tokens, token_count);
                return;
            }
            rendered = bigger;
        }

        rendered[used++] = '\'';
        memcpy(rendered + used, tokens[i], tok_len);
        used += tok_len;
        rendered[used++] = '\'';
        if (i + 1 < token_count) {
            rendered[used++] = ',';
            rendered[used++] = ' ';
        }
        rendered[used] = '\0';
    }
    /* The separator reserved for the last token leaves room for "]" */
    rendered[used++] = ']';
    rendered[used] = '\0';

    printf("Output: %s\n", rendered);
    printf("Expected: %s\n", expected);

    /* Compare */
    if (strcmp(rendered, expected) != 0) {
        printf("❌ FAIL\n");
    } else {
        printf("✓ PASS\n");
        test_passed++;
    }

    free(rendered);
    newmm_free_result(tokens, token_count);
}

int main() {
    printf("=== CThaiNLP newmm Tokenizer Test Suite ===\n");

    const char* dict = "data/thai_words.txt";

    /* Tests 1-7: run against the sample dictionary file */
    const TestCase dict_cases[] = {
        { "ฉันไปโรงเรียน",
          "['ฉัน', 'ไป', 'โรงเรียน']",
          "Basic Thai sentence" },
        { "วันนี้อากาศดีมาก",
          "['วันนี้', 'อา', 'กา', 'ศดี', 'มาก']",
          "Thai sentence with partial dictionary match" },
        { "hello world",
          "['hello', ' ', 'world']",
          "English text" },
        { "123",
          "['123']",
          "Numbers only" },
        { "ไป ABC 123",
          "['ไป', ' ', 'ABC', ' ', '123']",
          "Mixed Thai, English, and numbers" },
        { "",
          "[]",
          "Empty string" },
        { "ไป",
          "['ไป']",
          "Single Thai word" },
    };
    const size_t num_dict_cases = sizeof(dict_cases) / sizeof(dict_cases[0]);

    for (size_t i = 0; i < num_dict_cases; i++) {
        run_test(dict_cases[i].text, dict, dict_cases[i].expected, dict_cases[i].description);
    }

    /* Test 8: Using default dictionary */
    run_test("ฉันไปโรงเรียน", NULL,
             "['ฉั', 'น', 'ไป', 'โรง', 'เรี', 'ยน']",
             "Default dictionary (limited words)");

    /* Summary */
    printf("\n=== Test Summary ===\n");
    printf("Total tests: %d\n", test_count);
    printf("Passed: %d\n", test_passed);
    printf("Failed: %d\n", test_count - test_passed);

    if (test_passed != test_count) {
        printf("\n❌ Some tests failed\n");
        return 1;
    }

    printf("\n✓ All tests passed!\n");
    return 0;
}