diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..b1c6b7a
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,58 @@
+name: Build and Test
+
+on:
+  push:
+    branches: [ main, dev, copilot/** ]
+  pull_request:
+    branches: [ main, dev ]
+
+jobs:
+  build-and-test:
+    name: Build and Test on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+        include:
+          - os: ubuntu-latest
+            cc: gcc
+          - os: macos-latest
+            cc: clang
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Display system information
+      run: |
+        echo "OS: ${{ matrix.os }}"
+        echo "Compiler: ${{ matrix.cc }}"
+        ${{ matrix.cc }} --version
+        make --version
+
+    - name: Build project
+      # Pass CC on the make command line: a CC assignment inside a Makefile
+      # takes precedence over a CC exported through the environment.
+      run: |
+        make clean
+        make all CC="${{ matrix.cc }}"
+
+    - name: Run tests
+      run: make test
+
+    - name: Test basic example
+      run: |
+        ./build/example_basic "ฉันไปโรงเรียน" data/thai_words.txt
+        ./build/example_basic "วันนี้อากาศดีมาก" data/thai_words.txt
+        ./build/example_basic "hello world 123"
+
+    - name: Upload build artifacts
+      if: failure()
+      uses: actions/upload-artifact@v4
+      with:
+        name: build-artifacts-${{ matrix.os }}
+        path: |
+          build/
+          lib/
+        retention-days: 5
diff --git a/.gitignore b/.gitignore
index 845cda6..8c6d7a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,3 +53,7 @@ dkms.conf
 
 # debug information files
 *.dwo
+
+# Build directories
+build/
+lib/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..c8450d5
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,63 @@
+# Makefile for CThaiNLP
+# CC defaults to gcc; a CC from the environment or the command line wins.
+ifeq ($(origin CC),default)
+CC = gcc
+endif
+CFLAGS = -Wall -Wextra -O2 -I./include
+AR = ar
+ARFLAGS = rcs
+
+# Directories
+SRC_DIR = src
+INCLUDE_DIR = include
+BUILD_DIR = build
+EXAMPLES_DIR = examples
+LIB_DIR = lib
+
+# Source files, object files and the static library built from them
+SOURCES = $(SRC_DIR)/trie.c $(SRC_DIR)/tcc.c $(SRC_DIR)/newmm.c
+OBJECTS = $(BUILD_DIR)/trie.o $(BUILD_DIR)/tcc.o $(BUILD_DIR)/newmm.o
+LIBRARY = $(LIB_DIR)/libcthainlp.a
+
+# Example programs
+EXAMPLE_BASIC = $(BUILD_DIR)/example_basic
+TEST_NEWMM = $(BUILD_DIR)/test_newmm
+
+# Default target
+all: dirs $(LIBRARY) $(EXAMPLE_BASIC) $(TEST_NEWMM)
+
+# Create directories
+dirs:
+	@mkdir -p $(BUILD_DIR) $(LIB_DIR)
+
+# Build object files ("| dirs" is order-only: creates the directories first, never forces a rebuild)
+$(BUILD_DIR)/trie.o: $(SRC_DIR)/trie.c $(SRC_DIR)/trie.h | dirs
+	$(CC) $(CFLAGS) -c $< -o $@
+
+$(BUILD_DIR)/tcc.o: $(SRC_DIR)/tcc.c $(SRC_DIR)/tcc.h | dirs
+	$(CC) $(CFLAGS) -c $< -o $@
+
+$(BUILD_DIR)/newmm.o: $(SRC_DIR)/newmm.c $(SRC_DIR)/trie.h $(SRC_DIR)/tcc.h $(INCLUDE_DIR)/newmm.h | dirs
+	$(CC) $(CFLAGS) -c $< -o $@
+
+# Build library ($^ excludes the order-only prerequisite)
+$(LIBRARY): $(OBJECTS) | dirs
+	$(AR) $(ARFLAGS) $@ $^
+
+# Build example programs
+$(EXAMPLE_BASIC): $(EXAMPLES_DIR)/example_basic.c $(LIBRARY)
+	$(CC) $(CFLAGS) $< -L$(LIB_DIR) -lcthainlp -o $@
+
+# Build test programs
+$(TEST_NEWMM): tests/test_newmm.c $(LIBRARY)
+	$(CC) $(CFLAGS) $< -L$(LIB_DIR) -lcthainlp -o $@
+
+# Test target
+test: $(TEST_NEWMM)
+	./$(TEST_NEWMM)
+
+# Clean
+clean:
+	rm -rf $(BUILD_DIR) $(LIB_DIR)
+
+.PHONY: all dirs clean test
diff --git a/README.md b/README.md
index 9b09349..e274581 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,210 @@
-# CThaiNLP
\ No newline at end of file
+# CThaiNLP
+
+![Build and Test](https://github.com/wannaphong/CThaiNLP/actions/workflows/test.yml/badge.svg)
+
+C implementation of Thai Natural Language Processing tools, ported from [PyThaiNLP](https://github.com/PyThaiNLP/pythainlp).
+
+## Features
+
+- **newmm**: Dictionary-based maximal matching word segmentation constrained by Thai Character Cluster (TCC) boundaries
+- Similar API to PyThaiNLP for easy migration from Python to C
+- UTF-8 support
+- Efficient Trie data structure for dictionary lookup
+- Handles mixed Thai/English/numeric content
+
+## Building
+
+### Prerequisites
+
+- GCC or compatible C compiler
+- Make
+
+### Compilation
+
+```bash
+make
+```
+
+This will create:
+- Static library: `lib/libcthainlp.a`
+- Example and test programs: `build/example_basic`, `build/test_newmm`
+
+## Usage
+
+### Basic Example
+
+```c
+#include <stdio.h>
+#include "newmm.h"
+int main() {
+    const char* text = "ฉันไปโรงเรียน";
+    int token_count;
+    
+    // Segment text (with NULL for dict_path to use default dictionary)
+    char** tokens = newmm_segment(text, NULL, &token_count);
+    
+    // Print tokens
+    for (int i = 0; i < token_count; i++) {
+        printf("%s\n", tokens[i]);
+    }
+    
+    // Free memory
+    newmm_free_result(tokens, token_count);
+    
+    return 0;
+}
+```
+
+### Compile Your Program
+
+```bash
+gcc your_program.c -I./include -L./lib -lcthainlp -o your_program
+```
+
+### Running Examples
+
+Basic example with default dictionary:
+```bash
+./build/example_basic "ฉันไปโรงเรียน"
+```
+
+With custom dictionary:
+```bash
+./build/example_basic "ฉันไปโรงเรียน" data/thai_words.txt
+```
+
+### Running Tests
+
+Run the test suite:
+```bash
+make test
+```
+
+This will compile and run all unit tests to verify the tokenizer is working correctly.
+
+## API Reference
+
+### Functions
+
+#### `char** newmm_segment(const char* text, const char* dict_path, int* token_count)`
+
+Segment Thai text into words using the newmm algorithm.
+
+**Parameters:**
+- `text`: Input text to segment (UTF-8 encoded)
+- `dict_path`: Path to dictionary file (one word per line, UTF-8). Use `NULL` for the built-in default dictionary, which is also used as a fallback when the file cannot be opened
+- `token_count`: Output parameter - receives the number of tokens found
+
+**Returns:**
+- Array of strings (tokens), or `NULL` on error or when `text` is empty (`token_count` is then 0)
+- Caller must free the result using `newmm_free_result()`
+
+**Example:**
+```c
+int count;
+char** tokens = newmm_segment("ฉันไปโรงเรียน", "dict.txt", &count);
+```
+
+#### `void newmm_free_result(char** tokens, int token_count)`
+
+Free memory allocated by `newmm_segment()`.
+
+**Parameters:**
+- `tokens`: Array of tokens returned by `newmm_segment()`
+- `token_count`: Number of tokens in the array
+
+**Example:**
+```c
+newmm_free_result(tokens, count);
+```
+
+## Dictionary Format
+
+Dictionary files should contain one word per line in UTF-8 encoding:
+
+```
+ฉัน
+ไป
+โรงเรียน
+วันนี้
+อากาศ
+ดี
+มาก
+```
+
+A sample dictionary is provided in `data/thai_words.txt`.
+
+## Comparison with PyThaiNLP
+
+The API is designed to be similar to PyThaiNLP's `segment()` function:
+
+**PyThaiNLP (Python):**
+```python
+from pythainlp.tokenize import word_tokenize
+
+text = "ฉันไปโรงเรียน"
+tokens = word_tokenize(text, engine="newmm")
+print(tokens)  # ['ฉัน', 'ไป', 'โรงเรียน']
+```
+
+**CThaiNLP (C):**
+```c
+const char* text = "ฉันไปโรงเรียน";
+int token_count;
+char** tokens = newmm_segment(text, "data/thai_words.txt", &token_count);
+// tokens = ['ฉัน', 'ไป', 'โรงเรียน']
+newmm_free_result(tokens, token_count);
+```
+
+## Algorithm
+
+The newmm (New Maximum Matching) algorithm:
+
+1. **Trie-based Dictionary Lookup**: Uses a trie data structure for efficient prefix matching
+2. **Thai Character Cluster (TCC) Boundaries**: Respects Thai character cluster rules for valid word boundaries
+3. **Maximal Matching**: Finds the longest dictionary word that matches at each position
+4. **Fallback Handling**: Handles non-dictionary words and non-Thai characters (Latin, digits, etc.)
+
+## Project Structure
+
+```
+CThaiNLP/
+├── include/
+│   └── newmm.h           # Public API header
+├── src/
+│   ├── newmm.c           # Main newmm implementation
+│   ├── trie.c            # Trie data structure
+│   ├── trie.h            # Trie header
+│   ├── tcc.c             # Thai Character Cluster
+│   └── tcc.h             # TCC header
+├── examples/
+│   └── example_basic.c   # Basic usage example
+├── tests/
+│   └── test_newmm.c      # Test suite
+├── data/
+│   └── thai_words.txt    # Sample dictionary
+├── Makefile              # Build configuration
+└── README.md             # This file
+```
+
+## Credits
+
+- Original PyThaiNLP implementation: [PyThaiNLP Project](https://github.com/PyThaiNLP/pythainlp)
+- newmm algorithm: Based on work by Korakot Chaovavanich
+- TCC rules: Theeramunkong et al. 2000
+
+## License
+
+Apache License 2.0 (following PyThaiNLP's license)
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit issues or pull requests.
+
+## Future Enhancements
+
+- [ ] Add more tokenization engines (attacut, deepcut, etc.)
+- [ ] Improve performance with optimized data structures
+- [ ] Add part-of-speech tagging
+- [ ] Add named entity recognition
+- [ ] Provide Python bindings (PyPI package)
\ No newline at end of file
diff --git a/data/thai_words.txt b/data/thai_words.txt
new file mode 100644
index 0000000..b802e61
--- /dev/null
+++ b/data/thai_words.txt
@@ -0,0 +1,84 @@
+กา
+ก็
+กัน
+การ
+กิน
+ขอ
+ของ
+ของ
+คน
+คือ
+ครั้ง
+ครับ
+คะ
+ความ
+งาน
+จะ
+จัด
+จาก
+ฉัน
+ช่วย
+ซึ่ง
+ดี
+ตาม
+ตัว
+ตาม
+ถ้า
+ถึง
+ทุก
+ที่
+ธรรมชาติ
+นะ
+นัก
+นั้น
+นี้
+ใน
+บาท
+ผล
+ผู้
+พร้อม
+ภาพ
+มา
+มาก
+มี
+ยัง
+ระบบ
+รับ
+รัฐบาล
+วัน
+วันนี้
+ว่า
+สิ่ง
+สุด
+หรือ
+หลาย
+ห้อง
+ให้
+อยาก
+อยู่
+อาจ
+เขา
+เข้า
+เคย
+เด็ก
+เดือน
+เนื่องจาก
+เมื่อ
+เป็น
+เพื่อ
+เรา
+เราะ
+เล่น
+เอง
+แต่
+แบบ
+แล้ว
+โดย
+โครงการ
+โรงเรียน
+ใช้
+ให้
+ได้
+ไป
+ไม่
+ไว้
diff --git a/examples/example_basic.c b/examples/example_basic.c
new file mode 100644
index 0000000..f04678e
--- /dev/null
+++ b/examples/example_basic.c
@@ -0,0 +1,56 @@
+/**
+ * @file example_basic.c
+ * @brief Basic example of using CThaiNLP newmm tokenizer
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "../include/newmm.h"
+
+int main(int argc, char* argv[]) {
+    /*
+     * Usage: example_basic [text] [dict_path]
+     * A sample sentence is used when no text is given; dict_path stays
+     * NULL when no dictionary argument is given.
+     */
+    const char* text = (argc > 1) ? argv[1] : "ฉันไปโรงเรียน";
+    const char* dict_path = (argc > 2) ? argv[2] : NULL;
+
+    int token_count;
+    char** tokens;
+    int idx;
+
+    printf("Input text: %s\n", text);
+    printf("Segmenting...\n");
+
+    /* Run the tokenizer; a NULL result is reported as a failure */
+    tokens = newmm_segment(text, dict_path, &token_count);
+    if (tokens == NULL) {
+        fprintf(stderr, "Error: Failed to segment text\n");
+        return 1;
+    }
+
+    /* First listing: one token per line, prefixed with its index */
+    printf("Found %d tokens:\n", token_count);
+    idx = 0;
+    while (idx < token_count) {
+        printf("  [%d] %s\n", idx, tokens[idx]);
+        idx++;
+    }
+
+    /* Second listing: the same tokens as a list (like PyThaiNLP) */
+    printf("\nOutput: [");
+    for (idx = 0; idx < token_count; idx++) {
+        /* The separator precedes every token except the first */
+        if (idx > 0) {
+            printf(", ");
+        }
+        printf("'%s'", tokens[idx]);
+    }
+    printf("]\n");
+
+    /* Cleanup: releases the token strings and the array holding them */
+    newmm_free_result(tokens, token_count);
+
+    return 0;
+}
diff --git a/include/newmm.h b/include/newmm.h
new file mode 100644
index 0000000..21a5727
--- /dev/null
+++ b/include/newmm.h
@@ -0,0 +1,46 @@
+/**
+ * @file newmm.h
+ * @brief Thai word segmentation using New Maximum Matching algorithm
+ * 
+ * Dictionary-based maximal matching word segmentation, constrained by
+ * Thai Character Cluster (TCC) boundaries with improved rules.
+ * 
+ * This is a C port of PyThaiNLP's newmm tokenizer.
+ * GitHub: https://github.com/PyThaiNLP/pythainlp
+ * 
+ * @author CThaiNLP
+ * @date 2026
+ */
+
+#ifndef NEWMM_H
+#define NEWMM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Segment Thai text into words using newmm algorithm
+ * 
+ * @param text Input Thai text to be segmented (UTF-8 encoded)
+ * @param dict_path Path to dictionary file (one word per line, UTF-8 encoded)
+ *                  If NULL or the file cannot be opened, a default minimal dictionary is used
+ * @param token_count Output parameter for number of tokens found (0 when NULL is returned)
+ * @return Array of strings (tokens), caller must free using newmm_free_result()
+ *         Returns NULL on error or when text is empty
+ */
+char** newmm_segment(const char* text, const char* dict_path, int* token_count);
+
+/**
+ * @brief Free memory allocated by newmm_segment
+ * 
+ * @param tokens Array of tokens returned by newmm_segment()
+ * @param token_count Number of tokens in the array
+ */
+void newmm_free_result(char** tokens, int token_count);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NEWMM_H */
diff --git a/src/newmm.c b/src/newmm.c
new file mode 100644
index 0000000..e1af166
--- /dev/null
+++ b/src/newmm.c
@@ -0,0 +1,250 @@
+/**
+ * @file newmm.c
+ * @brief New Maximum Matching word segmentation implementation
+ */
+
+#include "../include/newmm.h"
+#include "trie.h"
+#include "tcc.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <ctype.h>
+
+#define MAX_GRAPH_SIZE 50
+#define MAX_TOKENS 10000
+
+/* Graph structure for BFS */
+typedef struct {
+    int* edges;
+    int* edge_counts;
+    int* capacities;
+    int size;
+} Graph;
+
+/* Helper: Check if position is in the valid positions set. The scan stops
+ * at the first entry >= pos, so the array is expected in ascending order. */
+static bool is_valid_pos(int pos, int* valid_pos, int num_valid) {
+    int idx = 0;
+
+    while (idx < num_valid && valid_pos[idx] < pos) idx++;
+    return idx < num_valid && valid_pos[idx] == pos;
+}
+
+/*
+ * Helper: Check if character is non-Thai.
+ *
+ * Everything outside the Thai Unicode block (U+0E00..U+0E7F) counts as
+ * non-Thai. ASCII letters, digits and whitespace are named explicitly, but
+ * they lie outside the Thai block anyway.
+ */
+static bool is_non_thai_char(int codepoint) {
+    bool ascii_text = (codepoint >= 'a' && codepoint <= 'z') ||
+                      (codepoint >= 'A' && codepoint <= 'Z') ||
+                      (codepoint >= '0' && codepoint <= '9') ||
+                      codepoint == ' ' || codepoint == '\t' ||
+                      codepoint == '\r' || codepoint == '\n';
+    bool in_thai_block = (codepoint >= 0x0E00 && codepoint <= 0x0E7F);
+
+    return ascii_text || !in_thai_block;
+}
+
+/* UTF-8 helper: sequence length implied by a lead byte (1 if invalid) */
+static int utf8_char_len(unsigned char c) {
+    if ((c & 0x80) == 0) return 1;
+    if ((c & 0xE0) == 0xC0) return 2;
+    if ((c & 0xF0) == 0xE0) return 3;
+    if ((c & 0xF8) == 0xF0) return 4;
+    return 1;
+}
+
+/*
+ * Decode the code point at str and store its byte length in *byte_len.
+ * A sequence with a missing or malformed continuation byte (for example one
+ * cut short by the terminating NUL) is consumed as a single raw byte, so
+ * decoding never reads or advances past the end of the string.
+ */
+static int get_utf8_codepoint(const char* str, int* byte_len) {
+    static const unsigned char lead_mask[5] = { 0, 0xFF, 0x1F, 0x0F, 0x07 };
+    const unsigned char* s = (const unsigned char*)str;
+    int len = utf8_char_len(s[0]);
+    int codepoint = s[0] & lead_mask[len];
+
+    for (int i = 1; i < len; i++) {
+        if ((s[i] & 0xC0) != 0x80) { *byte_len = 1; return s[0]; }
+        codepoint = (codepoint << 6) | (s[i] & 0x3F);
+    }
+    *byte_len = len;
+    return codepoint;
+}
+
+/* Helper: copy text[start, end) into a new NUL-terminated heap string */
+static char* substring(const char* text, int start, int end) {
+    int span = end - start;
+    char* copy = (char*)malloc(span + 1);
+
+    if (copy == NULL) return NULL;
+    memcpy(copy, text + start, span);
+    copy[span] = '\0';
+    return copy;
+}
+
+/*
+ * Simplified newmm segmentation.
+ *
+ * At each position the longest dictionary word ending on a TCC boundary
+ * becomes a token; failing that, a run of same-class non-Thai characters
+ * or the text up to the next TCC boundary does. Returns the token count and
+ * stores a heap array of heap strings in *tokens. For empty input or on
+ * allocation failure returns 0 with *tokens == NULL (partial output freed).
+ */
+static int segment_text(const char* text, Trie* trie, char*** tokens) {
+    int text_len = strlen(text);
+    int* valid_pos = NULL;
+    char** out = NULL;
+    int capacity = 0;
+    int token_count = 0;
+    int pos = 0;
+    bool failed = false;
+
+    *tokens = NULL;
+    if (text_len == 0) return 0;
+
+    /* Get valid TCC positions. When tcc_pos() returns 0 it has allocated
+     * nothing or already freed its array, so valid_pos is not freed here. */
+    int num_valid = tcc_pos(text, &valid_pos);
+    if (num_valid <= 0) return 0;
+
+    while (pos < text_len) {
+        int end = pos;
+        /* Try to find longest matching word from dictionary */
+        char** prefixes = NULL;
+        int* lengths = NULL;
+        int num_prefixes = trie_prefixes(trie, text + pos, &prefixes, &lengths);
+
+        /* Keep the longest prefix ending on a TCC boundary; free as we go */
+        for (int i = 0; i < num_prefixes; i++) {
+            int cand = pos + lengths[i];
+            if (cand > end && is_valid_pos(cand, valid_pos, num_valid)) end = cand;
+            free(prefixes[i]);
+        }
+        free(prefixes);
+        free(lengths);
+
+        if (end == pos) {
+            /* No dictionary word here: fall back on the character class */
+            int byte_len;
+            int cp = get_utf8_codepoint(text + pos, &byte_len);
+            if (is_non_thai_char(cp)) {
+                /* Take all consecutive non-Thai characters of same type */
+                bool is_space = (cp == ' ' || cp == '\t');
+                bool is_alpha = ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z'));
+                bool is_digit = (cp >= '0' && cp <= '9');
+                end = pos + byte_len;
+                while (end < text_len) {
+                    int next_cp = get_utf8_codepoint(text + end, &byte_len);
+                    bool match = false;
+
+                    if (is_space && (next_cp == ' ' || next_cp == '\t')) match = true;
+                    else if (is_alpha && ((next_cp >= 'a' && next_cp <= 'z') || (next_cp >= 'A' && next_cp <= 'Z'))) match = true;
+                    else if (is_digit && ((next_cp >= '0' && next_cp <= '9') || next_cp == '.' || next_cp == ',')) match = true;
+
+                    if (!match) break;
+                    end += byte_len;
+                }
+            } else {
+                /* Thai character not in dictionary - advance to next TCC boundary */
+                end = text_len; /* Default to end of text */
+                for (int i = 0; i < num_valid; i++) {
+                    if (valid_pos[i] > pos) {
+                        end = valid_pos[i];
+                        break;
+                    }
+                }
+            }
+        }
+
+        /* A malformed trailing sequence must not carry us past the text */
+        if (end > text_len) end = text_len;
+
+        /* Grow the output array on demand instead of capping the result */
+        if (token_count >= capacity) {
+            int new_capacity = (capacity == 0) ? 64 : capacity * 2;
+            char** grown = (char**)realloc(out, new_capacity * sizeof(char*));
+            if (!grown) { failed = true; break; }
+            out = grown;
+            capacity = new_capacity;
+        }
+        out[token_count] = substring(text, pos, end);
+        if (!out[token_count]) { failed = true; break; }
+        token_count++;
+        pos = end;
+    }
+
+    free(valid_pos);
+    if (failed) {
+        for (int i = 0; i < token_count; i++) free(out[i]);
+        free(out);
+        return 0;
+    }
+    *tokens = out;
+    return token_count;
+}
+
+/* Default minimal Thai dictionary */
+static const char* default_words[] = {
+    "ไป", "มา", "ใน", "ที่", "และ", "หรือ", "คือ", "เป็น", "มี", "ได้",
+    "จะ", "ไม่", "ของ", "กับ", "ก็", "ให้", "ถ้า", "แล้ว", "เมื่อ", "ซึ่ง",
+    "นี้", "นั้น", "อยู่", "เพื่อ", "การ", "ความ", "จาก", "โดย", "อย่าง", "ถึง",
+    "ว่า", "เอง", "ทุก", "แต่", "ตาม", "นัก", "ยัง", "ผล", "ผู้", "คน",
+    "วัน", "ปี", "เดือน", "ครั้ง", "ตัว", "คน", "สิ่ง", "งาน", "ข้อ", "รับ",
+    NULL
+};
+
+/*
+ * Public entry point: tokenize text with a freshly built dictionary trie.
+ * The trie is filled from dict_path when that file loads, otherwise from
+ * default_words. Returns NULL for NULL arguments, empty text, a failed trie
+ * allocation, or when segment_text() yields no token array.
+ */
+char** newmm_segment(const char* text, const char* dict_path, int* token_count) {
+    char** tokens = NULL;
+    Trie* trie;
+    bool use_defaults;
+
+    if (text == NULL || token_count == NULL) return NULL;
+    *token_count = 0;
+
+    /* Empty text */
+    if (text[0] == '\0') {
+        return NULL;
+    }
+
+    trie = trie_create();
+    if (trie == NULL) {
+        return NULL;
+    }
+
+    /* A missing path and a failed load both fall back to the defaults */
+    use_defaults = (dict_path == NULL) || (trie_load_dict(trie, dict_path) < 0);
+    if (use_defaults) {
+        for (const char** word = default_words; *word != NULL; word++) {
+            trie_add(trie, *word);
+        }
+    }
+
+    /* Segment, then drop the trie: tokens are separately allocated copies */
+    *token_count = segment_text(text, trie, &tokens);
+    trie_free(trie);
+    return tokens;
+}
+
+/* Release a token array from newmm_segment(); a NULL array is ignored */
+void newmm_free_result(char** tokens, int token_count) {
+    if (tokens != NULL) {
+        /* Each token is its own allocation, as is the array itself */
+        while (token_count-- > 0) free(tokens[token_count]);
+        free(tokens);
+    }
+}
diff --git a/src/tcc.c b/src/tcc.c
new file mode 100644
index 0000000..905e282
--- /dev/null
+++ b/src/tcc.c
@@ -0,0 +1,163 @@
+/**
+ * @file tcc.c
+ * @brief Thai Character Cluster (TCC) implementation
+ * 
+ * Based on rules proposed by Theeramunkong et al. 2000
+ * and improved rules used in PyThaiNLP's newmm
+ */
+
+#include "tcc.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+
+/* Thai Unicode ranges */
+#define THAI_START 0x0E00
+#define THAI_END   0x0E7F
+
+/* Thai character classes */
+#define is_thai_consonant(c) ((c) >= 0x0E01 && (c) <= 0x0E2E)
+#define is_thai_vowel_above(c) ((c) >= 0x0E34 && (c) <= 0x0E37)
+#define is_thai_vowel_below(c) ((c) == 0x0E38 || (c) == 0x0E39)
+#define is_thai_tone(c) ((c) >= 0x0E48 && (c) <= 0x0E4B)
+#define is_thai_sign(c) ((c) == 0x0E4C || (c) == 0x0E4D || (c) == 0x0E4E)
+#define is_thai_vowel_follow(c) ((c) >= 0x0E30 && (c) <= 0x0E33)
+#define is_thai_vowel_lead(c) ((c) >= 0x0E40 && (c) <= 0x0E44)
+
+/* UTF-8 helper functions: sequence length implied by a lead byte (1 if invalid) */
+static int utf8_char_len(unsigned char c) {
+    if ((c & 0x80) == 0) return 1;
+    if ((c & 0xE0) == 0xC0) return 2;
+    if ((c & 0xF0) == 0xE0) return 3;
+    if ((c & 0xF8) == 0xF0) return 4;
+    return 1;
+}
+
+/*
+ * Decode the code point at str and store its byte length in *byte_len.
+ * A sequence with a missing or malformed continuation byte (for example one
+ * cut short by the terminating NUL) is consumed as a single raw byte, so
+ * decoding never reads or advances past the end of the string.
+ */
+static int get_utf8_codepoint(const char* str, int* byte_len) {
+    static const unsigned char lead_mask[5] = { 0, 0xFF, 0x1F, 0x0F, 0x07 };
+    const unsigned char* s = (const unsigned char*)str;
+    int len = utf8_char_len(s[0]);
+    int codepoint = s[0] & lead_mask[len];
+
+    for (int i = 1; i < len; i++) {
+        if ((s[i] & 0xC0) != 0x80) { *byte_len = 1; return s[0]; }
+        codepoint = (codepoint << 6) | (s[i] & 0x3F);
+    }
+    *byte_len = len;
+    return codepoint;
+}
+
+/* Simplified TCC detection - returns the byte length of the character cluster starting at text */
+static int get_tcc_length(const char* text) {
+    int byte_len;
+    int cp = get_utf8_codepoint(text, &byte_len);
+    int total_len = byte_len;
+    const char* ptr = text + byte_len;
+    
+    /* Leading vowel (เ, แ, โ, ใ, ไ) */
+    if (is_thai_vowel_lead(cp)) {
+        /* Extended only when a consonant follows; otherwise the vowel forms a cluster alone */
+        if (*ptr) {
+            cp = get_utf8_codepoint(ptr, &byte_len);
+            if (is_thai_consonant(cp)) {
+                total_len += byte_len;
+                ptr += byte_len;
+                
+                /* Optional: one more consonant (NOTE(review): absorbed whenever present - confirm intended) */
+                if (*ptr) {
+                    int next_cp = get_utf8_codepoint(ptr, &byte_len);
+                    if (is_thai_consonant(next_cp)) {
+                        total_len += byte_len;
+                        ptr += byte_len;
+                    }
+                }
+                
+                /* Optional: any run of tone marks, signs, and above/below vowels */
+                while (*ptr) {
+                    int next_cp = get_utf8_codepoint(ptr, &byte_len);
+                    if (is_thai_tone(next_cp) || is_thai_sign(next_cp) || 
+                        is_thai_vowel_above(next_cp) || is_thai_vowel_below(next_cp)) {
+                        total_len += byte_len;
+                        ptr += byte_len;
+                    } else {
+                        break;
+                    }
+                }
+            }
+        }
+        return total_len;
+    }
+    
+    /* Consonant-based cluster */
+    if (is_thai_consonant(cp)) {
+        /* Optional: additional consonant (NOTE(review): absorbed whenever present - confirm intended) */
+        if (*ptr) {
+            int next_cp = get_utf8_codepoint(ptr, &byte_len);
+            if (is_thai_consonant(next_cp)) {
+                total_len += byte_len;
+                ptr += byte_len;
+            }
+        }
+        
+        /* Optional: any run of tone marks, signs, above/below vowels, and following vowels (U+0E30..U+0E33) */
+        while (*ptr) {
+            int next_cp = get_utf8_codepoint(ptr, &byte_len);
+            if (is_thai_tone(next_cp) || is_thai_sign(next_cp) || 
+                is_thai_vowel_above(next_cp) || is_thai_vowel_below(next_cp) ||
+                is_thai_vowel_follow(next_cp)) {
+                total_len += byte_len;
+                ptr += byte_len;
+            } else {
+                break;
+            }
+        }
+        
+        return total_len;
+    }
+    
+    /* Single character (non-Thai or standalone): byte_len still holds the first code point's length */
+    return byte_len;
+}
+
+/*
+ * Collect the byte offset at which each character cluster of text ends.
+ * On success *positions is a heap array the caller must free and the number
+ * of entries is returned. On failure (NULL/empty text, out of memory) 0 is
+ * returned and *positions is NULL, so the caller never sees a stale pointer.
+ */
+int tcc_pos(const char* text, int** positions) {
+    if (!positions) return 0;
+    *positions = NULL;
+    if (!text || text[0] == '\0') return 0;
+
+    /* Allocate initial array */
+    int capacity = 100;
+    int* result = (int*)malloc(capacity * sizeof(int));
+    if (!result) return 0;
+
+    int count = 0;
+    int byte_pos = 0;
+    while (text[byte_pos]) {
+        byte_pos += get_tcc_length(text + byte_pos);
+        /* Grow before appending; on failure release the old array */
+        if (count >= capacity) {
+            capacity *= 2;
+            int* grown = (int*)realloc(result, capacity * sizeof(int));
+            if (!grown) {
+                free(result);
+                return 0;
+            }
+            result = grown;
+        }
+        result[count++] = byte_pos;
+    }
+
+    *positions = result;
+    return count;
+}
diff --git a/src/tcc.h b/src/tcc.h
new file mode 100644
index 0000000..be8bf75
--- /dev/null
+++ b/src/tcc.h
@@ -0,0 +1,23 @@
+/**
+ * @file tcc.h
+ * @brief Thai Character Cluster (TCC) tokenization
+ * 
+ * Implementation of tokenizer according to Thai Character Clusters (TCCs)
+ * rules proposed by Theeramunkong et al. 2000.
+ */
+
+#ifndef TCC_H
+#define TCC_H
+
+#include <stdbool.h>
+
+/**
+ * @brief Get valid Thai Character Cluster breaking positions
+ * 
+ * @param text Input Thai text (UTF-8)
+ * @param positions Output array of byte positions (caller must free)
+ * @return Number of positions found
+ */
+int tcc_pos(const char* text, int** positions);
+
+#endif /* TCC_H */
diff --git a/src/trie.c b/src/trie.c
new file mode 100644
index 0000000..d978fba
--- /dev/null
+++ b/src/trie.c
@@ -0,0 +1,266 @@
+/**
+ * @file trie.c
+ * @brief Trie data structure implementation
+ */
+
+#include "trie.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#define INITIAL_CAPACITY 8
+
+/* Helper function: UTF-8 sequence length implied by a lead byte */
+static int utf8_char_len(unsigned char c) {
+    if ((c & 0x80) == 0) return 1;
+    if ((c & 0xE0) == 0xC0) return 2;
+    if ((c & 0xF0) == 0xE0) return 3;
+    if ((c & 0xF8) == 0xF0) return 4;
+    return 1; /* Invalid UTF-8 */
+}
+
+/*
+ * Get UTF-8 codepoint from string and store its byte length in *byte_len.
+ * A sequence with a missing or malformed continuation byte (for example one
+ * cut short by the terminating NUL) is consumed as a single raw byte, so
+ * decoding never reads or advances past the end of the string.
+ */
+static int get_utf8_codepoint(const char* str, int* byte_len) {
+    static const unsigned char lead_mask[5] = { 0, 0xFF, 0x1F, 0x0F, 0x07 };
+    const unsigned char* s = (const unsigned char*)str;
+    int len = utf8_char_len(s[0]);
+    int codepoint = s[0] & lead_mask[len];
+
+    for (int i = 1; i < len; i++) {
+        if ((s[i] & 0xC0) != 0x80) { *byte_len = 1; return s[0]; }
+        codepoint = (codepoint << 6) | (s[i] & 0x3F);
+    }
+
+    *byte_len = len;
+    return codepoint;
+}
+
+/* Create a new trie node: not a word end, no children, zero capacity */
+static TrieNode* trie_node_create(void) {
+    TrieNode* fresh = (TrieNode*)calloc(1, sizeof(TrieNode));
+    if (fresh != NULL) {
+        /* calloc zero-fills; the fields are still set explicitly */
+        fresh->capacity = 0;
+        fresh->num_children = 0;
+        fresh->child_chars = NULL;
+        fresh->children = NULL;
+        fresh->is_end = false;
+    }
+    return fresh;
+}
+
+/* Recursively release a node: its subtrees first, then its own arrays */
+static void trie_node_free(TrieNode* node) {
+    int remaining;
+
+    if (node == NULL) return;
+    for (remaining = node->num_children; remaining > 0; remaining--) {
+        trie_node_free(node->children[remaining - 1]);
+    }
+    free(node->child_chars);
+    free(node->children);
+    free(node);
+}
+
+/* Find child node by codepoint; NULL when node has no such child */
+static TrieNode* trie_node_get_child(TrieNode* node, int codepoint) {
+    int slot = 0;
+    /* Linear scan: child_chars[i] labels the edge to children[i] */
+    while (slot < node->num_children && node->child_chars[slot] != codepoint) {
+        slot++;
+    }
+    return (slot < node->num_children) ? node->children[slot] : NULL;
+}
+
+/* Append a new child for codepoint (no duplicate check here); returns it, or NULL on allocation failure */
+static TrieNode* trie_node_add_child(TrieNode* node, int codepoint) {
+    /* Grow both parallel arrays when full: INITIAL_CAPACITY first, then doubling */
+    if (node->num_children >= node->capacity) {
+        int new_capacity = node->capacity == 0 ? INITIAL_CAPACITY : node->capacity * 2;
+        
+        /* Grow children first; if this fails the node is still unchanged */
+        TrieNode** new_children = (TrieNode**)realloc(node->children, 
+                                                       new_capacity * sizeof(TrieNode*));
+        if (!new_children) return NULL;
+        
+        int* new_chars = (int*)realloc(node->child_chars, new_capacity * sizeof(int));
+        if (!new_chars) {
+            /* children was already moved/grown by the successful realloc above, */
+            /* so the pointer stored in the node may now be stale. Store the new */
+            /* pointer to keep the node valid; capacity stays at its old value. */
+            node->children = new_children;
+            /* child_chars is untouched by the failed realloc, so the node stays */
+            /* consistent with its old capacity and existing children. */
+            return NULL;
+        }
+        
+        /* Both allocations succeeded, update pointers */
+        node->children = new_children;
+        node->child_chars = new_chars;
+        node->capacity = new_capacity;
+    }
+    
+    /* Create new child; if this fails the (possibly enlarged) arrays are kept as they are */
+    TrieNode* child = trie_node_create();
+    if (!child) return NULL;
+    
+    node->children[node->num_children] = child;
+    node->child_chars[node->num_children] = codepoint;
+    node->num_children++;
+    
+    return child;
+}
+
+/* Allocate an empty trie (root node only); NULL if allocation fails */
+Trie* trie_create(void) {
+    Trie* fresh = (Trie*)malloc(sizeof(Trie));
+    if (fresh != NULL) {
+        fresh->num_words = 0;
+        fresh->root = trie_node_create();
+        if (fresh->root == NULL) {
+            free(fresh);
+            fresh = NULL;
+        }
+    }
+    return fresh;
+}
+
+/* Insert word (minus surrounding ASCII whitespace) into the trie, one node
+ * per code point; num_words grows only when the word was not yet marked. */
+void trie_add(Trie* trie, const char* word) {
+    const char* first;
+    const char* last;
+    TrieNode* node;
+    if (trie == NULL || word == NULL || word[0] == '\0') return;
+
+    /* Skip leading whitespace */
+    first = word;
+    while (*first == ' ' || *first == '\t' || *first == '\r' || *first == '\n') {
+        first++;
+    }
+    if (*first == '\0') return;
+
+    /* Back up over trailing whitespace; last ends one past the final byte */
+    last = first + strlen(first);
+    while (last > first && (last[-1] == ' ' || last[-1] == '\t' ||
+                            last[-1] == '\r' || last[-1] == '\n')) {
+        last--;
+    }
+    if (last == first) return;
+
+    /* Walk down the trie, creating missing nodes along the way */
+    node = trie->root;
+    while (first < last) {
+        int step;
+        int codepoint = get_utf8_codepoint(first, &step);
+        TrieNode* next = trie_node_get_child(node, codepoint);
+        if (next == NULL) next = trie_node_add_child(node, codepoint);
+        if (next == NULL) return; /* Out of memory */
+        node = next;
+        first += step;
+    }
+
+    if (node->is_end) return;  /* already present */
+    node->is_end = true;
+    trie->num_words++;
+}
+
+/*
+ * Load a dictionary file, one word per line, into the trie.
+ * Returns the number of non-empty lines handed to trie_add(), or -1 when
+ * an argument is NULL or the file cannot be opened.
+ */
+int trie_load_dict(Trie* trie, const char* dict_path) {
+    char line[1024];
+    int loaded = 0;
+    FILE* dict;
+
+    if (trie == NULL || dict_path == NULL) return -1;
+
+    dict = fopen(dict_path, "r");
+    if (dict == NULL) return -1;
+
+    while (fgets(line, sizeof(line), dict) != NULL) {
+        int end = strlen(line);
+
+        /* Drop one trailing LF, then one trailing CR (handles CRLF files) */
+        if (end > 0 && line[end - 1] == '\n') line[--end] = '\0';
+        if (end > 0 && line[end - 1] == '\r') line[--end] = '\0';
+
+        if (end == 0) continue;  /* blank line */
+        trie_add(trie, line);
+        loaded++;
+    }
+
+    fclose(dict);
+    return loaded;
+}
+
+/* Collect every dictionary word that is a prefix of text. The caller frees
+ * each (*prefixes)[i], then *prefixes and *lengths; lengths are in bytes.
+ * Returns the number of prefixes stored; both arrays are NULL if the initial allocation fails. */
+int trie_prefixes(Trie* trie, const char* text, char*** prefixes, int** lengths) {
+    if (!trie || !text || !prefixes || !lengths) return 0;
+
+    int max_prefixes = 100; /* Initial allocation */
+    *prefixes = (char**)malloc(max_prefixes * sizeof(char*));
+    *lengths = (int*)malloc(max_prefixes * sizeof(int));
+    if (!*prefixes || !*lengths) {
+        /* free(NULL) is a no-op, so this releases whichever one succeeded */
+        free(*prefixes);
+        free(*lengths);
+        *prefixes = NULL;
+        *lengths = NULL;
+        return 0;
+    }
+
+    int count = 0;
+    TrieNode* current = trie->root;
+    int byte_pos = 0;
+
+    while (text[byte_pos]) {
+        int byte_len;
+        int codepoint = get_utf8_codepoint(text + byte_pos, &byte_len);
+
+        current = trie_node_get_child(current, codepoint);
+        if (!current) break;
+        byte_pos += byte_len;
+        if (!current->is_end) continue;
+
+        if (count >= max_prefixes) {
+            /* Publish each array as soon as its realloc succeeds: the old
+             * pointer is invalid from that moment, even if the second
+             * realloc then fails. */
+            char** new_prefixes = (char**)realloc(*prefixes, 2 * max_prefixes * sizeof(char*));
+            if (!new_prefixes) break;
+            *prefixes = new_prefixes;
+            int* new_lengths = (int*)realloc(*lengths, 2 * max_prefixes * sizeof(int));
+            if (!new_lengths) break;
+            *lengths = new_lengths;
+            max_prefixes *= 2;
+        }
+
+        /* Copy prefix; a failed copy is skipped rather than stored as NULL */
+        char* copy = (char*)malloc(byte_pos + 1);
+        if (!copy) continue;
+        memcpy(copy, text, byte_pos);
+        copy[byte_pos] = '\0';
+        (*prefixes)[count] = copy;
+        (*lengths)[count] = byte_pos;
+        count++;
+    }
+
+    return count;
+}
+
+void trie_free(Trie* trie) {
+    if (trie != NULL) {  /* NULL is accepted and ignored */
+        trie_node_free(trie->root);
+        free(trie);
+    }
+}
diff --git a/src/trie.h b/src/trie.h
new file mode 100644
index 0000000..8c30534
--- /dev/null
+++ b/src/trie.h
@@ -0,0 +1,57 @@
+/**
+ * @file trie.h
+ * @brief Trie data structure for efficient dictionary lookup
+ * 
+ * Internal header for trie implementation.
+ */
+
+#ifndef TRIE_H
+#define TRIE_H
+
+#include <stdbool.h>
+
+typedef struct TrieNode {
+    bool is_end;  /* set on nodes where trie_prefixes() records a matched prefix */
+    struct TrieNode** children;  /* child node pointers; presumably parallel to child_chars - confirm in trie.c */
+    int* child_chars;  /* code points of the children (trie_prefixes() looks children up by the value get_utf8_codepoint() returns) */
+    int num_children;  /* presumably the number of entries in use - confirm in trie.c */
+    int capacity;  /* presumably the number of allocated slots - confirm in trie.c */
+} TrieNode;
+
+typedef struct Trie {
+    TrieNode* root;  /* node where trie_prefixes() starts its walk; released through trie_node_free() by trie_free() */
+    int num_words;  /* presumably the number of words added so far - confirm in trie.c */
+} Trie;
+
+/**
+ * @brief Create a new empty trie (pair with trie_free(); NOTE(review): presumably returns NULL on allocation failure - confirm)
+ */
+Trie* trie_create(void);
+
+/**
+ * @brief Add a word to the trie (returns nothing, so a failed insertion is not reported to the caller)
+ */
+void trie_add(Trie* trie, const char* word);
+
+/**
+ * @brief Load words from a dictionary file (NOTE(review): the meaning of the int result is not documented here - confirm in trie.c)
+ */
+int trie_load_dict(Trie* trie, const char* dict_path);
+
+/**
+ * @brief Collect the dictionary words that are prefixes of text
+ * @note Once the arguments pass the NULL check, both arrays are allocated up front, so a result of 0 can still leave arrays to free.
+ * @param trie The trie structure
+ * @param text Input text (UTF-8)
+ * @param prefixes Output: heap-allocated array of heap-allocated, NUL-terminated prefix copies (caller frees each string, then the array)
+ * @param lengths Output: heap-allocated array of prefix byte lengths, parallel to prefixes (caller frees it)
+ * @return Number of prefixes found; 0 if an argument is NULL, or if the initial allocation fails (both outputs are then set to NULL)
+ */
+int trie_prefixes(Trie* trie, const char* text, char*** prefixes, int** lengths);
+
+/**
+ * @brief Free the nodes of the trie and the trie itself (a NULL trie is ignored)
+ */
+void trie_free(Trie* trie);
+
+#endif /* TRIE_H */
diff --git a/tests/test_newmm.c b/tests/test_newmm.c
new file mode 100644
index 0000000..5f9d7bd
--- /dev/null
+++ b/tests/test_newmm.c
@@ -0,0 +1,154 @@
+/**
+ * @file test_newmm.c
+ * @brief Test program for newmm tokenizer
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../include/newmm.h"
+
+typedef struct {  /* One test case; field names match the parameters of run_test() */
+    const char* text;         /* presumably the input to segment - confirm */
+    const char* expected;     /* presumably the expected formatted token list - confirm */
+    const char* description;  /* presumably the label printed for the case - confirm */
+} TestCase;
+
+static int test_count = 0;   /* incremented at the start of every run_test() call */
+static int test_passed = 0;  /* incremented by run_test() each time a case passes */
+
+#define INITIAL_OUTPUT_SIZE 1024  /* initial byte size of the output buffer in run_test() */
+#define TOKEN_OVERHEAD 4  /* Per token: two quote characters plus a comma and a space */
+
+/* Run one tokenizer case: segment text with newmm_segment(), format the tokens
+ * as a bracketed list, compare it with expected via strcmp(), print a verdict.
+ * Always increments test_count; increments test_passed only on a match.
+ * dict_path goes to newmm_segment() unchanged; description labels the output. */
+void run_test(const char* text, const char* dict_path, const char* expected, const char* description) {
+    test_count++;
+    printf("\n[Test %d] %s\n", test_count, description);
+    printf("Input: %s\n", text);
+
+    /* Sentinel: if newmm_segment() fails without writing the count, an
+     * indeterminate value must not be mistaken for the empty result. */
+    int token_count = -1;
+    char** tokens = newmm_segment(text, dict_path, &token_count);
+
+    if (!tokens) {
+        if (token_count != 0) {
+            printf("❌ FAIL: Segmentation failed\n");
+            return;
+        }
+        /* NULL plus a count of 0 is the empty result (the empty-string case) */
+        printf("Output: []\n");
+        printf("Expected: %s\n", expected);
+        if (strcmp(expected, "[]") == 0) {
+            printf("✓ PASS\n");
+            test_passed++;
+        } else {
+            printf("❌ FAIL\n");
+        }
+        return;
+    }
+
+    /* Build the output string in a buffer that grows on demand */
+    size_t output_size = INITIAL_OUTPUT_SIZE;
+    char* output = (char*)malloc(output_size);
+    if (!output) {
+        printf("❌ FAIL: Memory allocation failed\n");
+        newmm_free_result(tokens, token_count);
+        return;
+    }
+
+    strcpy(output, "[");
+    for (int i = 0; i < token_count; i++) {
+        size_t needed = strlen(output) + strlen(tokens[i]) + TOKEN_OVERHEAD + 1; /* +1 for the terminator */
+        if (needed >= output_size) {
+            output_size = needed * 2;
+            char* new_output = (char*)realloc(output, output_size);
+            if (!new_output) {
+                printf("❌ FAIL: Memory reallocation failed\n");
+                free(output);  /* realloc failure leaves the old buffer valid */
+                newmm_free_result(tokens, token_count);
+                return;
+            }
+            output = new_output;
+        }
+        strcat(output, "'");
+        strcat(output, tokens[i]);
+        strcat(output, "'");
+        if (i < token_count - 1) strcat(output, ", ");
+    }
+    strcat(output, "]");
+
+    printf("Output: %s\n", output);
+    printf("Expected: %s\n", expected);
+    if (strcmp(output, expected) == 0) {
+        printf("✓ PASS\n");
+        test_passed++;
+    } else {
+        printf("❌ FAIL\n");
+    }
+
+    free(output);
+    newmm_free_result(tokens, token_count);
+}
+
+/* Entry point: runs seven cases against the dictionary file, one case with a
+ * NULL dictionary path, prints a summary, and returns 0 only if all passed. */
+int main() {
+    /* Cases that share the dictionary file, executed in listed order. */
+    static const TestCase file_cases[] = {
+        /* Test 1: Basic Thai sentence */
+        { "ฉันไปโรงเรียน",
+          "['ฉัน', 'ไป', 'โรงเรียน']",
+          "Basic Thai sentence" },
+        /* Test 2: Thai sentence with common words */
+        { "วันนี้อากาศดีมาก",
+          "['วันนี้', 'อา', 'กา', 'ศดี', 'มาก']",
+          "Thai sentence with partial dictionary match" },
+        /* Test 3: English text */
+        { "hello world",
+          "['hello', ' ', 'world']",
+          "English text" },
+        /* Test 4: Numbers */
+        { "123", "['123']", "Numbers only" },
+        /* Test 5: Mixed content */
+        { "ไป ABC 123",
+          "['ไป', ' ', 'ABC', ' ', '123']",
+          "Mixed Thai, English, and numbers" },
+        /* Test 6: Empty string */
+        { "", "[]", "Empty string" },
+        /* Test 7: Single Thai word */
+        { "ไป", "['ไป']", "Single Thai word" }
+    };
+    const size_t file_case_total = sizeof(file_cases) / sizeof(file_cases[0]);
+    const char* dict = "data/thai_words.txt";
+
+    printf("=== CThaiNLP newmm Tokenizer Test Suite ===\n");
+
+    for (size_t idx = 0; idx < file_case_total; ++idx) {
+        const TestCase* tc = &file_cases[idx];
+        run_test(tc->text, dict, tc->expected, tc->description);
+    }
+
+    /* Test 8: Using default dictionary */
+    run_test("ฉันไปโรงเรียน", NULL,
+             "['ฉั', 'น', 'ไป', 'โรง', 'เรี', 'ยน']",
+             "Default dictionary (limited words)");
+
+    /* Summary */
+    const int failed = test_count - test_passed;
+    printf("\n=== Test Summary ===\n");
+    printf("Total tests: %d\n", test_count);
+    printf("Passed: %d\n", test_passed);
+    printf("Failed: %d\n", failed);
+
+    /* A non-zero exit status signals that at least one case failed. */
+    if (failed != 0) {
+        printf("\n❌ Some tests failed\n");
+        return 1;
+    }
+
+    printf("\n✓ All tests passed!\n");
+    return 0;
+}