
Commit 91477ae (merge commit, 2 parents: 86b91be + 8c328c2)
Author: HugoFara

feat(parsers): simple rework of the parsers system to use Python. It includes jieba for Chinese as well.

14 files changed: +2227 additions, -4 deletions (5 of the changed files are shown below)

Dockerfile

Lines changed: 23 additions & 1 deletion

@@ -9,11 +9,33 @@ LABEL org.opencontainers.image.license=Unlicense
 LABEL org.opencontainers.image.source="https://github.com/HugoFara/lwt"


-# creating config file php.ini
+# Creating config file php.ini
 RUN mv "$PHP_INI_DIR/php.ini-production" "$PHP_INI_DIR/php.ini" && \
     echo 'mysqli.allow_local_infile = On' >> "$PHP_INI_DIR/php.ini"; \
     docker-php-ext-install pdo pdo_mysql mysqli

+# Install Python and MeCab for NLP parsing
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    python3-venv \
+    mecab \
+    mecab-ipadic-utf8 \
+    libmecab-dev \
+    && rm -rf /var/lib/apt/lists/* \
+    && mkdir -p /usr/local/etc \
+    && ln -s /etc/mecabrc /usr/local/etc/mecabrc
+
+# Create Python virtual environment and install NLP packages
+RUN python3 -m venv /opt/lwt-parsers && \
+    /opt/lwt-parsers/bin/pip install --no-cache-dir \
+    "jieba>=0.42.1" \
+    "mecab-python3>=1.0.6"
+
+# Copy parser scripts first (for better caching)
+COPY parsers/ /opt/lwt/parsers/
+
+# Copy application files
 COPY . /var/www/html/lwt

 # creating .env configuration file

INSTALL.sh

Lines changed: 54 additions & 0 deletions

@@ -68,6 +68,53 @@ enable_php_extensions() {
     fi
 }

+# Install Python NLP parsers (optional)
+install_python_parsers() {
+    info "Installing Python NLP parsers for CJK language support..."
+
+    local python_packages="python3 python3-pip python3-venv"
+    local mecab_packages=""
+
+    # Detect MeCab packages based on package manager
+    case "$PKG_MANAGER" in
+        apt-get)
+            mecab_packages="mecab mecab-ipadic-utf8"
+            ;;
+        dnf|yum)
+            mecab_packages="mecab mecab-ipadic"
+            ;;
+        pacman)
+            mecab_packages="mecab mecab-ipadic"
+            ;;
+    esac
+
+    info "Installing Python and MeCab system packages..."
+    $PKG_INSTALL $python_packages $mecab_packages
+
+    info "Creating Python virtual environment..."
+    sudo python3 -m venv /opt/lwt-parsers
+
+    info "Installing Python NLP packages (jieba, mecab-python3)..."
+    sudo /opt/lwt-parsers/bin/pip install --no-cache-dir jieba mecab-python3
+
+    info "Python NLP parsers installed successfully"
+}
+
+# Copy parser scripts to installation location
+deploy_parser_scripts() {
+    local dest="$1"
+
+    if [ -d "parsers" ]; then
+        info "Copying parser scripts to /opt/lwt/parsers/..."
+        sudo mkdir -p /opt/lwt/parsers
+        sudo cp -r parsers/* /opt/lwt/parsers/
+        sudo chmod +x /opt/lwt/parsers/*.py
+        info "Parser scripts deployed"
+    else
+        warn "parsers/ directory not found - skipping parser scripts"
+    fi
+}
+
 # Generate a random password
 generate_password() {
     if command -v openssl > /dev/null 2>&1; then

@@ -275,6 +322,13 @@ main() {
         enable_php_extensions
     fi

+    echo
+    read -rp "Install Python NLP parsers for Chinese/Japanese support? (Y/n): " install_parsers
+    if [[ ! "$install_parsers" =~ ^[Nn]$ ]]; then
+        install_python_parsers
+        deploy_parser_scripts "."
+    fi
+
     configure_database_credentials
     setup_database
     save_env_file
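
After a manual install, the environment that install_python_parsers() creates can be checked without touching LWT itself. The snippet below is an illustrative sanity check, not part of this commit; it only assumes the /opt/lwt-parsers virtual environment path used by the installer above.

#!/usr/bin/env python3
# Illustrative sanity check (not part of this commit) for the environment that
# install_python_parsers() sets up: verifies the virtual environment exists and
# that jieba and MeCab import inside it.
import os
import subprocess
import sys

VENV_PYTHON = "/opt/lwt-parsers/bin/python3"  # path created by install_python_parsers()

if not os.path.exists(VENV_PYTHON):
    sys.exit(f"{VENV_PYTHON} not found - run INSTALL.sh with parser support enabled")

for module in ("jieba", "MeCab"):
    result = subprocess.run(
        [VENV_PYTHON, "-c", f"import {module}"],
        capture_output=True,
        text=True,
    )
    status = "OK" if result.returncode == 0 else "MISSING"
    print(f"{module}: {status}")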

config/parsers.php

Lines changed: 94 additions & 0 deletions (new file)

<?php

/**
 * External Parser Allowlist Configuration.
 *
 * SECURITY: This file defines which external programs can be executed for text parsing.
 * Only server administrators should modify this file. Never allow user input to determine
 * parser paths or arguments.
 *
 * PHP version 8.1
 *
 * @category Configuration
 * @package  Lwt\Config
 * @author   HugoFara <[email protected]>
 * @license  Unlicense <http://unlicense.org/>
 * @link     https://hugofara.github.io/lwt/docs/php/
 * @since    3.0.0
 */

declare(strict_types=1);

/**
 * External parser configurations.
 *
 * Each parser entry is keyed by a unique type identifier and contains:
 *
 * - 'name' (string, required): Human-readable name displayed in the UI
 * - 'binary' (string, required): Path to executable. Can be:
 *   - Absolute path: '/usr/bin/python3'
 *   - Command name: 'python3' (uses system PATH)
 * - 'args' (array, optional): Command-line arguments passed to the binary
 * - 'input_mode' (string, optional): How text is passed to the parser:
 *   - 'stdin' (default): Text is piped to stdin
 *   - 'file': Text is written to a temp file, path appended as last argument
 * - 'output_format' (string, optional): How parser output is interpreted:
 *   - 'line' (default): One token per line
 *   - 'wakati': Space-separated tokens (like MeCab wakati mode)
 *
 * Built-in parsers (regex, character, mecab) are always available and do not
 * need to be configured here. This file is for adding additional external parsers.
 *
 * Example configurations:
 *
 * return [
 *     'jieba' => [
 *         'name' => 'Jieba (Chinese)',
 *         'binary' => '/usr/bin/python3',
 *         'args' => ['/opt/lwt/parsers/jieba_tokenize.py'],
 *         'input_mode' => 'stdin',
 *         'output_format' => 'line',
 *     ],
 *
 *     'sudachi' => [
 *         'name' => 'Sudachi (Japanese)',
 *         'binary' => 'sudachipy',
 *         'args' => ['-m', 'C', '-a'],
 *         'input_mode' => 'stdin',
 *         'output_format' => 'wakati',
 *     ],
 *
 *     'custom_tokenizer' => [
 *         'name' => 'Custom Tokenizer',
 *         'binary' => '/opt/lwt/bin/tokenize',
 *         'args' => ['--format=simple'],
 *         'input_mode' => 'file',
 *         'output_format' => 'line',
 *     ],
 * ];
 */
return [
    // Jieba - Chinese word segmentation
    // Requires: Python 3, jieba package
    // Docker: Included by default
    // Manual: pip install jieba
    'jieba' => [
        'name' => 'Jieba (Chinese)',
        'binary' => '/opt/lwt-parsers/bin/python3',
        'args' => ['/opt/lwt/parsers/jieba_tokenize.py'],
        'input_mode' => 'stdin',
        'output_format' => 'line',
    ],

    // MeCab Python - Japanese morphological analyzer
    // Requires: Python 3, mecab-python3 package, system MeCab with dictionary
    // Docker: Included by default
    // Manual: apt-get install mecab mecab-ipadic-utf8 && pip install mecab-python3
    'mecab-python' => [
        'name' => 'MeCab Python (Japanese)',
        'binary' => '/opt/lwt-parsers/bin/python3',
        'args' => ['/opt/lwt/parsers/mecab_tokenize.py'],
        'input_mode' => 'stdin',
        'output_format' => 'line',
    ],
];
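
The 'stdin' / 'line' contract documented above can be exercised directly from the command line or a script. The sketch below is illustrative only and not part of this commit; LWT's own PHP code is the real consumer. The binary and args values are copied from the 'jieba' entry above, and the sample text is arbitrary.

#!/usr/bin/env python3
# Illustrative sketch (not part of this commit): exercising the 'stdin' / 'line'
# parser contract from config/parsers.php with the 'jieba' entry above.
import subprocess

binary = "/opt/lwt-parsers/bin/python3"
args = ["/opt/lwt/parsers/jieba_tokenize.py"]

text = "这是一个测试。\n第二段。"

# input_mode 'stdin': the text is piped to the parser's standard input.
result = subprocess.run(
    [binary, *args],
    input=text,
    capture_output=True,
    text=True,
    check=True,
)

# output_format 'line': one token per line; empty lines mark paragraph boundaries.
for line in result.stdout.splitlines():
    print(repr(line))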

parsers/jieba_tokenize.py

Lines changed: 95 additions & 0 deletions (new file)

#!/usr/bin/env python3
"""
Jieba tokenizer bridge for LWT (Learning With Texts).

This script reads Chinese text from stdin and outputs tokens one per line,
compatible with LWT's ExternalParser 'line' output format.

Usage:
    echo "这是一个测试" | python3 jieba_tokenize.py

Output format:
    - One token per line
    - Empty lines indicate sentence/paragraph boundaries
    - All Chinese characters and punctuation are preserved

Dependencies:
    pip install jieba
"""

import sys
import re

try:
    import jieba
except ImportError:
    print("Error: jieba is not installed. Install with: pip install jieba", file=sys.stderr)
    sys.exit(1)


# Chinese sentence-ending punctuation
SENTENCE_ENDINGS = re.compile(r'[。!?…\n]')

# Chinese punctuation that should be treated as non-words
PUNCTUATION = re.compile(r'^[\s\u3000-\u303F\uFF00-\uFFEF\u2000-\u206F]+$')


def is_word(token: str) -> bool:
    """Check if a token is a word (not just punctuation/whitespace)."""
    if not token or not token.strip():
        return False
    # Contains at least one CJK character or other letter.
    # Python's re module has no \p{L}; [^\W\d_] matches any Unicode letter.
    return bool(re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]|[^\W\d_]', token))


def tokenize(text: str) -> None:
    """
    Tokenize Chinese text using jieba and output tokens.

    Args:
        text: Input text to tokenize
    """
    # Normalize whitespace but preserve newlines
    text = re.sub(r'[^\S\n]+', ' ', text)

    # Split into paragraphs first
    paragraphs = text.split('\n')

    for para_idx, paragraph in enumerate(paragraphs):
        paragraph = paragraph.strip()

        if not paragraph:
            # Empty line = paragraph boundary
            print()
            continue

        # Use jieba's precise mode for better accuracy
        tokens = jieba.cut(paragraph, cut_all=False)

        for token in tokens:
            if token and token.strip():
                print(token)

        # Paragraph boundary
        print()


def main():
    """Main entry point."""
    # Disable jieba's verbose output
    jieba.setLogLevel(jieba.logging.WARNING)

    # Read all input from stdin
    try:
        text = sys.stdin.read()
    except KeyboardInterrupt:
        sys.exit(0)

    if not text.strip():
        sys.exit(0)

    tokenize(text)


if __name__ == '__main__':
    main()
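
On the consuming side, the 'line' output with its "empty line = paragraph boundary" convention can be folded back into per-paragraph token lists. A minimal sketch, not part of this commit:

# Illustrative consumer-side sketch (not part of this commit): grouping the
# bridge's one-token-per-line output into per-paragraph token lists, using the
# "empty line = paragraph boundary" convention documented above.
def group_tokens(output: str) -> list[list[str]]:
    paragraphs: list[list[str]] = []
    current: list[str] = []
    for line in output.splitlines():
        if line.strip():
            current.append(line)
        elif current:
            paragraphs.append(current)
            current = []
    if current:
        paragraphs.append(current)
    return paragraphs

# Example: two paragraphs separated by a blank line.
print(group_tokens("这是\n一个\n测试\n\n第二\n段\n"))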

parsers/mecab_tokenize.py

Lines changed: 101 additions & 0 deletions (new file)

#!/usr/bin/env python3
"""
MeCab tokenizer bridge for LWT (Learning With Texts).

This script reads Japanese text from stdin and outputs tokens one per line,
compatible with LWT's ExternalParser 'line' output format.

Usage:
    echo "これはテストです" | python3 mecab_tokenize.py

Output format:
    - One token per line
    - Empty lines indicate sentence/paragraph boundaries
    - All Japanese characters and punctuation are preserved

Dependencies:
    - System: mecab, mecab-ipadic-utf8 (or other dictionary)
    - Python: pip install mecab-python3

Installation on Debian/Ubuntu:
    apt-get install mecab mecab-ipadic-utf8
    pip install mecab-python3
"""

import sys
import re

try:
    import MeCab
except ImportError:
    print("Error: mecab-python3 is not installed.", file=sys.stderr)
    print("Install with: pip install mecab-python3", file=sys.stderr)
    print("Also ensure system MeCab is installed: apt-get install mecab mecab-ipadic-utf8", file=sys.stderr)
    sys.exit(1)


def tokenize(text: str) -> None:
    """
    Tokenize Japanese text using MeCab and output tokens.

    Args:
        text: Input text to tokenize
    """
    try:
        # Create MeCab tagger
        # Empty string uses default dictionary
        tagger = MeCab.Tagger("")
    except RuntimeError as e:
        print(f"Error initializing MeCab: {e}", file=sys.stderr)
        print("Ensure MeCab dictionary is installed: apt-get install mecab-ipadic-utf8", file=sys.stderr)
        sys.exit(1)

    # Normalize whitespace but preserve newlines
    text = re.sub(r'[^\S\n]+', ' ', text)

    # Split into paragraphs
    paragraphs = text.split('\n')

    for para_idx, paragraph in enumerate(paragraphs):
        paragraph = paragraph.strip()

        if not paragraph:
            # Empty line = paragraph boundary
            print()
            continue

        # Parse with MeCab
        # parseToNode returns a linked list of nodes
        node = tagger.parseToNode(paragraph)

        while node:
            # surface is the actual token text
            surface = node.surface

            if surface:
                # Output the token
                print(surface)

            # Move to next node
            node = node.next

        # Paragraph boundary
        print()


def main():
    """Main entry point."""
    # Read all input from stdin
    try:
        text = sys.stdin.read()
    except KeyboardInterrupt:
        sys.exit(0)

    if not text.strip():
        sys.exit(0)

    tokenize(text)


if __name__ == '__main__':
    main()
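
The bridge above emits the 'line' format via parseToNode. For the 'wakati' output_format that config/parsers.php also supports, MeCab can emit space-separated tokens directly. A minimal sketch, not part of this commit, assuming the same mecab-python3 and dictionary setup:

#!/usr/bin/env python3
# Minimal sketch (not part of this commit): producing 'wakati' output
# (space-separated tokens) instead of one token per line, using MeCab's
# built-in wakati output mode.
import sys

import MeCab

tagger = MeCab.Tagger("-Owakati")  # wakati mode: tokens joined by spaces

for line in sys.stdin.read().split("\n"):
    line = line.strip()
    if line:
        # parse() returns the tokens of the line separated by spaces, newline-terminated
        sys.stdout.write(tagger.parse(line))
    else:
        # Preserve blank lines as paragraph boundaries
        print()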
