
Commit 91477ae (merge commit, 2 parents: 86b91be + 8c328c2)
Author: HugoFara

feat(parsers): simple rework of the parsers system to use Python. It includes jieba for Chinese as well.

14 files changed: +2227 additions, -4 deletions (5 of the changed files are shown below)

Dockerfile

Lines changed: 23 additions & 1 deletion

@@ -9,11 +9,33 @@ LABEL org.opencontainers.image.license=Unlicense
 LABEL org.opencontainers.image.source="https://github.com/HugoFara/lwt"


-# creating config file php.ini
+# Creating config file php.ini
 RUN mv "$PHP_INI_DIR/php.ini-production" "$PHP_INI_DIR/php.ini" && \
     echo 'mysqli.allow_local_infile = On' >> "$PHP_INI_DIR/php.ini"; \
     docker-php-ext-install pdo pdo_mysql mysqli

+# Install Python and MeCab for NLP parsing
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    python3-venv \
+    mecab \
+    mecab-ipadic-utf8 \
+    libmecab-dev \
+    && rm -rf /var/lib/apt/lists/* \
+    && mkdir -p /usr/local/etc \
+    && ln -s /etc/mecabrc /usr/local/etc/mecabrc
+
+# Create Python virtual environment and install NLP packages
+RUN python3 -m venv /opt/lwt-parsers && \
+    /opt/lwt-parsers/bin/pip install --no-cache-dir \
+    "jieba>=0.42.1" \
+    "mecab-python3>=1.0.6"
+
+# Copy parser scripts first (for better caching)
+COPY parsers/ /opt/lwt/parsers/
+
+# Copy application files
 COPY . /var/www/html/lwt

 # creating .env configuration file

INSTALL.sh

Lines changed: 54 additions & 0 deletions

@@ -68,6 +68,53 @@ enable_php_extensions() {
     fi
 }

+# Install Python NLP parsers (optional)
+install_python_parsers() {
+    info "Installing Python NLP parsers for CJK language support..."
+
+    local python_packages="python3 python3-pip python3-venv"
+    local mecab_packages=""
+
+    # Detect MeCab packages based on package manager
+    case "$PKG_MANAGER" in
+        apt-get)
+            mecab_packages="mecab mecab-ipadic-utf8"
+            ;;
+        dnf|yum)
+            mecab_packages="mecab mecab-ipadic"
+            ;;
+        pacman)
+            mecab_packages="mecab mecab-ipadic"
+            ;;
+    esac
+
+    info "Installing Python and MeCab system packages..."
+    $PKG_INSTALL $python_packages $mecab_packages
+
+    info "Creating Python virtual environment..."
+    sudo python3 -m venv /opt/lwt-parsers
+
+    info "Installing Python NLP packages (jieba, mecab-python3)..."
+    sudo /opt/lwt-parsers/bin/pip install --no-cache-dir jieba mecab-python3
+
+    info "Python NLP parsers installed successfully"
+}
+
+# Copy parser scripts to installation location
+deploy_parser_scripts() {
+    local dest="$1"
+
+    if [ -d "parsers" ]; then
+        info "Copying parser scripts to /opt/lwt/parsers/..."
+        sudo mkdir -p /opt/lwt/parsers
+        sudo cp -r parsers/* /opt/lwt/parsers/
+        sudo chmod +x /opt/lwt/parsers/*.py
+        info "Parser scripts deployed"
+    else
+        warn "parsers/ directory not found - skipping parser scripts"
+    fi
+}
+
 # Generate a random password
 generate_password() {
     if command -v openssl > /dev/null 2>&1; then

@@ -275,6 +322,13 @@ main() {
         enable_php_extensions
     fi

+    echo
+    read -rp "Install Python NLP parsers for Chinese/Japanese support? (Y/n): " install_parsers
+    if [[ ! "$install_parsers" =~ ^[Nn]$ ]]; then
+        install_python_parsers
+        deploy_parser_scripts "."
+    fi
+
     configure_database_credentials
     setup_database
     save_env_file
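
After a manual install, the environment that install_python_parsers() creates can be checked without touching LWT itself. The snippet below is an illustrative sanity check, not part of this commit; it only assumes the /opt/lwt-parsers virtual environment path used by the installer above.

#!/usr/bin/env python3
# Illustrative sanity check (not part of this commit) for the environment that
# install_python_parsers() sets up: verifies the virtual environment exists and
# that jieba and MeCab import inside it.
import os
import subprocess
import sys

VENV_PYTHON = "/opt/lwt-parsers/bin/python3"  # path created by install_python_parsers()

if not os.path.exists(VENV_PYTHON):
    sys.exit(f"{VENV_PYTHON} not found - run INSTALL.sh with parser support enabled")

for module in ("jieba", "MeCab"):
    result = subprocess.run(
        [VENV_PYTHON, "-c", f"import {module}"],
        capture_output=True,
        text=True,
    )
    status = "OK" if result.returncode == 0 else "MISSING"
    print(f"{module}: {status}")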

config/parsers.php

Lines changed: 94 additions & 0 deletions (new file)

<?php

/**
 * External Parser Allowlist Configuration.
 *
 * SECURITY: This file defines which external programs can be executed for text parsing.
 * Only server administrators should modify this file. Never allow user input to determine
 * parser paths or arguments.
 *
 * PHP version 8.1
 *
 * @category Configuration
 * @package  Lwt\Config
 * @author   HugoFara <[email protected]>
 * @license  Unlicense <http://unlicense.org/>
 * @link     https://hugofara.github.io/lwt/docs/php/
 * @since    3.0.0
 */

declare(strict_types=1);

/**
 * External parser configurations.
 *
 * Each parser entry is keyed by a unique type identifier and contains:
 *
 * - 'name' (string, required): Human-readable name displayed in the UI
 * - 'binary' (string, required): Path to executable. Can be:
 *   - Absolute path: '/usr/bin/python3'
 *   - Command name: 'python3' (uses system PATH)
 * - 'args' (array, optional): Command-line arguments passed to the binary
 * - 'input_mode' (string, optional): How text is passed to the parser:
 *   - 'stdin' (default): Text is piped to stdin
 *   - 'file': Text is written to a temp file, path appended as last argument
 * - 'output_format' (string, optional): How parser output is interpreted:
 *   - 'line' (default): One token per line
 *   - 'wakati': Space-separated tokens (like MeCab wakati mode)
 *
 * Built-in parsers (regex, character, mecab) are always available and do not
 * need to be configured here. This file is for adding additional external parsers.
 *
 * Example configurations:
 *
 * return [
 *     'jieba' => [
 *         'name' => 'Jieba (Chinese)',
 *         'binary' => '/usr/bin/python3',
 *         'args' => ['/opt/lwt/parsers/jieba_tokenize.py'],
 *         'input_mode' => 'stdin',
 *         'output_format' => 'line',
 *     ],
 *
 *     'sudachi' => [
 *         'name' => 'Sudachi (Japanese)',
 *         'binary' => 'sudachipy',
 *         'args' => ['-m', 'C', '-a'],
 *         'input_mode' => 'stdin',
 *         'output_format' => 'wakati',
 *     ],
 *
 *     'custom_tokenizer' => [
 *         'name' => 'Custom Tokenizer',
 *         'binary' => '/opt/lwt/bin/tokenize',
 *         'args' => ['--format=simple'],
 *         'input_mode' => 'file',
 *         'output_format' => 'line',
 *     ],
 * ];
 */
return [
    // Jieba - Chinese word segmentation
    // Requires: Python 3, jieba package
    // Docker: Included by default
    // Manual: pip install jieba
    'jieba' => [
        'name' => 'Jieba (Chinese)',
        'binary' => '/opt/lwt-parsers/bin/python3',
        'args' => ['/opt/lwt/parsers/jieba_tokenize.py'],
        'input_mode' => 'stdin',
        'output_format' => 'line',
    ],

    // MeCab Python - Japanese morphological analyzer
    // Requires: Python 3, mecab-python3 package, system MeCab with dictionary
    // Docker: Included by default
    // Manual: apt-get install mecab mecab-ipadic-utf8 && pip install mecab-python3
    'mecab-python' => [
        'name' => 'MeCab Python (Japanese)',
        'binary' => '/opt/lwt-parsers/bin/python3',
        'args' => ['/opt/lwt/parsers/mecab_tokenize.py'],
        'input_mode' => 'stdin',
        'output_format' => 'line',
    ],
];
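
The 'stdin' / 'line' contract documented above can be exercised directly from the command line or a script. The sketch below is illustrative only and not part of this commit; LWT's own PHP code is the real consumer. The binary and args values are copied from the 'jieba' entry above, and the sample text is arbitrary.

#!/usr/bin/env python3
# Illustrative sketch (not part of this commit): exercising the 'stdin' / 'line'
# parser contract from config/parsers.php with the 'jieba' entry above.
import subprocess

binary = "/opt/lwt-parsers/bin/python3"
args = ["/opt/lwt/parsers/jieba_tokenize.py"]

text = "这是一个测试。\n第二段。"

# input_mode 'stdin': the text is piped to the parser's standard input.
result = subprocess.run(
    [binary, *args],
    input=text,
    capture_output=True,
    text=True,
    check=True,
)

# output_format 'line': one token per line; empty lines mark paragraph boundaries.
for line in result.stdout.splitlines():
    print(repr(line))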

parsers/jieba_tokenize.py

Lines changed: 95 additions & 0 deletions (new file)

#!/usr/bin/env python3
"""
Jieba tokenizer bridge for LWT (Learning With Texts).

This script reads Chinese text from stdin and outputs tokens one per line,
compatible with LWT's ExternalParser 'line' output format.

Usage:
    echo "这是一个测试" | python3 jieba_tokenize.py

Output format:
    - One token per line
    - Empty lines indicate sentence/paragraph boundaries
    - All Chinese characters and punctuation are preserved

Dependencies:
    pip install jieba
"""

import sys
import re

try:
    import jieba
except ImportError:
    print("Error: jieba is not installed. Install with: pip install jieba", file=sys.stderr)
    sys.exit(1)


# Chinese sentence-ending punctuation
SENTENCE_ENDINGS = re.compile(r'[。!?…\n]')

# Chinese punctuation that should be treated as non-words
PUNCTUATION = re.compile(r'^[\s\u3000-\u303F\uFF00-\uFFEF\u2000-\u206F]+$')


def is_word(token: str) -> bool:
    """Check if a token is a word (not just punctuation/whitespace)."""
    if not token or not token.strip():
        return False
    # Contains at least one CJK character or other letter.
    # Python's re module has no \p{L}; [^\W\d_] matches any Unicode letter.
    return bool(re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]|[^\W\d_]', token))


def tokenize(text: str) -> None:
    """
    Tokenize Chinese text using jieba and output tokens.

    Args:
        text: Input text to tokenize
    """
    # Normalize whitespace but preserve newlines
    text = re.sub(r'[^\S\n]+', ' ', text)

    # Split into paragraphs first
    paragraphs = text.split('\n')

    for para_idx, paragraph in enumerate(paragraphs):
        paragraph = paragraph.strip()

        if not paragraph:
            # Empty line = paragraph boundary
            print()
            continue

        # Use jieba's precise mode for better accuracy
        tokens = jieba.cut(paragraph, cut_all=False)

        for token in tokens:
            if token and token.strip():
                print(token)

        # Paragraph boundary
        print()


def main():
    """Main entry point."""
    # Disable jieba's verbose output
    jieba.setLogLevel(jieba.logging.WARNING)

    # Read all input from stdin
    try:
        text = sys.stdin.read()
    except KeyboardInterrupt:
        sys.exit(0)

    if not text.strip():
        sys.exit(0)

    tokenize(text)


if __name__ == '__main__':
    main()
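
On the consuming side, the 'line' output with its "empty line = paragraph boundary" convention can be folded back into per-paragraph token lists. A minimal sketch, not part of this commit:

# Illustrative consumer-side sketch (not part of this commit): grouping the
# bridge's one-token-per-line output into per-paragraph token lists, using the
# "empty line = paragraph boundary" convention documented above.
def group_tokens(output: str) -> list[list[str]]:
    paragraphs: list[list[str]] = []
    current: list[str] = []
    for line in output.splitlines():
        if line.strip():
            current.append(line)
        elif current:
            paragraphs.append(current)
            current = []
    if current:
        paragraphs.append(current)
    return paragraphs

# Example: two paragraphs separated by a blank line.
print(group_tokens("这是\n一个\n测试\n\n第二\n段\n"))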

parsers/mecab_tokenize.py

Lines changed: 101 additions & 0 deletions (new file)

#!/usr/bin/env python3
"""
MeCab tokenizer bridge for LWT (Learning With Texts).

This script reads Japanese text from stdin and outputs tokens one per line,
compatible with LWT's ExternalParser 'line' output format.

Usage:
    echo "これはテストです" | python3 mecab_tokenize.py

Output format:
    - One token per line
    - Empty lines indicate sentence/paragraph boundaries
    - All Japanese characters and punctuation are preserved

Dependencies:
    - System: mecab, mecab-ipadic-utf8 (or other dictionary)
    - Python: pip install mecab-python3

Installation on Debian/Ubuntu:
    apt-get install mecab mecab-ipadic-utf8
    pip install mecab-python3
"""

import sys
import re

try:
    import MeCab
except ImportError:
    print("Error: mecab-python3 is not installed.", file=sys.stderr)
    print("Install with: pip install mecab-python3", file=sys.stderr)
    print("Also ensure system MeCab is installed: apt-get install mecab mecab-ipadic-utf8", file=sys.stderr)
    sys.exit(1)


def tokenize(text: str) -> None:
    """
    Tokenize Japanese text using MeCab and output tokens.

    Args:
        text: Input text to tokenize
    """
    try:
        # Create MeCab tagger
        # Empty string uses default dictionary
        tagger = MeCab.Tagger("")
    except RuntimeError as e:
        print(f"Error initializing MeCab: {e}", file=sys.stderr)
        print("Ensure MeCab dictionary is installed: apt-get install mecab-ipadic-utf8", file=sys.stderr)
        sys.exit(1)

    # Normalize whitespace but preserve newlines
    text = re.sub(r'[^\S\n]+', ' ', text)

    # Split into paragraphs
    paragraphs = text.split('\n')

    for para_idx, paragraph in enumerate(paragraphs):
        paragraph = paragraph.strip()

        if not paragraph:
            # Empty line = paragraph boundary
            print()
            continue

        # Parse with MeCab
        # parseToNode returns a linked list of nodes
        node = tagger.parseToNode(paragraph)

        while node:
            # surface is the actual token text
            surface = node.surface

            if surface:
                # Output the token
                print(surface)

            # Move to next node
            node = node.next

        # Paragraph boundary
        print()


def main():
    """Main entry point."""
    # Read all input from stdin
    try:
        text = sys.stdin.read()
    except KeyboardInterrupt:
        sys.exit(0)

    if not text.strip():
        sys.exit(0)

    tokenize(text)


if __name__ == '__main__':
    main()
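
The bridge above emits the 'line' format via parseToNode. For the 'wakati' output_format that config/parsers.php also supports, MeCab can emit space-separated tokens directly. A minimal sketch, not part of this commit, assuming the same mecab-python3 and dictionary setup:

#!/usr/bin/env python3
# Minimal sketch (not part of this commit): producing 'wakati' output
# (space-separated tokens) instead of one token per line, using MeCab's
# built-in wakati output mode.
import sys

import MeCab

tagger = MeCab.Tagger("-Owakati")  # wakati mode: tokens joined by spaces

for line in sys.stdin.read().split("\n"):
    line = line.strip()
    if line:
        # parse() returns the tokens of the line separated by spaces, newline-terminated
        sys.stdout.write(tagger.parse(line))
    else:
        # Preserve blank lines as paragraph boundaries
        print()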
