Skip to content

Commit 9c860df

Browse files
committed
fix: remove pkuseg for install issue
1 parent ea262e6 commit 9c860df

File tree

6 files changed

+44
-76
lines changed

6 files changed

+44
-76
lines changed

.github/workflows/docs.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,6 @@ jobs:
3030
- name: Install dependencies
3131
run: |
3232
python -m pip install --upgrade pip setuptools wheel
33-
pip install numpy
34-
pip install pkuseg
3533
pip install -e ".[docs]"
3634
3735
- name: Build documentation

openchatbi/catalog/retrival_helper.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
11
"""Helper functions for building column retrieval systems."""
22

3-
import pkuseg
43
from rank_bm25 import BM25Okapi
54

65
from openchatbi.llm.llm import get_embedding_model
7-
8-
# Initialize pkuseg segmenter
9-
_segmenter = pkuseg.pkuseg()
106
from openchatbi.utils import log, create_vector_db
7+
from openchatbi.text_segmenter import _segmenter
118

129

1310
def get_columns_metadata(catalog):

openchatbi/catalog/schema_retrival.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,9 @@
44
import re
55

66
import Levenshtein
7-
import pkuseg
87

98
from openchatbi import config
10-
11-
# Initialize pkuseg segmenter
12-
_segmenter = pkuseg.pkuseg()
9+
from openchatbi.text_segmenter import _segmenter
1310
from openchatbi.catalog.retrival_helper import build_column_tables_mapping, build_columns_retriever
1411
from openchatbi.utils import log
1512

openchatbi/text_segmenter.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""Simple text segmentation utility."""
2+
3+
import re
4+
import string
5+
from typing import List
6+
7+
8+
class SimpleSegmenter:
    """Dependency-free text segmenter that splits on punctuation and whitespace.

    Tokens are the maximal runs of characters that contain no separator.
    Separators are ASCII punctuation (``string.punctuation``, which includes
    the underscore), ASCII whitespace, and the common CJK / fullwidth
    punctuation marks listed in ``CJK_SEPARATORS``.

    No dictionary-based word segmentation is performed: a run of Chinese
    characters with no separator inside it is returned as a single token.
    """

    # CJK and fullwidth separators, written as escapes so the exact code points
    # survive editors and tools that normalise fullwidth forms to ASCII.
    CJK_SEPARATORS = (
        "\u3000"  # ideographic space
        "\u3001\u3002"  # ideographic comma, ideographic full stop
        "\uff0c\uff01\uff1f\uff1b\uff1a"  # fullwidth , ! ? ; :
        "\uff08\uff09"  # fullwidth parentheses
        "\u201c\u201d\u2018\u2019"  # curly double and single quotes
        "\u3010\u3011\u300a\u300b\u3008\u3009"  # lenticular and angle brackets
        "\u300c\u300d\u300e\u300f\u3014\u3015"  # corner and tortoise-shell brackets
        "\u2026\u2014"  # horizontal ellipsis, em dash
    )

    def __init__(self):
        # One character class holding every separator; the trailing "+" makes a
        # run of adjacent separators count as a single split point.
        all_separators = string.punctuation + self.CJK_SEPARATORS + " \t\n\r"
        # re.escape neutralises the characters that are special inside a
        # character class ("]", "\\", "^", "-").
        self.split_pattern = "[" + re.escape(all_separators) + "]+"

    def cut(self, text: str) -> List[str]:
        """Segment text into tokens by splitting on punctuation and whitespace.

        Args:
            text: Input text to be segmented. ``None`` or an empty string
                yields an empty list.

        Returns:
            List of non-empty tokens in their original order.
        """
        if not text:
            return []

        # Leading/trailing separators produce empty strings from re.split, and
        # whitespace outside the separator set can leave blank tokens; drop both.
        tokens = re.split(self.split_pattern, text)
        return [token for token in tokens if token.strip()]
37+
38+
39+
# Shared module-level segmenter instance; the catalog retrieval modules import this name instead of constructing their own.
40+
_segmenter = SimpleSegmenter()

pyproject.toml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "openchatbi"
3-
version = "0.1.0"
3+
version = "0.1.1"
44
description = "OpenChatBI - Natural language business intelligence powered by LLMs for intuitive data analysis and SQL generation"
55
authors = [
66
{ name = "Yu Zhong", email = "[email protected]" },
@@ -72,7 +72,6 @@ dependencies = [
7272
"seaborn>=0.13.0,<1.0.0",
7373
"plotly>=5.17.0,<6.0.0",
7474
"json5>=0.10.0,<1.0.0",
75-
"pkuseg>=0.0.25",
7675
]
7776

7877
[project.urls]
@@ -117,8 +116,6 @@ dev-dependencies = [
117116
"openchatbi[dev]",
118117
]
119118

120-
[tool.uv.extra-build-dependencies]
121-
pkuseg = ["numpy", "cython"]
122119

123120
[build-system]
124121
requires = ["hatchling>=1.26.0"]

uv.lock

Lines changed: 1 addition & 62 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)