Skip to content

Commit 20329c9

Browse files
committed
add novita and tests for spaCy
1 parent fa6fbe1 commit 20329c9

File tree

6 files changed

+140
-90
lines changed

6 files changed

+140
-90
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,17 @@
22

33
All notable changes to this project will be documented in this file.
44

5+
## [2.1.2] Adding Novita!
6+
7+
## Added
8+
9+
- Added Novita Generator (https://www.novita.ai/)
10+
- Added basic tests for Document class
11+
12+
## Fixed
13+
14+
- spaCy Language Issues (https://github.com/weaviate/Verba/issues/359#issuecomment-2612233766) (https://github.com/weaviate/Verba/issues/352)
15+
516
## [2.1.1] More Bugs!
617

718
## Added

CONTRIBUTING.md

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ Open source is at the heart of Verba. We appreciate feedback, ideas, and enhance
88

99
## 📚 Before You Begin
1010

11-
Before contributing, please take a moment to read through the [README](https://github.com/weaviate/Verba/README.md) and the [Technical Documentation](https://github.com/weaviate/Verba/TECHNICAL.md). These documents provide a comprehensive understanding of the project and are essential reading to ensure that we're all on the same page.
11+
Before contributing, please take a moment to read through the [README](https://github.com/weaviate/Verba/blob/main/README.md) and the [Technical Documentation](https://github.com/weaviate/Verba/blob/main/TECHNICAL.md). These documents provide a comprehensive understanding of the project and are essential reading to ensure that we're all on the same page. Please note that the technical documentation is a work in progress and will be updated as the project evolves.
1212

1313
## 🐛 Reporting Issues
1414

@@ -22,6 +22,16 @@ If you've identified a bug or have an idea for an enhancement, please begin by c
2222

2323
We welcome all ideas and feedback. If you're not ready to open an Issue or if you're just looking for a place to discuss ideas, head over to our [GitHub Discussions](https://github.com/weaviate/Verba/discussions) or the [Weaviate Support Page](https://forum.weaviate.io/).
2424

25+
## 🧪 Testing
26+
27+
We use [pytest](https://docs.pytest.org) for testing. Please note that the test suite is a work in progress and coverage is incomplete. We still encourage you to run the tests and to add more tests as you see fit.
28+
29+
To run the tests, use the following command:
30+
31+
```bash
32+
pytest goldenverba/tests
33+
```
34+
2535
## 📝 Pull Requests
2636

2737
If you're ready to contribute code or documentation, please submit a Pull Request (PR) to the dev branch. Here's the process:
@@ -34,13 +44,6 @@ If you're ready to contribute code or documentation, please submit a Pull Reques
3444
- Include a clear description of your changes in the PR.
3545
- Link to the Issue in your PR description.
3646

37-
### 🧪 Tests and Formatting
38-
39-
To maintain the quality of the codebase, we ask that all contributors:
40-
41-
- Run unit tests to ensure that nothing is broken.
42-
- Use [Black](https://github.com/psf/black) to format your code before submitting.
43-
4447
### 🔄 Pull Request Process
4548

4649
- PRs are reviewed on a regular basis.

goldenverba/components/document.py

Lines changed: 6 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,6 @@
77

88
from langdetect import detect
99

10-
SUPPORTED_LANGUAGES = {
11-
"en": "English",
12-
"zh": "Simplified Chinese",
13-
"zh-hant": "Traditional Chinese",
14-
"fr": "French",
15-
"de": "German",
16-
"nl": "Dutch",
17-
}
18-
1910

2011
def load_nlp_for_language(language: str):
2112
"""Load SpaCy models based on language"""
@@ -32,13 +23,10 @@ def load_nlp_for_language(language: str):
3223
elif language == "nl":
3324
nlp = spacy.blank("nl")
3425
else:
35-
raise ValueError(f"Unsupported language: {language}")
26+
nlp = spacy.blank("en")
27+
28+
nlp.add_pipe("sentencizer")
3629

37-
# Add sentence segmentation to languages
38-
if language == "en":
39-
nlp.add_pipe("sentencizer", config={"punct_chars": None})
40-
else:
41-
nlp.add_pipe("sentencizer") #
4230
return nlp
4331

4432

@@ -55,57 +43,6 @@ def detect_language(text: str) -> str:
5543
return "unknown"
5644

5745

58-
def split_text_by_language(text: str):
59-
"""Separate text into language parts based on character ranges"""
60-
chinese_simplified = "".join(
61-
[char for char in text if "\u4e00" <= char <= "\u9fff"]
62-
)
63-
chinese_traditional = "".join(
64-
[
65-
char
66-
for char in text
67-
if "\u3400" <= char <= "\u4dbf" or "\u4e00" <= char <= "\u9fff"
68-
]
69-
)
70-
english_part = "".join([char for char in text if char.isascii()])
71-
other_text = "".join(
72-
[char for char in text if not (char.isascii() or "\u4e00" <= char <= "\u9fff")]
73-
)
74-
75-
return chinese_simplified, chinese_traditional, english_part, other_text
76-
77-
78-
def process_mixed_language(content: str):
79-
"""Process mixed language text"""
80-
chinese_simplified, chinese_traditional, english_text, other_text = (
81-
split_text_by_language(content)
82-
)
83-
84-
docs = []
85-
86-
if chinese_simplified:
87-
nlp_zh = load_nlp_for_language("zh")
88-
docs.append(nlp_zh(chinese_simplified))
89-
90-
if chinese_traditional:
91-
nlp_zh_hant = load_nlp_for_language("zh-hant")
92-
docs.append(nlp_zh_hant(chinese_traditional))
93-
94-
if english_text:
95-
nlp_en = load_nlp_for_language("en")
96-
docs.append(nlp_en(english_text))
97-
98-
if other_text:
99-
detected_lang = detect_language(other_text)
100-
if detected_lang in SUPPORTED_LANGUAGES:
101-
nlp_other = load_nlp_for_language(detected_lang)
102-
docs.append(nlp_other(other_text))
103-
104-
# Merge all processed documents
105-
doc = Doc.from_docs(docs)
106-
return doc
107-
108-
10946
class Document:
11047
def __init__(
11148
self,
@@ -132,13 +69,9 @@ def __init__(
13269

13370
if len(content) > MAX_BATCH_SIZE:
13471
# Process content in batches
135-
print("TOOO BIG!")
13672
docs = []
13773
detected_language = detect_language(content[0:MAX_BATCH_SIZE])
138-
if detected_language in SUPPORTED_LANGUAGES:
139-
nlp = load_nlp_for_language(detected_language)
140-
else:
141-
nlp = process_mixed_language
74+
nlp = load_nlp_for_language(detected_language)
14275

14376
for i in range(0, len(content), MAX_BATCH_SIZE):
14477
docs.append(nlp(content[i : i + MAX_BATCH_SIZE]))
@@ -148,12 +81,8 @@ def __init__(
14881
else:
14982
# Process smaller content, directly based on language
15083
detected_language = detect_language(content)
151-
if detected_language in SUPPORTED_LANGUAGES:
152-
nlp = load_nlp_for_language(detected_language)
153-
doc = nlp(content)
154-
else:
155-
# Process mixed language content
156-
doc = process_mixed_language(content)
84+
nlp = load_nlp_for_language(detected_language)
85+
doc = nlp(content)
15786

15887
self.spacy_doc = doc
15988

goldenverba/components/generation/NovitaGenerator.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
base_url = "https://api.novita.ai/v3/openai"
1414

15+
1516
class NovitaGenerator(Generator):
1617
"""
1718
Novita Generator.
@@ -85,14 +86,21 @@ async def generate_stream(
8586
json_line = json.loads(line)
8687
choice = json_line.get("choices")[0]
8788
yield {
88-
"message": choice.get("delta", {}).get("content", ""),
89+
"message": choice.get("delta", {}).get(
90+
"content", ""
91+
),
8992
"finish_reason": (
90-
"stop" if choice.get("finish_reason", "") == "stop" else ""
93+
"stop"
94+
if choice.get("finish_reason", "") == "stop"
95+
else ""
9196
),
9297
}
9398
else:
9499
error_message = await response.text()
95-
yield {"message": f"HTTP Error {response.status}: {error_message}", "finish_reason": "stop"}
100+
yield {
101+
"message": f"HTTP Error {response.status}: {error_message}",
102+
"finish_reason": "stop",
103+
}
96104

97105
def prepare_messages(
98106
self, query: str, context: str, conversation: list[dict], system_message: str
@@ -128,4 +136,4 @@ def get_models():
128136
return ["No Novita AI Model detected"]
129137
except Exception as e:
130138
# msg.fail(f"Couldn't connect to Novita AI: {e}")
131-
return [f"Couldn't connect to Novita AI"]
139+
return [f"Couldn't connect to Novita AI"]
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import pytest
2+
from goldenverba.components.document import Document, create_document
3+
from goldenverba.server.types import FileConfig
4+
5+
6+
def test_document_initialization():
7+
"""Test basic document initialization"""
8+
doc = Document(
9+
title="Test Doc",
10+
content="This is a test document.",
11+
extension=".txt",
12+
fileSize=23,
13+
labels=["test"],
14+
source="local",
15+
meta={"key": "value"},
16+
metadata="test metadata",
17+
)
18+
19+
assert doc.title == "Test Doc"
20+
assert doc.content == "This is a test document."
21+
assert doc.extension == ".txt"
22+
assert doc.fileSize == 23
23+
assert doc.labels == ["test"]
24+
assert doc.source == "local"
25+
assert doc.meta == {"key": "value"}
26+
assert doc.metadata == "test metadata"
27+
assert hasattr(doc, "spacy_doc")
28+
29+
30+
def test_document_json_serialization():
31+
"""Test document to/from JSON conversion"""
32+
original_doc = Document(
33+
title="Test Doc",
34+
content="Test content",
35+
extension=".txt",
36+
fileSize=12,
37+
labels=["test"],
38+
source="local",
39+
meta={"key": "value"},
40+
metadata="test metadata",
41+
)
42+
43+
# Convert to JSON
44+
json_dict = Document.to_json(original_doc)
45+
46+
# Convert back from JSON
47+
restored_doc = Document.from_json(json_dict, None)
48+
49+
assert restored_doc.title == original_doc.title
50+
assert restored_doc.content == original_doc.content
51+
assert restored_doc.extension == original_doc.extension
52+
assert restored_doc.fileSize == original_doc.fileSize
53+
assert restored_doc.labels == original_doc.labels
54+
assert restored_doc.source == original_doc.source
55+
assert restored_doc.metadata == original_doc.metadata
56+
57+
58+
def test_create_document_from_file_config():
59+
"""Test document creation from FileConfig"""
60+
# TODO: Add test
61+
assert True
62+
63+
64+
def test_document_with_large_content():
65+
"""Test document initialization with content larger than batch size"""
66+
large_content = "Test sentence. " * 50000 # Creates a large string
67+
doc = Document(content=large_content)
68+
69+
assert len(doc.content) > 500000 # Verify content is larger than MAX_BATCH_SIZE
70+
assert hasattr(doc, "spacy_doc")
71+
72+
73+
def test_invalid_json_document():
74+
"""Test document creation from invalid JSON"""
75+
invalid_dict = {"title": "Test"} # Missing required fields
76+
77+
doc = Document.from_json(invalid_dict, None)
78+
assert doc is None
79+
80+
81+
def test_special_characters_in_content():
82+
"""Test document initialization with special characters in content"""
83+
content = (
84+
"This is a test document with special characters: !@#$%^&*()_+-=[]{}|;:,.<>?~ "
85+
)
86+
content += "Hej detta är ett test, jag bor på en ö"
87+
doc = Document(content=content)
88+
assert doc.content == content
89+
assert doc.spacy_doc.text == content
90+
assert doc.spacy_doc.sents is not None
91+
92+
93+
def test_arabic_in_content():
94+
"""Test document initialization with Arabic in content"""
95+
content = "نص اختبار باللغة العربية"
96+
doc = Document(content=content)
97+
assert doc.content == content
98+
assert doc.spacy_doc.text == content
99+
assert doc.spacy_doc.sents is not None

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setup(
44
name="goldenverba",
5-
version="2.1.1",
5+
version="2.1.2",
66
packages=find_packages(),
77
python_requires=">=3.10.0,<3.13.0",
88
entry_points={

0 commit comments

Comments
 (0)