Skip to content

Commit 20329c9

Browse files
committed
add novita and tests for spaCy
1 parent fa6fbe1 commit 20329c9

File tree

6 files changed

+140
-90
lines changed

6 files changed

+140
-90
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,17 @@
22

33
All notable changes to this project will be documented in this file.
44

5+
## [2.1.2] Adding Novita!
6+
7+
## Added
8+
9+
- Added Novita Generator (https://www.novita.ai/)
10+
- Added basic tests for Document class
11+
12+
## Fixed
13+
14+
- spaCy Language Issues (https://github.com/weaviate/Verba/issues/359#issuecomment-2612233766) (https://github.com/weaviate/Verba/issues/352)
15+
516
## [2.1.1] More Bugs!
617

718
## Added

CONTRIBUTING.md

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ Open source is at the heart of Verba. We appreciate feedback, ideas, and enhance
88

99
## 📚 Before You Begin
1010

11-
Before contributing, please take a moment to read through the [README](https://github.com/weaviate/Verba/README.md) and the [Technical Documentation](https://github.com/weaviate/Verba/TECHNICAL.md). These documents provide a comprehensive understanding of the project and are essential reading to ensure that we're all on the same page.
11+
Before contributing, please take a moment to read through the [README](https://github.com/weaviate/Verba/blob/main/README.md) and the [Technical Documentation](https://github.com/weaviate/Verba/blob/main/TECHNICAL.md). These documents provide a comprehensive understanding of the project and are essential reading to ensure that we're all on the same page. Please note that the technical documentation is a work in progress and will be updated as the project evolves.
1212

1313
## 🐛 Reporting Issues
1414

@@ -22,6 +22,16 @@ If you've identified a bug or have an idea for an enhancement, please begin by c
2222

2323
We welcome all ideas and feedback. If you're not ready to open an Issue or if you're just looking for a place to discuss ideas, head over to our [GitHub Discussions](https://github.com/weaviate/Verba/discussions) or the [Weaviate Support Page](https://forum.weaviate.io/).
2424

25+
## 🧪 Testing
26+
27+
We use [pytest](https://docs.pytest.org) for testing. Please note that the test suite is a work in progress and coverage is incomplete. We still encourage you to run the tests and to add more tests as you see fit.
28+
29+
To run the tests, use the following command:
30+
31+
```bash
32+
pytest goldenverba/tests
33+
```
34+
2535
## 📝 Pull Requests
2636

2737
If you're ready to contribute code or documentation, please submit a Pull Request (PR) to the dev branch. Here's the process:
@@ -34,13 +44,6 @@ If you're ready to contribute code or documentation, please submit a Pull Reques
3444
- Include a clear description of your changes in the PR.
3545
- Link to the Issue in your PR description.
3646

37-
### 🧪 Tests and Formatting
38-
39-
To maintain the quality of the codebase, we ask that all contributors:
40-
41-
- Run unit tests to ensure that nothing is broken.
42-
- Use [Black](https://github.com/psf/black) to format your code before submitting.
43-
4447
### 🔄 Pull Request Process
4548

4649
- PRs are reviewed on a regular basis.

goldenverba/components/document.py

Lines changed: 6 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,6 @@
77

88
from langdetect import detect
99

10-
SUPPORTED_LANGUAGES = {
11-
"en": "English",
12-
"zh": "Simplified Chinese",
13-
"zh-hant": "Traditional Chinese",
14-
"fr": "French",
15-
"de": "German",
16-
"nl": "Dutch",
17-
}
18-
1910

2011
def load_nlp_for_language(language: str):
2112
"""Load SpaCy models based on language"""
@@ -32,13 +23,10 @@ def load_nlp_for_language(language: str):
3223
elif language == "nl":
3324
nlp = spacy.blank("nl")
3425
else:
35-
raise ValueError(f"Unsupported language: {language}")
26+
nlp = spacy.blank("en")
27+
28+
nlp.add_pipe("sentencizer")
3629

37-
# Add sentence segmentation to languages
38-
if language == "en":
39-
nlp.add_pipe("sentencizer", config={"punct_chars": None})
40-
else:
41-
nlp.add_pipe("sentencizer") #
4230
return nlp
4331

4432

@@ -55,57 +43,6 @@ def detect_language(text: str) -> str:
5543
return "unknown"
5644

5745

58-
def split_text_by_language(text: str):
59-
"""Separate text into language parts based on character ranges"""
60-
chinese_simplified = "".join(
61-
[char for char in text if "\u4e00" <= char <= "\u9fff"]
62-
)
63-
chinese_traditional = "".join(
64-
[
65-
char
66-
for char in text
67-
if "\u3400" <= char <= "\u4dbf" or "\u4e00" <= char <= "\u9fff"
68-
]
69-
)
70-
english_part = "".join([char for char in text if char.isascii()])
71-
other_text = "".join(
72-
[char for char in text if not (char.isascii() or "\u4e00" <= char <= "\u9fff")]
73-
)
74-
75-
return chinese_simplified, chinese_traditional, english_part, other_text
76-
77-
78-
def process_mixed_language(content: str):
79-
"""Process mixed language text"""
80-
chinese_simplified, chinese_traditional, english_text, other_text = (
81-
split_text_by_language(content)
82-
)
83-
84-
docs = []
85-
86-
if chinese_simplified:
87-
nlp_zh = load_nlp_for_language("zh")
88-
docs.append(nlp_zh(chinese_simplified))
89-
90-
if chinese_traditional:
91-
nlp_zh_hant = load_nlp_for_language("zh-hant")
92-
docs.append(nlp_zh_hant(chinese_traditional))
93-
94-
if english_text:
95-
nlp_en = load_nlp_for_language("en")
96-
docs.append(nlp_en(english_text))
97-
98-
if other_text:
99-
detected_lang = detect_language(other_text)
100-
if detected_lang in SUPPORTED_LANGUAGES:
101-
nlp_other = load_nlp_for_language(detected_lang)
102-
docs.append(nlp_other(other_text))
103-
104-
# Merge all processed documents
105-
doc = Doc.from_docs(docs)
106-
return doc
107-
108-
10946
class Document:
11047
def __init__(
11148
self,
@@ -132,13 +69,9 @@ def __init__(
13269

13370
if len(content) > MAX_BATCH_SIZE:
13471
# Process content in batches
135-
print("TOOO BIG!")
13672
docs = []
13773
detected_language = detect_language(content[0:MAX_BATCH_SIZE])
138-
if detected_language in SUPPORTED_LANGUAGES:
139-
nlp = load_nlp_for_language(detected_language)
140-
else:
141-
nlp = process_mixed_language
74+
nlp = load_nlp_for_language(detected_language)
14275

14376
for i in range(0, len(content), MAX_BATCH_SIZE):
14477
docs.append(nlp(content[i : i + MAX_BATCH_SIZE]))
@@ -148,12 +81,8 @@ def __init__(
14881
else:
14982
# Process smaller content, directly based on language
15083
detected_language = detect_language(content)
151-
if detected_language in SUPPORTED_LANGUAGES:
152-
nlp = load_nlp_for_language(detected_language)
153-
doc = nlp(content)
154-
else:
155-
# Process mixed language content
156-
doc = process_mixed_language(content)
84+
nlp = load_nlp_for_language(detected_language)
85+
doc = nlp(content)
15786

15887
self.spacy_doc = doc
15988

goldenverba/components/generation/NovitaGenerator.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
base_url = "https://api.novita.ai/v3/openai"
1414

15+
1516
class NovitaGenerator(Generator):
1617
"""
1718
Novita Generator.
@@ -85,14 +86,21 @@ async def generate_stream(
8586
json_line = json.loads(line)
8687
choice = json_line.get("choices")[0]
8788
yield {
88-
"message": choice.get("delta", {}).get("content", ""),
89+
"message": choice.get("delta", {}).get(
90+
"content", ""
91+
),
8992
"finish_reason": (
90-
"stop" if choice.get("finish_reason", "") == "stop" else ""
93+
"stop"
94+
if choice.get("finish_reason", "") == "stop"
95+
else ""
9196
),
9297
}
9398
else:
9499
error_message = await response.text()
95-
yield {"message": f"HTTP Error {response.status}: {error_message}", "finish_reason": "stop"}
100+
yield {
101+
"message": f"HTTP Error {response.status}: {error_message}",
102+
"finish_reason": "stop",
103+
}
96104

97105
def prepare_messages(
98106
self, query: str, context: str, conversation: list[dict], system_message: str
@@ -128,4 +136,4 @@ def get_models():
128136
return ["No Novita AI Model detected"]
129137
except Exception as e:
130138
# msg.fail(f"Couldn't connect to Novita AI: {e}")
131-
return [f"Couldn't connect to Novita AI"]
139+
return [f"Couldn't connect to Novita AI"]
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import pytest
2+
from goldenverba.components.document import Document, create_document
3+
from goldenverba.server.types import FileConfig
4+
5+
6+
def test_document_initialization():
7+
"""Test basic document initialization"""
8+
doc = Document(
9+
title="Test Doc",
10+
content="This is a test document.",
11+
extension=".txt",
12+
fileSize=23,
13+
labels=["test"],
14+
source="local",
15+
meta={"key": "value"},
16+
metadata="test metadata",
17+
)
18+
19+
assert doc.title == "Test Doc"
20+
assert doc.content == "This is a test document."
21+
assert doc.extension == ".txt"
22+
assert doc.fileSize == 23
23+
assert doc.labels == ["test"]
24+
assert doc.source == "local"
25+
assert doc.meta == {"key": "value"}
26+
assert doc.metadata == "test metadata"
27+
assert hasattr(doc, "spacy_doc")
28+
29+
30+
def test_document_json_serialization():
31+
"""Test document to/from JSON conversion"""
32+
original_doc = Document(
33+
title="Test Doc",
34+
content="Test content",
35+
extension=".txt",
36+
fileSize=12,
37+
labels=["test"],
38+
source="local",
39+
meta={"key": "value"},
40+
metadata="test metadata",
41+
)
42+
43+
# Convert to JSON
44+
json_dict = Document.to_json(original_doc)
45+
46+
# Convert back from JSON
47+
restored_doc = Document.from_json(json_dict, None)
48+
49+
assert restored_doc.title == original_doc.title
50+
assert restored_doc.content == original_doc.content
51+
assert restored_doc.extension == original_doc.extension
52+
assert restored_doc.fileSize == original_doc.fileSize
53+
assert restored_doc.labels == original_doc.labels
54+
assert restored_doc.source == original_doc.source
55+
assert restored_doc.metadata == original_doc.metadata
56+
57+
58+
def test_create_document_from_file_config():
59+
"""Test document creation from FileConfig"""
60+
# TODO: Add test
61+
assert True
62+
63+
64+
def test_document_with_large_content():
65+
"""Test document initialization with content larger than batch size"""
66+
large_content = "Test sentence. " * 50000 # Creates a large string
67+
doc = Document(content=large_content)
68+
69+
assert len(doc.content) > 500000 # Verify content is larger than MAX_BATCH_SIZE
70+
assert hasattr(doc, "spacy_doc")
71+
72+
73+
def test_invalid_json_document():
74+
"""Test document creation from invalid JSON"""
75+
invalid_dict = {"title": "Test"} # Missing required fields
76+
77+
doc = Document.from_json(invalid_dict, None)
78+
assert doc is None
79+
80+
81+
def test_special_characters_in_content():
82+
"""Test document initialization with special characters in content"""
83+
content = (
84+
"This is a test document with special characters: !@#$%^&*()_+-=[]{}|;:,.<>?~ "
85+
)
86+
content += "Hej detta är ett test, jag bor på en ö"
87+
doc = Document(content=content)
88+
assert doc.content == content
89+
assert doc.spacy_doc.text == content
90+
assert doc.spacy_doc.sents is not None
91+
92+
93+
def test_arabic_in_content():
94+
"""Test document initialization with Arabic in content"""
95+
content = "نص اختبار باللغة العربية"
96+
doc = Document(content=content)
97+
assert doc.content == content
98+
assert doc.spacy_doc.text == content
99+
assert doc.spacy_doc.sents is not None

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setup(
44
name="goldenverba",
5-
version="2.1.1",
5+
version="2.1.2",
66
packages=find_packages(),
77
python_requires=">=3.10.0,<3.13.0",
88
entry_points={

0 commit comments

Comments
 (0)