
Commit b9bf9c5

Merge branch 'mindsdb:main' into patch-1
2 parents 722911d + 8f44e3b commit b9bf9c5

13 files changed, +612 -38 lines changed

.github/workflows/test_on_deploy.yml

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.8', '3.9','3.10', '3.11']
+        python-version: ['3.10']
     steps:
       - name: Checkout code
         uses: actions/checkout@v2
@@ -28,4 +28,4 @@ jobs:
       env:
         PYTHONPATH: ./
         API_KEY: ${{ secrets.API_KEY }}
-        BASE_URL: ${{ secrets.BASE_URL }}
+        BASE_URL: 'https://mdb.ai'

README.md

Lines changed: 11 additions & 2 deletions
@@ -149,5 +149,14 @@ client.datasources.drop('my_datasource')
 ```
 >Note: The SDK currently does not support automatically removing a data source if it is no longer connected to any mind.
 
-### Other SDKs
-#### [Command-Line](https://github.com/Better-Boy/minds-cli-sdk)
+### Community Supported SDKs
+
+- [Java-SDK](https://github.com/Better-Boy/minds-java-sdk)
+- [Ruby-SDK](https://github.com/tungnt1203/minds_ruby_sdk)
+- [Dart-SDK](https://github.com/ArnavK-09/mdb_dart)
+- [C# SDK](https://github.com/priyanshuverma-dev/Minds.SDK)
+- [Go SDK](https://github.com/Abiji-2020/minds-go-sdk)
+
+#### Command Line Tools
+- [Minds CLI](https://github.com/Better-Boy/minds-cli-sdk)
+

minds/__about__.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 __title__ = 'minds_sdk'
 __package_name__ = 'minds'
-__version__ = '1.0.8'
+__version__ = '1.2.0'
 __description__ = 'An AI-Data Mind is an LLM with the built-in power to answer data questions for Agents'
 __email__ = '[email protected]'
 __author__ = 'MindsDB Inc'

minds/client.py

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,7 @@
 from minds.rest_api import RestAPI
 
 from minds.datasources import Datasources
+from minds.knowledge_bases import KnowledgeBases
 from minds.minds import Minds
 
 
@@ -12,5 +13,6 @@ def __init__(self, api_key, base_url=None):
         self.api = RestAPI(api_key, base_url)
 
         self.datasources = Datasources(self)
+        self.knowledge_bases = KnowledgeBases(self)
 
         self.minds = Minds(self)
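
With this change the client exposes knowledge bases alongside datasources and minds. A minimal usage sketch, assuming the client class is Client as in the SDK README and using a placeholder API key:

from minds.client import Client

client = Client(api_key='YOUR_API_KEY')  # base_url is optional per the signature above

client.datasources       # existing Datasources manager
client.knowledge_bases   # KnowledgeBases manager added in this commit
client.minds             # existing Minds manager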

minds/datasources/datasources.py

Lines changed: 4 additions & 2 deletions
@@ -1,7 +1,7 @@
 from typing import List, Optional, Union
 
 from pydantic import BaseModel, Field
-
+import minds.utils as utils
 import minds.exceptions as exc
 
 class DatabaseConfig(BaseModel):
@@ -37,8 +37,10 @@ def create(self, ds_config: DatabaseConfig, update=False):
 
         name = ds_config.name
 
+        utils.validate_datasource_name(name)
+
         if update:
-            self.api.put('/datasources', data=ds_config.model_dump())
+            self.api.put(f'/datasources/{name}', data=ds_config.model_dump())
         else:
             self.api.post('/datasources', data=ds_config.model_dump())
         return self.get(name)
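
The create() change above validates the datasource name before any request and, for updates, targets the named resource. A hedged sketch of how this surfaces to callers; every DatabaseConfig field other than name is illustrative and not taken from this diff:

from minds.datasources.datasources import DatabaseConfig

config = DatabaseConfig(
    name='my_datasource',
    engine='postgres',                     # illustrative field, not shown in this diff
    connection_data={'host': 'examplehost', 'database': 'exampledb'},  # placeholders
)

client.datasources.create(config)               # POST /datasources after validate_datasource_name(name)
client.datasources.create(config, update=True)  # now PUT /datasources/my_datasource instead of PUT /datasources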

minds/exceptions.py

Lines changed: 4 additions & 0 deletions
@@ -20,4 +20,8 @@ class UnknownError(Exception):
 
 
 class MindNameInvalid(Exception):
+    ...
+
+
+class DatasourceNameInvalid(Exception):
     ...
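
The new DatasourceNameInvalid sits next to MindNameInvalid. Given the validate_datasource_name() call added in minds/datasources/datasources.py, a caller would presumably guard datasource creation like this (the exact raising site is an assumption, not shown in this diff):

import minds.exceptions as exc

try:
    client.datasources.create(config)
except exc.DatasourceNameInvalid:
    # assumption: raised by utils.validate_datasource_name for names the API rejects
    print('choose a different datasource name')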

minds/knowledge_bases/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from .knowledge_bases import *
minds/knowledge_bases/knowledge_bases.py

Lines changed: 175 additions & 0 deletions
@@ -0,0 +1,175 @@
+from typing import Any, Dict, List, Optional, Union
+
+from pydantic import BaseModel
+
+from minds.knowledge_bases.preprocessing import PreprocessingConfig
+from minds.rest_api import RestAPI
+
+
+class VectorStoreConfig(BaseModel):
+    '''Configuration for the underlying vector store for knowledge base embeddings'''
+    engine: str
+    connection_data: Dict[str, Any]
+    table: str = 'embeddings'
+
+
+class EmbeddingConfig(BaseModel):
+    '''Configuration for embeddings to use with underlying vector store for knowledge base'''
+    provider: str
+    model: str
+    params: Optional[Dict[str, Any]] = None
+
+
+class KnowledgeBaseConfig(BaseModel):
+    '''Configuration for a knowledge base'''
+    name: str
+    description: str
+    vector_store_config: Optional[VectorStoreConfig] = None
+    embedding_config: Optional[EmbeddingConfig] = None
+    # Params to apply to retrieval pipeline.
+    params: Optional[Dict] = None
+
+
+class KnowledgeBaseDocument(BaseModel):
+    '''Represents a document that can be inserted into a knowledge base'''
+    id: Union[int, str]
+    content: str
+    metadata: Optional[Dict[str, Any]] = {}
+
+
+class KnowledgeBase:
+    def __init__(self, name, api: RestAPI):
+        self.name = name
+        self.api = api
+
+    def insert_from_select(self, query: str, preprocessing_config: PreprocessingConfig = None):
+        '''
+        Inserts select content of a connected datasource into this knowledge base
+
+        :param query: The SQL SELECT query to use to retrieve content to be inserted
+        '''
+        update_request = {
+            'query': query
+        }
+        if preprocessing_config is not None:
+            update_request['preprocessing'] = preprocessing_config.model_dump()
+        _ = self.api.put(f'/knowledge_bases/{self.name}', data=update_request)
+
+    def insert_documents(self, documents: List[KnowledgeBaseDocument], preprocessing_config: PreprocessingConfig = None):
+        '''
+        Inserts documents directly into this knowledge base
+
+        :param documents: The documents to insert
+        '''
+        update_request = {
+            'rows': [d.model_dump() for d in documents]
+        }
+        if preprocessing_config is not None:
+            update_request['preprocessing'] = preprocessing_config.model_dump()
+        _ = self.api.put(f'/knowledge_bases/{self.name}', data=update_request)
+
+    def insert_urls(self, urls: List[str], preprocessing_config: PreprocessingConfig = None):
+        '''
+        Crawls URLs & inserts the retrieved webpages into this knowledge base
+
+        :param urls: Valid URLs to crawl & insert
+        '''
+        update_request = {
+            'urls': urls
+        }
+        if preprocessing_config is not None:
+            update_request['preprocessing'] = preprocessing_config.model_dump()
+        _ = self.api.put(f'/knowledge_bases/{self.name}', data=update_request)
+
+    def insert_files(self, files: List[str], preprocessing_config: PreprocessingConfig = None):
+        '''
+        Inserts files that have already been uploaded to MindsDB into this knowledge base
+
+        :param files: Names of preuploaded files to insert
+        '''
+        update_request = {
+            'files': files
+        }
+        if preprocessing_config is not None:
+            update_request['preprocessing'] = preprocessing_config.model_dump()
+        _ = self.api.put(f'/knowledge_bases/{self.name}', data=update_request)
+
+
+class KnowledgeBases:
+    def __init__(self, client):
+        self.api = client.api
+
+    def create(self, config: KnowledgeBaseConfig) -> KnowledgeBase:
+        '''
+        Create new knowledge base and return it
+
+        :param config: knowledge base configuration, properties:
+          - name: str, name of knowledge base
+          - description: str, description of the knowledge base. Used by minds to know what data can be retrieved.
+          - vector_store_config: VectorStoreConfig, configuration for embeddings vector store.
+          - embedding_config: EmbeddingConfig, configuration for embeddings.
+        :return: knowledge base object
+        '''
+        create_request = {
+            'name': config.name,
+            'description': config.description
+        }
+        if config.vector_store_config is not None:
+            vector_store_data = {
+                'engine': config.vector_store_config.engine,
+                'connection_data': config.vector_store_config.connection_data
+            }
+            create_request['vector_store'] = vector_store_data
+        if config.embedding_config is not None:
+            embedding_data = {
+                'provider': config.embedding_config.provider,
+                'name': config.embedding_config.model
+            }
+            if config.embedding_config.params is not None:
+                embedding_data.update(config.embedding_config.params)
+            create_request['embedding_model'] = embedding_data
+        if config.params is not None:
+            create_request['params'] = config.params
+
+        _ = self.api.post('/knowledge_bases', data=create_request)
+        return self.get(config.name)
+
+    def list(self) -> List[KnowledgeBase]:
+        '''
+        Returns list of knowledge bases
+
+        :return: iterable knowledge bases
+        '''
+
+        list_knowledge_bases_response = self.api.get('/knowledge_bases')
+        knowledge_bases = list_knowledge_bases_response.json()
+
+        all_knowledge_bases = []
+        for knowledge_base in knowledge_bases:
+            all_knowledge_bases.append(KnowledgeBase(knowledge_base['name'], self.api))
+        return all_knowledge_bases
+
+    def get(self, name: str) -> KnowledgeBase:
+        '''
+        Get knowledge base by name
+
+        :param name: name of knowledge base
+        :return: knowledge base object
+        '''
+
+        knowledge_base_response = self.api.get(f'/knowledge_bases/{name}')
+        knowledge_base = knowledge_base_response.json()
+        return KnowledgeBase(knowledge_base['name'], self.api)
+
+    def drop(self, name: str, force=False):
+        '''
+        Drop knowledge base by name
+
+        :param name: name of knowledge base
+        :param force: if True - remove from all minds, default: False
+        '''
+        data = None
+        if force:
+            data = {'cascade': True}
+
+        self.api.delete(f'/knowledge_bases/{name}', data=data)
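
Taken together, the new module gives the client a full knowledge base lifecycle. A usage sketch built only from the names defined above; the client class name follows the SDK README, and the knowledge base name, description, document contents and URL are placeholders:

from minds.client import Client
from minds.knowledge_bases import KnowledgeBaseConfig, KnowledgeBaseDocument

client = Client(api_key='YOUR_API_KEY')

kb = client.knowledge_bases.create(KnowledgeBaseConfig(
    name='support_articles',
    description='Product support articles used to answer customer questions',
))

kb.insert_documents([
    KnowledgeBaseDocument(id=1, content='How to reset a password...', metadata={'source': 'faq'}),
])
kb.insert_urls(['https://example.com/docs/getting-started'])
kb.insert_from_select('SELECT id, body FROM articles')  # placeholder query against a connected datasource

for existing in client.knowledge_bases.list():
    print(existing.name)

client.knowledge_bases.drop('support_articles', force=True)  # cascade removal from any minds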
minds/knowledge_bases/preprocessing.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+from typing import Any, Dict, List, Literal, Optional
+
+from pydantic import BaseModel, Field, model_validator
+
+
+DEFAULT_LLM_MODEL = 'gpt-4o'
+DEFAULT_LLM_MODEL_PROVIDER = 'openai'
+
+
+class TextChunkingConfig(BaseModel):
+    '''Configuration for chunking text content before they are inserted into a knowledge base'''
+    separators: List[str] = Field(
+        default=['\n\n', '\n', ' ', ''],
+        description='List of separators to use for splitting text, in order of priority'
+    )
+    chunk_size: int = Field(
+        default=1000,
+        description='The target size of each text chunk',
+        gt=0
+    )
+    chunk_overlap: int = Field(
+        default=200,
+        description='The number of characters to overlap between chunks',
+        ge=0
+    )
+
+
+class LLMConfig(BaseModel):
+    model_name: str = Field(default=DEFAULT_LLM_MODEL, description='LLM model to use for context generation')
+    provider: str = Field(default=DEFAULT_LLM_MODEL_PROVIDER, description='LLM model provider to use for context generation')
+    params: Dict[str, Any] = Field(default={}, description='Additional parameters to pass in when initializing the LLM')
+
+
+class ContextualConfig(BaseModel):
+    '''Configuration specific to contextual preprocessing'''
+    llm_config: LLMConfig = Field(
+        default=LLMConfig(),
+        description='LLM configuration to use for context generation'
+    )
+    context_template: Optional[str] = Field(
+        default=None,
+        description='Custom template for context generation'
+    )
+    chunk_size: int = Field(
+        default=1000,
+        description='The target size of each text chunk',
+        gt=0
+    )
+    chunk_overlap: int = Field(
+        default=200,
+        description='The number of characters to overlap between chunks',
+        ge=0
+    )
+
+
+class PreprocessingConfig(BaseModel):
+    '''Complete preprocessing configuration'''
+    type: Literal['contextual', 'text_chunking'] = Field(
+        default='text_chunking',
+        description='Type of preprocessing to apply'
+    )
+    contextual_config: Optional[ContextualConfig] = Field(
+        default=None,
+        description='Configuration for contextual preprocessing'
+    )
+    text_chunking_config: Optional[TextChunkingConfig] = Field(
+        default=None,
+        description='Configuration for text chunking preprocessing'
+    )
+
+    @model_validator(mode='after')
+    def validate_config_presence(self) -> 'PreprocessingConfig':
+        '''Ensure the appropriate config is present for the chosen type'''
+        if self.type == 'contextual' and not self.contextual_config:
+            self.contextual_config = ContextualConfig()
+        if self.type == 'text_chunking' and not self.text_chunking_config:
+            self.text_chunking_config = TextChunkingConfig()
+        return self
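
The insert_* methods in minds/knowledge_bases/knowledge_bases.py accept one of these configurations. A short sketch of building both preprocessing variants and passing one along; the chunk sizes simply repeat the declared defaults, and kb stands for a KnowledgeBase obtained as in the earlier sketch:

from minds.knowledge_bases.preprocessing import ContextualConfig, LLMConfig, PreprocessingConfig

# Plain text chunking; the model_validator fills in TextChunkingConfig() defaults.
text_chunking = PreprocessingConfig(type='text_chunking')

# Contextual preprocessing with an explicit LLM configuration.
contextual = PreprocessingConfig(
    type='contextual',
    contextual_config=ContextualConfig(
        llm_config=LLMConfig(model_name='gpt-4o', provider='openai'),
        chunk_size=1000,
        chunk_overlap=200,
    ),
)

kb.insert_from_select('SELECT id, body FROM articles', preprocessing_config=contextual)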
