Commit f927a15

feat: move sources to core package (#496)
1 parent: 10ef831

73 files changed (+333 −305 lines)
docs/api_reference/core/sources.md (17 additions, 0 deletions)

````diff
@@ -0,0 +1,17 @@
+# Sources
+
+::: ragbits.core.sources.base.Source
+
+::: ragbits.core.sources.azure.AzureBlobStorageSource
+
+::: ragbits.core.sources.gcs.GCSSource
+
+::: ragbits.core.sources.git.GitSource
+
+::: ragbits.core.sources.hf.HuggingFaceSource
+
+::: ragbits.core.sources.local.LocalFileSource
+
+::: ragbits.core.sources.s3.S3Source
+
+::: ragbits.core.sources.web.WebSource
````

docs/api_reference/document_search/documents.md (0 additions, 25 deletions)

This file was deleted.

New file (9 additions, 0 deletions)

````diff
@@ -0,0 +1,9 @@
+# Documents
+
+::: ragbits.document_search.documents.document.Document
+
+::: ragbits.document_search.documents.document.TextDocument
+
+::: ragbits.document_search.documents.document.DocumentMeta
+
+::: ragbits.document_search.documents.document.DocumentType
````

New file (7 additions, 0 deletions)

````diff
@@ -0,0 +1,7 @@
+# Elements
+
+::: ragbits.document_search.documents.element.Element
+
+::: ragbits.document_search.documents.element.TextElement
+
+::: ragbits.document_search.documents.element.ImageElement
````

docs/how-to/document_search/ingest-documents.md (6 additions, 64 deletions)

````diff
@@ -2,9 +2,9 @@
 
 The Ragbits document ingest pipeline consists of four main steps: loading, parsing, enrichment, and indexing. All of these steps can be orchestrated using different strategies, depending on the expected load.
 
-## Loading sources
+## Loading dataset
 
-Before a document can be processed, it must be defined and downloaded. In Ragbits, there are a few ways to do this: you can specify the source URI, the source instance, the document metadata or the document itself.
+Before processing a document in Ragbits, it must first be defined and downloaded. This can be done in several ways: by specifying a source URI or using an instance of [`Source`][ragbits.core.sources.base.Source], [`DocumentMeta`][ragbits.document_search.documents.document.DocumentMeta] or [`Document`][ragbits.document_search.documents.document.Document].
 
 === "URI"
 
@@ -19,7 +19,7 @@ Before a document can be processed, it must be defined and downloaded. In Ragbit
 === "Source"
 
     ```python
-    from ragbits.document_search.documents.sources import WebSource
+    from ragbits.core.sources import WebSource
     from ragbits.document_search import DocumentSearch
 
     document_search = DocumentSearch(...)
@@ -49,65 +49,7 @@ Before a document can be processed, it must be defined and downloaded. In Ragbit
     await document_search.ingest([Document(...), ...])
     ```
 
-### Supported sources
-
-This is the list of currently supported sources by Ragbits.
-
-| Source | URI Schema | Class |
-|-|-|-|
-| Azure Blob Storage | `azure://https://account_name.blob.core.windows.net/<container-name>|<blob-name>` | [`AzureBlobStorageSource`][ragbits.document_search.documents.sources.AzureBlobStorageSource] |
-| Google Cloud Storage | `gcs://<bucket-name>/<prefix>` | [`GCSSource`][ragbits.document_search.documents.sources.GCSSource] |
-| Git | `git://<https-url>|<ssh-url>` | [`GitSource`][ragbits.document_search.documents.sources.GitSource] |
-| Hugging Face | `huggingface://<dataset-path>/<split>/<row>` | [`HuggingFaceSource`][ragbits.document_search.documents.sources.HuggingFaceSource] |
-| Local file | `file://<file-path>|<blob-pattern>` | [`LocalFileSource`][ragbits.document_search.documents.sources.LocalFileSource] |
-| Amazon S3 | `s3://<bucket-name>/<prefix>` | [`S3Source`][ragbits.document_search.documents.sources.S3Source] |
-| Web | `web://<https-url>` | [`WebSource`][ragbits.document_search.documents.sources.WebSource] |
-
-To define a new sources, extend the [`Source`][ragbits.document_search.documents.sources.Source] class.
-
-```python
-from ragbits.document_search.documents.sources import Source
-
-
-class CustomSource(Source):
-    """
-    Source that downloads file from the web.
-    """
-
-    protocol: ClassVar[str] = "custom"
-    source_url: str
-    ...
-
-    @property
-    def id(self) -> str:
-        """
-        Source unique identifier.
-        """
-        return f"{self.protocol}:{self.source_url}"
-
-    @classmethod
-    async def from_uri(cls, uri: str) -> list[Self]:
-        """
-        Create source instances from a URI path.
-
-        Args:
-            uri: The URI path.
-
-        Returns:
-            The list of sources.
-        """
-        return [cls(...), ...]
-
-    async def fetch(self) -> Path:
-        """
-        Download a file for the given url.
-
-        Returns:
-            The local path to the downloaded file.
-        """
-        ...
-        return Path(f"/tmp/{self.source_url}")
-```
+All sources supported by Ragbits are available [here](../sources/load-dataset.md#supported-sources).
 
 ## Parsing documents
 
@@ -290,7 +232,7 @@ Running an ingest pipeline can be time-consuming, depending on your expected loa
 --address http://<cluster_address>:8265 \
 --runtime-env '{"pip": ["ragbits-core", "ragbits-document-search[ray]"]}' \
 --working-dir . \
---python script.py
+-- python script.py
 ```
 
 There are also other ways to submit jobs to the Ray cluster. For more information, please refer to the [Ray documentation](https://docs.ray.io/en/latest/ray-overview/index.html).
@@ -300,7 +242,7 @@ To define a new ingest strategy, extend the [`IngestStrategy`][ragbits.document_
 ```python
 from ragbits.core.vector_stores import VectorStore
 from ragbits.document_search.documents.document import Document, DocumentMeta
-from ragbits.document_search.documents.sources import Source
+from ragbits.core.sources import Source
 from ragbits.document_search.ingestion.enrichers import ElementEnricherRouter
 from ragbits.document_search.ingestion.parsers import DocumentParserRouter
 from ragbits.document_search.ingestion.strategies import (
````
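As context for the loading options this diff touches, here is a minimal usage sketch (illustrative, not part of the commit). The `web://` URI is hypothetical, the `DocumentSearch` configuration is omitted, and the `ingest` call shape follows the snippets shown above.

```python
import asyncio

from ragbits.document_search import DocumentSearch


async def main() -> None:
    # DocumentSearch configuration (embedder, vector store, ...) omitted here;
    # see the how-to for the full setup.
    document_search = DocumentSearch(...)

    # Ingest by source URI (hypothetical web URL); equivalent calls accept
    # Source, DocumentMeta or Document instances, as the tabs above describe.
    await document_search.ingest(["web://https://example.com/handbook.pdf"])


asyncio.run(main())
```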
docs/how-to/sources/load-dataset.md (new file, 65 additions)

````diff
@@ -0,0 +1,65 @@
+# How-To: Load dataset with sources
+
+Ragbits provides an abstraction for handling datasets. The [`Source`][ragbits.core.sources.Source] component is designed to define interactions with any data source, such as downloading and querying.
+
+## Supported sources
+
+This is the list of currently supported sources by Ragbits.
+
+| Source | URI Schema | Class |
+|-|-|-|
+| Azure Blob Storage | `azure://https://account_name.blob.core.windows.net/<container-name>|<blob-name>` | [`AzureBlobStorageSource`][ragbits.core.sources.AzureBlobStorageSource] |
+| Google Cloud Storage | `gcs://<bucket-name>/<prefix>` | [`GCSSource`][ragbits.core.sources.GCSSource] |
+| Git | `git://<https-url>|<ssh-url>` | [`GitSource`][ragbits.core.sources.GitSource] |
+| Hugging Face | `hf://<dataset-path>/<split>/<row>` | [`HuggingFaceSource`][ragbits.core.sources.HuggingFaceSource] |
+| Local file | `file://<file-path>|<blob-pattern>` | [`LocalFileSource`][ragbits.core.sources.LocalFileSource] |
+| Amazon S3 | `s3://<bucket-name>/<prefix>` | [`S3Source`][ragbits.core.sources.S3Source] |
+| Web | `web://<https-url>` | [`WebSource`][ragbits.core.sources.WebSource] |
+
+## Custom source
+
+To define a new sources, extend the [`Source`][ragbits.core.sources.Source] class.
+
+```python
+from ragbits.core.sources import Source
+
+
+class CustomSource(Source):
+    """
+    Source that downloads file from the web.
+    """
+
+    protocol: ClassVar[str] = "custom"
+    source_url: str
+    ...
+
+    @property
+    def id(self) -> str:
+        """
+        Source unique identifier.
+        """
+        return f"{self.protocol}:{self.source_url}"
+
+    @classmethod
+    async def from_uri(cls, uri: str) -> list[Self]:
+        """
+        Create source instances from a URI path.
+
+        Args:
+            uri: The URI path.
+
+        Returns:
+            The list of sources.
+        """
+        return [cls(...), ...]
+
+    async def fetch(self) -> Path:
+        """
+        Download a file for the given url.
+
+        Returns:
+            The local path to the downloaded file.
+        """
+        ...
+        return Path(f"/tmp/{self.source_url}")
+```
````
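A minimal usage sketch of the `Source` interface added above (illustrative, not part of the commit). The `file://` glob URI and paths are hypothetical and follow the schema in the supported-sources table; `from_uri` and `fetch` are the async methods documented in the interface.

```python
import asyncio

from ragbits.core.sources import LocalFileSource


async def main() -> None:
    # Expand a hypothetical glob-style file URI into one source per matching file,
    # per the `file://<file-path>|<blob-pattern>` schema in the table above.
    sources = await LocalFileSource.from_uri("file:///data/docs/*.md")
    for source in sources:
        local_path = await source.fetch()  # resolve/download to a local path
        print(source.id, local_path)


asyncio.run(main())
```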

docs/quickstart/quickstart2_rag.md (1 addition, 1 deletion)

````diff
@@ -43,7 +43,7 @@ We first need to direct Ragbits to the location of the documents to load them. T
 
 ```python
 from pathlib import Path
-from ragbits.document_search.documents.sources import LocalFileSource
+from ragbits.core.sources import LocalFileSource
 
 # Path to the directory with markdown files to ingest
 documents_path = Path(__file__).parent / "pb-source/en"
````
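The quickstart snippet stops after defining `documents_path`; a possible continuation (illustrative, not part of the commit) could ingest the matching files, assuming the glob form of the `file://` URI from the supported-sources table and a `DocumentSearch` configured elsewhere.

```python
import asyncio
from pathlib import Path

from ragbits.core.sources import LocalFileSource
from ragbits.document_search import DocumentSearch

# Path to the directory with markdown files to ingest (from the quickstart above)
documents_path = Path(__file__).parent / "pb-source/en"


async def main() -> None:
    # Build one LocalFileSource per markdown file via a glob-pattern file URI.
    sources = await LocalFileSource.from_uri(f"file://{documents_path}/*.md")

    document_search = DocumentSearch(...)  # embedder/vector store configured elsewhere
    await document_search.ingest(sources)


asyncio.run(main())
```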

examples/document-search/multimodal.py (1 addition, 1 deletion)

````diff
@@ -34,12 +34,12 @@
 from pathlib import Path
 
 from ragbits.core.embeddings.vertex_multimodal import VertexAIMultimodelEmbedder
+from ragbits.core.sources import LocalFileSource
 from ragbits.core.vector_stores.base import EmbeddingType
 from ragbits.core.vector_stores.hybrid import HybridSearchVectorStore
 from ragbits.core.vector_stores.in_memory import InMemoryVectorStore
 from ragbits.document_search import DocumentSearch
 from ragbits.document_search.documents.document import DocumentMeta, DocumentType
-from ragbits.document_search.documents.sources import LocalFileSource
 from ragbits.document_search.ingestion.parsers.base import ImageDocumentParser
 from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
 
````

Source configuration file (1 addition, 1 deletion)

````diff
@@ -1,4 +1,4 @@
-type: ragbits.document_search.documents.sources.hf:HuggingFaceSource
+type: ragbits.core.sources.hf:HuggingFaceSource
 config:
   path: "micpst/hf-docs"
   split: "train[:5]"
````

examples/evaluation/document-search/advanced/evaluate.py (2 additions, 2 deletions)

````diff
@@ -1,8 +1,8 @@
 # /// script
 # requires-python = ">=3.10"
 # dependencies = [
-# "ragbits-core[chroma]",
-# "ragbits-document-search[huggingface]",
+# "ragbits-core[chroma,hf]",
+# "ragbits-document-search",
 # "ragbits-evaluate[relari]",
 # ]
 # ///
````
