daxa-ai
diff --git a/‎pebblo_saferetriever/langchain/pebblo-saferag/google-qdrant-hf-groq/README.md‎
Lines changed: 90 additions & 0 deletions b/‎pebblo_saferetriever/langchain/pebblo-saferag/google-qdrant-hf-groq/README.md‎
Lines changed: 90 additions & 0 deletions
diff --git a/‎pebblo_saferetriever/langchain/pebblo-saferag/google-qdrant-hf-groq/constant.py‎
Lines changed: 18 additions & 0 deletions b/‎pebblo_saferetriever/langchain/pebblo-saferag/google-qdrant-hf-groq/constant.py‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎pebblo_saferetriever/langchain/pebblo-saferag/google-qdrant-hf-groq/generate_token.py‎
Lines changed: 20 additions & 0 deletions b/‎pebblo_saferetriever/langchain/pebblo-saferag/google-qdrant-hf-groq/generate_token.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎pebblo_saferetriever/langchain/pebblo-saferag/google-qdrant-hf-groq/google_auth.py‎
Lines changed: 35 additions & 0 deletions b/‎pebblo_saferetriever/langchain/pebblo-saferag/google-qdrant-hf-groq/google_auth.py‎
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,90 @@
+# Pebblo Open Source SafeRAG Demo
+
+A secure and semantic Retrieval-Augmented Generation (RAG) pipeline that combines multiple powerful components to provide a robust, privacy-focused document retrieval and question-answering system.
+
+## Core Components
+
+- **Document Source**: Google Drive integration for document ingestion
+- **Security**: PebbloSafeLoader for semantic filtering and access control
+- **Vector Store**: Qdrant for efficient document retrieval
+- **Embeddings**: Local HuggingFace embeddings for semantic search
+- **LLM**: Groq-powered Llama 3.3 for high-quality responses
+
+## Prerequisites
+
+1. Google Service Account with access to target Drive folder
+2. Qdrant vector database instance
+3. Required Python packages (see requirements.txt)
+4. Groq API key (set as `GROQ_API_KEY` in the `.env` file)
+
+## Setup Instructions
+
+### 1. Configure Environment
+
+- Set up Google Drive authentication:
+  - Follow the guide at: https://python.langchain.com/docs/integrations/document_loaders/google_drive/
+  - Create a service account and download credentials
+  - Share your target Google Drive folder with the service account email
+
+- Configure API Keys:
+  - Create a `.env` file in the project root
+  - Add your Groq API key: `GROQ_API_KEY=your_api_key_here`
+
+- Update `constant.py` with your configuration:
+  - Set `SERVICE_ACCOUNT_PATH` to your Google service account credentials
+  - Set `INPUT_FOLDER_ID` to your Google Drive folder ID
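+  - Fill in the remaining values (`KEY_PATH`, `INGESTION_USER_EMAIL_ADDRESS`, `TOKEN_PATH`) and adjust the defaults (model, collection name, Qdrant URL) as needed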
+
+### 2. Start Required Services
+
+#### Qdrant Vector Database
+```bash
+docker run -p 6333:6333 -p 6334:6334 qdrant/qdrant
+```
+
+#### Pebblo Server
+```bash
+pip install pebblo
+pebblo --config-file config.yaml
+```
+
+### 3. Run the Application
+
+```bash
+python pebblo_opensource_saferag.py
+```
+
+## Features
+
+- **Secure Document Ingestion**: Semantic filtering during document loading
+- **Identity-Based Access Control**: User-level permissions and authentication
+- **Content Filtering**: Topic and entity-based content filtering
+- **Interactive Interface**: User-friendly query interface
+- **Real-time Search**: Efficient semantic search and retrieval
+- **Privacy-Focused**: Local embeddings and secure data handling
+
+## Project Structure
+
+```
+google-qdrant-hf-groq/
+├── pebblo_opensource_saferag.py  # Main application file
+├── constant.py                   # Configuration settings
+├── utils.py                      # Utility functions
+├── google_auth.py                # Google authentication utilities
+├── generate_token.py             # Generates a Google OAuth token file
+├── .env                          # Environment variables
+└── README.md                     # This file
+```
+
+## Security and Privacy
+
+This implementation prioritizes security and privacy while maintaining high-quality retrieval and generation capabilities. Key security features include:
+
+- Semantic filtering of sensitive content
+- Identity-based access control
+- Local embedding generation
+- Secure API key management
+- Privacy-preserving document processing
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request.
@@ -0,0 +1,18 @@
+from dotenv import load_dotenv
+import os
+
+# Load environment variables from .env file
+load_dotenv()
+
+LLM_NAME = "llama-3.3-70b-versatile"
+LOADER_APP_NAME = "py_data_demo_loader"
+RETRIEVAL_APP_NAME = "py_data_demo_retriever"
+COLLECTION_NAME = "py_data_demo_collection"
+EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+VECTOR_DB_URL = "http://localhost:6333"
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+SERVICE_ACCOUNT_PATH = ""
+KEY_PATH = ""
+INPUT_FOLDER_ID = ""
+INGESTION_USER_EMAIL_ADDRESS = ""
+TOKEN_PATH = ""
@@ -0,0 +1,20 @@
+from google_auth_oauthlib.flow import InstalledAppFlow
+
+# Define the API scopes you need:
+SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]  # Example
+
+
+def main():
+    creds = None
+    flow = InstalledAppFlow.from_client_secrets_file(
+        "<Entere file name>", SCOPES
+    )  # Replace with your credentials file
+    creds = flow.run_local_server(port=0)  # Opens a browser for auth
+    # Save the credentials to a file
+    with open("<Enter output file name>", "w") as token:
+        token.write(creds.to_json())
+    print("Token saved to google_token.json")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,35 @@
+from typing import List
+
+from google.oauth2 import service_account
+from googleapiclient.discovery import build
+
+
+def get_authorized_identities(
+    admin_user_email_address: str, credentials_file_path: str, user_email: str
+) -> List[str]:
+    """
+    Get authorized identities from Google Directory API
+    """
+    _authorized_identities = [user_email]
+    print(
+        f"User: {user_email}, \nAuthorized Identities: {admin_user_email_address}\n {credentials_file_path}"
+    )
+    credentials = service_account.Credentials.from_service_account_file(
+        credentials_file_path,
+        scopes=[
+            "https://www.googleapis.com/auth/admin.directory.group.readonly",
+            "https://www.googleapis.com/auth/admin.directory.group",
+        ],
+        subject=admin_user_email_address,
+    )
+    directory_service = build("admin", "directory_v1", credentials=credentials)
+
+    try:
+        groups = directory_service.groups().list(userKey=user_email).execute()
+        for group in groups.get("groups", []):
+            group_email = group["email"]
+            _authorized_identities.append(group_email)
+    except Exception as e:
+        print(f"Error in : {e}")
+    print(f"User: {user_email}, \nAuthorized Identities: {_authorized_identities}\n")
+    return _authorized_identities