@@ -0,0 +1,59 @@
"""Add Webcrawler connector enums

Revision ID: 38
Revises: 37
Create Date: 2025-11-17 17:00:00.000000

"""

from collections.abc import Sequence

from alembic import op

revision: str = "38"
down_revision: str | None = "37"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None


def upgrade() -> None:
"""Safely add 'WEBCRAWLER_CONNECTOR' to enum types if missing."""

# Add to searchsourceconnectortype enum
op.execute(
"""
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_type t
JOIN pg_enum e ON t.oid = e.enumtypid
WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'WEBCRAWLER_CONNECTOR'
) THEN
ALTER TYPE searchsourceconnectortype ADD VALUE 'WEBCRAWLER_CONNECTOR';
END IF;
END
$$;
"""
)

# Add to documenttype enum
op.execute(
"""
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_type t
JOIN pg_enum e ON t.oid = e.enumtypid
WHERE t.typname = 'documenttype' AND e.enumlabel = 'CRAWLED_URL'
) THEN
ALTER TYPE documenttype ADD VALUE 'CRAWLED_URL';
END IF;
END
$$;
"""
)


def downgrade() -> None:
    """No-op: PostgreSQL cannot drop enum values without recreating the type."""
    pass
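
Because PostgreSQL enum additions cannot be rolled back in place, a quick post-migration check is the easiest way to confirm both labels landed. Below is a minimal verification sketch, not part of this PR, assuming a synchronous SQLAlchemy engine and a placeholder connection URL:

# Hypothetical verification helper, not part of this migration. The connection
# URL is a placeholder; point it at the database the migration ran against.
from sqlalchemy import create_engine, text

engine = create_engine("postgresql+psycopg2://user:pass@localhost/surfsense")

ENUM_LABELS_QUERY = text(
    """
    SELECT e.enumlabel
    FROM pg_type t
    JOIN pg_enum e ON t.oid = e.enumtypid
    WHERE t.typname = :type_name
    ORDER BY e.enumsortorder
    """
)

with engine.connect() as conn:
    # Collect the labels of each enum type touched by the migration.
    connector_labels = [
        row[0]
        for row in conn.execute(ENUM_LABELS_QUERY, {"type_name": "searchsourceconnectortype"})
    ]
    document_labels = [
        row[0] for row in conn.execute(ENUM_LABELS_QUERY, {"type_name": "documenttype"})
    ]

assert "WEBCRAWLER_CONNECTOR" in connector_labels
assert "CRAWLED_URL" in document_labels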
4 changes: 2 additions & 2 deletions surfsense_backend/app/agents/researcher/nodes.py
@@ -667,7 +667,7 @@ async def fetch_relevant_documents(
                 }
             )
 
-        elif connector == "CRAWLED_URL":
+        elif connector == "WEBCRAWLER_CONNECTOR":
             (
                 source_object,
                 crawled_urls_chunks,
@@ -689,7 +689,7 @@ async def fetch_relevant_documents(
             writer(
                 {
                     "yield_value": streaming_service.format_terminal_info_delta(
-                        f"🌐 Found {len(crawled_urls_chunks)} Web Pages chunks related to your query"
+                        f"🌐 Found {len(crawled_urls_chunks)} Web Page chunks related to your query"
                     )
                 }
             )
@@ -17,7 +17,6 @@
 {chat_history_section}
 <knowledge_sources>
 - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
-- CRAWLED_URL: "Webpages indexed by SurfSense web crawler" (personally selected websites)
 - FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files)
 - SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications)
 - NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management)
@@ -35,6 +34,7 @@
 - TAVILY_API: "Tavily search API results" (personalized search results)
 - LINKUP_API: "Linkup search API results" (personalized search results)
 - LUMA_CONNECTOR: "Luma events"
+- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
 </knowledge_sources>
 
 <instructions>
4 changes: 2 additions & 2 deletions surfsense_backend/app/agents/researcher/utils.py
@@ -19,7 +19,6 @@ def get_connector_emoji(connector_name: str) -> str:
     connector_emojis = {
         "YOUTUBE_VIDEO": "📹",
         "EXTENSION": "🧩",
-        "CRAWLED_URL": "🌐",
         "FILE": "📄",
         "SLACK_CONNECTOR": "💬",
         "NOTION_CONNECTOR": "📘",
@@ -34,6 +33,7 @@ def get_connector_emoji(connector_name: str) -> str:
         "AIRTABLE_CONNECTOR": "🗃️",
         "LUMA_CONNECTOR": "✨",
         "ELASTICSEARCH_CONNECTOR": "⚡",
+        "WEBCRAWLER_CONNECTOR": "🌐",
     }
     return connector_emojis.get(connector_name, "🔎")
 
@@ -43,7 +43,6 @@ def get_connector_friendly_name(connector_name: str) -> str:
     connector_friendly_names = {
         "YOUTUBE_VIDEO": "YouTube",
         "EXTENSION": "Browser Extension",
-        "CRAWLED_URL": "Web Pages",
         "FILE": "Files",
         "SLACK_CONNECTOR": "Slack",
         "NOTION_CONNECTOR": "Notion",
@@ -59,6 +58,7 @@ def get_connector_friendly_name(connector_name: str) -> str:
         "AIRTABLE_CONNECTOR": "Airtable",
         "LUMA_CONNECTOR": "Luma",
         "ELASTICSEARCH_CONNECTOR": "Elasticsearch",
+        "WEBCRAWLER_CONNECTOR": "Web Pages",
     }
     return connector_friendly_names.get(connector_name, connector_name)

3 changes: 0 additions & 3 deletions surfsense_backend/app/config/__init__.py
@@ -208,9 +208,6 @@ class Config:
     # LlamaCloud API Key
     LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
 
-    # Firecrawl API Key
-    FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None)
-
     # Litellm TTS Configuration
     TTS_SERVICE = os.getenv("TTS_SERVICE")
     TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")
191 changes: 191 additions & 0 deletions surfsense_backend/app/connectors/webcrawler_connector.py
@@ -0,0 +1,191 @@
"""
WebCrawler Connector Module

A module for crawling web pages and extracting content using Firecrawl or AsyncChromiumLoader.
Provides a unified interface for web scraping.
"""

from typing import Any

import validators
from firecrawl import AsyncFirecrawlApp
from langchain_community.document_loaders import AsyncChromiumLoader


class WebCrawlerConnector:
"""Class for crawling web pages and extracting content."""

def __init__(self, firecrawl_api_key: str | None = None):
"""
Initialize the WebCrawlerConnector class.

Args:
firecrawl_api_key: Firecrawl API key (optional, will use AsyncChromiumLoader if not provided)
"""
self.firecrawl_api_key = firecrawl_api_key
self.use_firecrawl = bool(firecrawl_api_key)

def set_api_key(self, api_key: str) -> None:
"""
Set the Firecrawl API key and enable Firecrawl usage.

Args:
api_key: Firecrawl API key
"""
self.firecrawl_api_key = api_key
self.use_firecrawl = True

async def crawl_url(
self, url: str, formats: list[str] | None = None
) -> tuple[dict[str, Any] | None, str | None]:
"""
Crawl a single URL and extract its content.

Args:
url: URL to crawl
formats: List of formats to extract (e.g., ["markdown", "html"]) - only for Firecrawl

Returns:
Tuple containing (crawl result dict, error message or None)
Result dict contains:
- content: Extracted content (markdown or HTML)
- metadata: Page metadata (title, description, etc.)
- source: Original URL
- crawler_type: Type of crawler used
"""
try:
# Validate URL
if not validators.url(url):
return None, f"Invalid URL: {url}"

if self.use_firecrawl:
result = await self._crawl_with_firecrawl(url, formats)
else:
result = await self._crawl_with_chromium(url)

return result, None

except Exception as e:
return None, f"Error crawling URL {url}: {e!s}"

async def _crawl_with_firecrawl(
self, url: str, formats: list[str] | None = None
) -> dict[str, Any]:
"""
Crawl URL using Firecrawl.

Args:
url: URL to crawl
formats: List of formats to extract

Returns:
Dict containing crawled content and metadata

Raises:
ValueError: If Firecrawl scraping fails
"""
if not self.firecrawl_api_key:
raise ValueError("Firecrawl API key not set. Call set_api_key() first.")

firecrawl_app = AsyncFirecrawlApp(api_key=self.firecrawl_api_key)

# Default to markdown format
if formats is None:
formats = ["markdown"]

scrape_result = await firecrawl_app.scrape_url(url=url, formats=formats)

if not scrape_result or not scrape_result.success:
error_msg = (
scrape_result.error
if scrape_result and hasattr(scrape_result, "error")
else "Unknown error"
)
raise ValueError(f"Firecrawl failed to scrape URL: {error_msg}")

# Extract content based on format
content = scrape_result.markdown or scrape_result.html or ""

# Extract metadata
metadata = scrape_result.metadata if scrape_result.metadata else {}

return {
"content": content,
"metadata": {
"source": url,
"title": metadata.get("title", url),
"description": metadata.get("description", ""),
"language": metadata.get("language", ""),
"sourceURL": metadata.get("sourceURL", url),
**metadata,
},
"crawler_type": "firecrawl",
}

async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
"""
Crawl URL using AsyncChromiumLoader.

Args:
url: URL to crawl

Returns:
Dict containing crawled content and metadata

Raises:
Exception: If crawling fails
"""
crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
documents = await crawl_loader.aload()

if not documents:
raise ValueError(f"Failed to load content from {url}")

doc = documents[0]

# Extract basic metadata from the document
metadata = doc.metadata if doc.metadata else {}

return {
"content": doc.page_content,
"metadata": {
"source": url,
"title": metadata.get("title", url),
**metadata,
},
"crawler_type": "chromium",
}

def format_to_structured_document(self, crawl_result: dict[str, Any]) -> str:
"""
Format crawl result as a structured document.

Args:
crawl_result: Result from crawl_url method

Returns:
Structured document string
"""
metadata = crawl_result["metadata"]
content = crawl_result["content"]

document_parts = ["<DOCUMENT>", "<METADATA>"]

# Add all metadata fields
for key, value in metadata.items():
document_parts.append(f"{key.upper()}: {value}")

document_parts.extend(
[
"</METADATA>",
"<CONTENT>",
"FORMAT: markdown",
"TEXT_START",
content,
"TEXT_END",
"</CONTENT>",
"</DOCUMENT>",
]
)

return "\n".join(document_parts)
1 change: 1 addition & 0 deletions surfsense_backend/app/db.py
@@ -73,6 +73,7 @@ class SearchSourceConnectorType(str, Enum):
     AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR"
     LUMA_CONNECTOR = "LUMA_CONNECTOR"
     ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
+    WEBCRAWLER_CONNECTOR = "WEBCRAWLER_CONNECTOR"
 
 
 class ChatType(str, Enum):
7 changes: 0 additions & 7 deletions surfsense_backend/app/routes/documents_routes.py
@@ -65,13 +65,6 @@ async def create_documents(
             process_extension_document_task.delay(
                 document_dict, request.search_space_id, str(user.id)
             )
-        elif request.document_type == DocumentType.CRAWLED_URL:
-            from app.tasks.celery_tasks.document_tasks import process_crawled_url_task
-
-            for url in request.content:
-                process_crawled_url_task.delay(
-                    url, request.search_space_id, str(user.id)
-                )
         elif request.document_type == DocumentType.YOUTUBE_VIDEO:
             from app.tasks.celery_tasks.document_tasks import process_youtube_video_task
