
Commit 18a1013

feat(web): add special handling for different domains

- Add support for Twitter/X and GitHub URLs
- Improve error messages with detailed explanations
- Add content type metadata for better filtering

1 parent 33d7743 commit 18a1013

File tree

1 file changed (+64, -3 lines)


agentic_rag/web_processor.py

Lines changed: 64 additions & 3 deletions
@@ -4,6 +4,7 @@
 from typing import List, Dict, Any
 from trafilatura import fetch_url, extract, extract_metadata
 from urllib.parse import urlparse
+import re

 def is_url(string: str) -> bool:
     """Check if a string is a valid URL"""
@@ -13,11 +14,53 @@ def is_url(string: str) -> bool:
     except:
         return False

+def get_domain(url: str) -> str:
+    """Extract domain from URL"""
+    parsed = urlparse(url)
+    return parsed.netloc.lower()
+
 class WebProcessor:
     def __init__(self, chunk_size: int = 500):
         """Initialize web processor with chunk size"""
         self.chunk_size = chunk_size
+        # Define domains that need special handling
+        self.special_domains = {
+            'x.com': 'twitter',
+            'twitter.com': 'twitter',
+            'github.com': 'github'
+        }
+
+    def _handle_twitter(self, url: str) -> Dict[str, Any]:
+        """Special handling for Twitter/X URLs"""
+        # Extract tweet ID from URL
+        tweet_id = url.split('/')[-1]
+        return {
+            'text': f"Twitter/X content (Tweet ID: {tweet_id}). Note: Twitter content cannot be directly extracted. Please visit {url} to view the content.",
+            'metadata': {
+                'source': url,
+                'type': 'twitter',
+                'tweet_id': tweet_id
+            }
+        }

+    def _handle_github(self, url: str) -> Dict[str, Any]:
+        """Special handling for GitHub URLs"""
+        # Extract repo info from URL
+        parts = url.split('/')
+        if len(parts) >= 5:
+            owner = parts[3]
+            repo = parts[4]
+            return {
+                'text': f"GitHub Repository: {owner}/{repo}. This is a GitHub repository. For better results, try accessing specific files or the README directly.",
+                'metadata': {
+                    'source': url,
+                    'type': 'github',
+                    'owner': owner,
+                    'repo': repo
+                }
+            }
+        return None
+
     def _chunk_text(self, text: str) -> List[str]:
         """Split text into chunks of roughly equal size"""
         # Split into sentences (roughly)
@@ -48,7 +91,20 @@ def _chunk_text(self, text: str) -> List[str]:
     def process_url(self, url: str) -> List[Dict[str, Any]]:
         """Process a URL and return chunks of text with metadata"""
         try:
-            # Download and extract content
+            domain = get_domain(url)
+
+            # Check if this domain needs special handling
+            if domain in self.special_domains:
+                handler = getattr(self, f"_handle_{self.special_domains[domain]}", None)
+                if handler:
+                    result = handler(url)
+                    if result:
+                        return [{
+                            "text": result["text"],
+                            "metadata": result["metadata"]
+                        }]
+
+            # Standard processing for other domains
             downloaded = fetch_url(url)
             if not downloaded:
                 raise ValueError(f"Failed to fetch URL: {url}")
@@ -72,7 +128,11 @@ def process_url(self, url: str) -> List[Dict[str, Any]]:
                 metadata = {}

             if not text:
-                raise ValueError(f"No text content extracted from URL: {url}")
+                raise ValueError(f"No text content extracted from URL: {url}. This might be due to:\n" +
+                                 "1. Website blocking automated access\n" +
+                                 "2. Content requiring JavaScript\n" +
+                                 "3. Content behind authentication\n" +
+                                 "4. Website using non-standard HTML structure")

             # Split into chunks
             text_chunks = self._chunk_text(text)
@@ -90,7 +150,8 @@ def process_url(self, url: str) -> List[Dict[str, Any]]:
                         "sitename": metadata.get('sitename', ''),
                         "categories": metadata.get('categories', []),
                         "tags": metadata.get('tags', []),
-                        "chunk_id": i
+                        "chunk_id": i,
+                        "type": "webpage"
                     }
                 }
                 processed_chunks.append(processed_chunk)
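
For reference, a minimal usage sketch of the behavior this commit adds. The import path follows the file's location in the diff; the URL is hypothetical.

    from agentic_rag.web_processor import WebProcessor

    processor = WebProcessor(chunk_size=500)

    # 'x.com' is listed in special_domains, so process_url dispatches to
    # _handle_twitter and returns a single placeholder chunk without fetching.
    chunks = processor.process_url("https://x.com/user/status/1234567890")
    print(chunks[0]["metadata"]["type"])      # 'twitter'
    print(chunks[0]["metadata"]["tweet_id"])  # '1234567890'

    # Any other domain falls through to the trafilatura-based extraction path,
    # and each extracted chunk's metadata now carries type 'webpage'.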

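The new "type" field is what enables the filtering mentioned in the commit message. A small sketch, with an invented helper name, assuming chunks collected from a mix of ordinary pages and special domains:

    def webpage_chunks(chunks):
        # Keep only chunks extracted from ordinary web pages, dropping the
        # 'twitter' / 'github' placeholder chunks emitted by the handlers.
        return [c for c in chunks if c["metadata"].get("type") == "webpage"]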