from typing import List, Dict, Any, Optional
from trafilatura import fetch_url, extract, extract_metadata
from urllib.parse import urlparse
+import re


def is_url(string: str) -> bool:
    """Check if a string is a valid URL"""
@@ -13,11 +14,53 @@ def is_url(string: str) -> bool:
    except:
        return False

+def get_domain(url: str) -> str:
+    """Extract domain from URL"""
+    parsed = urlparse(url)
+    return parsed.netloc.lower()
+
class WebProcessor:
    def __init__(self, chunk_size: int = 500):
        """Initialize web processor with chunk size"""
        self.chunk_size = chunk_size
+        # Define domains that need special handling
+        self.special_domains = {
+            'x.com': 'twitter',
+            'twitter.com': 'twitter',
+            'github.com': 'github'
+        }
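+        # Each value names a handler method: process_url looks up
+        # "_handle_<value>" via getattr() before falling back to
+        # generic extraction.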
+
+    def _handle_twitter(self, url: str) -> Dict[str, Any]:
+        """Special handling for Twitter/X URLs"""
+        # Extract tweet ID from URL, dropping any query string (e.g. "?s=20")
+        tweet_id = url.split('/')[-1].split('?')[0]
+        return {
+            'text': f"Twitter/X content (Tweet ID: {tweet_id}). Note: Twitter content cannot be directly extracted. Please visit {url} to view the content.",
+            'metadata': {
+                'source': url,
+                'type': 'twitter',
+                'tweet_id': tweet_id
+            }
+        }

+    def _handle_github(self, url: str) -> Optional[Dict[str, Any]]:
+        """Special handling for GitHub URLs"""
+        # Extract repo info from URL
+        parts = url.split('/')
+        if len(parts) >= 5:
+            owner = parts[3]
+            repo = parts[4]
+            return {
+                'text': f"GitHub Repository: {owner}/{repo}. This is a GitHub repository. For better results, try accessing specific files or the README directly.",
+                'metadata': {
+                    'source': url,
+                    'type': 'github',
+                    'owner': owner,
+                    'repo': repo
+                }
+            }
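+        # No owner/repo in the path: return None so process_url falls
+        # through to generic extraction (it checks the handler result).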
+        return None
+
    def _chunk_text(self, text: str) -> List[str]:
        """Split text into chunks of roughly equal size"""
        # Split into sentences (roughly)
@@ -48,7 +91,20 @@ def _chunk_text(self, text: str) -> List[str]:
    def process_url(self, url: str) -> List[Dict[str, Any]]:
        """Process a URL and return chunks of text with metadata"""
        try:
-            # Download and extract content
+            domain = get_domain(url)
+
+            # Check if this domain needs special handling
+            if domain in self.special_domains:
+                handler = getattr(self, f"_handle_{self.special_domains[domain]}", None)
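+                # getattr() with a None default means an unmapped handler
+                # name fails soft instead of raising AttributeError.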
+                if handler:
+                    result = handler(url)
+                    if result:
+                        return [{
+                            "text": result["text"],
+                            "metadata": result["metadata"]
+                        }]
+
+            # Standard processing for other domains
            downloaded = fetch_url(url)
            if not downloaded:
                raise ValueError(f"Failed to fetch URL: {url}")
@@ -72,7 +128,11 @@ def process_url(self, url: str) -> List[Dict[str, Any]]:
                metadata = {}

            if not text:
-                raise ValueError(f"No text content extracted from URL: {url}")
+                raise ValueError(f"No text content extracted from URL: {url}. This might be due to:\n" +
+                                 "1. Website blocking automated access\n" +
+                                 "2. Content requiring JavaScript\n" +
+                                 "3. Content behind authentication\n" +
+                                 "4. Website using non-standard HTML structure")

            # Split into chunks
            text_chunks = self._chunk_text(text)
@@ -90,7 +150,8 @@ def process_url(self, url: str) -> List[Dict[str, Any]]:
                        "sitename": metadata.get('sitename', ''),
                        "categories": metadata.get('categories', []),
                        "tags": metadata.get('tags', []),
-                        "chunk_id": i
+                        "chunk_id": i,
+                        "type": "webpage"
                    }
                }
                processed_chunks.append(processed_chunk)
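
A minimal usage sketch of the class after this change (not part of the commit; the web_processor module name and process_url returning the processed_chunks list built above are assumptions from context):

# Hypothetical usage, assuming this file is importable as web_processor
# and that process_url() returns the processed_chunks list built above.
from web_processor import WebProcessor

processor = WebProcessor(chunk_size=500)
for chunk in processor.process_url("https://example.com/article"):
    meta = chunk["metadata"]
    print(meta["chunk_id"], meta["type"], chunk["text"][:80])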