@@ -153,13 +153,6 @@ def _is_code_repository_url(self, url: str) -> bool:
153153 if re .match (pattern , url ):
154154 return True
155155
156- # Check if it's a GitHub/GitLab URL
157- parsed = urlparse (url )
158- if parsed .netloc in ["github.com" , "gitlab.com" ]:
159- path_parts = parsed .path .strip ("/" ).split ("/" )
160- if len (path_parts ) >= 2 :
161- return True
162-
163156 return False
164157
165158
@@ -359,14 +352,17 @@ async def _handle_download_link(
359352 # Get appropriate parser
360353 if file_type == "pdf" :
361354 from openviking .parse .parsers .pdf import PDFParser
355+
362356 parser = PDFParser ()
363357 result = await parser .parse (temp_path )
364358 elif file_type == "markdown" :
365359 from openviking .parse .parsers .markdown import MarkdownParser
360+
366361 parser = MarkdownParser ()
367362 result = await parser .parse (temp_path )
368363 elif file_type == "text" :
369364 from openviking .parse .parsers .text import TextParser
365+
370366 parser = TextParser ()
371367 result = await parser .parse (temp_path )
372368 elif file_type == "html" :
@@ -478,6 +474,22 @@ async def _fetch_html(self, url: str) -> str:
478474 response .raise_for_status ()
479475 return response .text
480476
477+ def _convert_to_raw_url (self , url : str ) -> str :
478+ """Convert GitHub/GitLab blob URL to raw URL."""
479+ parsed = urlparse (url )
480+
481+ if parsed .netloc == "github.com" :
482+ path_parts = parsed .path .strip ("/" ).split ("/" )
483+ if len (path_parts ) >= 4 and path_parts [2 ] == "blob" :
484+ # Remove 'blob'
485+ new_path = "/" .join (path_parts [:2 ] + path_parts [3 :])
486+ return f"https://raw.githubusercontent.com/{ new_path } "
487+
488+ if parsed .netloc == "gitlab.com" and "/blob/" in parsed .path :
489+ return url .replace ("/blob/" , "/raw/" )
490+
491+ return url
492+
481493 async def _download_file (self , url : str ) -> str :
482494 """
483495 Download file from URL to temporary location.
@@ -493,6 +505,8 @@ async def _download_file(self, url: str) -> str:
493505 """
494506 httpx = lazy_import ("httpx" )
495507
508+ url = self ._convert_to_raw_url (url )
509+
496510 # Determine file extension from URL
497511 parsed = urlparse (url )
498512 ext = Path (parsed .path ).suffix or ".tmp"
0 commit comments