fix: handle GitHub URLs correctly (#180)

fatelei · web-flow · commit 68c66880fa99 · 2026-02-14T20:28:48.000+08:00
diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py
@@ -153,13 +153,6 @@ def _is_code_repository_url(self, url: str) -> bool:
             if re.match(pattern, url):
                 return True
 
-        # Check if it's a GitHub/GitLab URL
-        parsed = urlparse(url)
-        if parsed.netloc in ["github.com", "gitlab.com"]:
-            path_parts = parsed.path.strip("/").split("/")
-            if len(path_parts) >= 2:
-                return True
-
         return False
 
 
@@ -359,14 +352,17 @@ async def _handle_download_link(
             # Get appropriate parser
             if file_type == "pdf":
                 from openviking.parse.parsers.pdf import PDFParser
+
                 parser = PDFParser()
                 result = await parser.parse(temp_path)
             elif file_type == "markdown":
                 from openviking.parse.parsers.markdown import MarkdownParser
+
                 parser = MarkdownParser()
                 result = await parser.parse(temp_path)
             elif file_type == "text":
                 from openviking.parse.parsers.text import TextParser
+
                 parser = TextParser()
                 result = await parser.parse(temp_path)
             elif file_type == "html":
@@ -478,6 +474,22 @@ async def _fetch_html(self, url: str) -> str:
             response.raise_for_status()
             return response.text
 
+    def _convert_to_raw_url(self, url: str) -> str:
+        """Map a GitHub/GitLab blob page URL to its raw-file URL; return other URLs as-is."""
+        parsed = urlparse(url)
+
+        if parsed.netloc == "github.com":
+            path_parts = parsed.path.strip("/").split("/")
+            if len(path_parts) >= 4 and path_parts[2] == "blob":
+                # <owner>/<repo>/blob/<rest> -> raw.githubusercontent.com/<owner>/<repo>/<rest>
+                new_path = "/".join(path_parts[:2] + path_parts[3:])
+                return f"https://raw.githubusercontent.com/{new_path}"
+
+        if parsed.netloc == "gitlab.com" and "/blob/" in parsed.path:
+            return url.replace("/blob/", "/raw/", 1)  # first marker only: keep later "blob" dirs
+
+        return url
+
     async def _download_file(self, url: str) -> str:
         """
         Download file from URL to temporary location.
@@ -493,6 +505,8 @@ async def _download_file(self, url: str) -> str:
         """
         httpx = lazy_import("httpx")
 
+        url = self._convert_to_raw_url(url)
+
         # Determine file extension from URL
         parsed = urlparse(url)
         ext = Path(parsed.path).suffix or ".tmp"
diff --git a/tests/parse/test_html_parser_utils.py b/tests/parse/test_html_parser_utils.py
@@ -0,0 +1,53 @@
+import pytest
+from openviking.parse.parsers.html import HTMLParser
+
+
+class TestHTMLParserRawUrlConversion:
+    """Checks HTMLParser._convert_to_raw_url against blob, non-blob and foreign URLs."""
+
+    def setup_method(self):
+        self.parser = HTMLParser()
+
+    def _assert_unchanged(self, *urls):
+        for candidate in urls:  # each URL must come back exactly as it went in
+            assert self.parser._convert_to_raw_url(candidate) == candidate
+
+    def test_github_blob_conversion(self):
+        convert = self.parser._convert_to_raw_url
+
+        source = "https://github.com/volcengine/OpenViking/blob/main/docs/design.md"
+        target = "https://raw.githubusercontent.com/volcengine/OpenViking/main/docs/design.md"
+        assert convert(source) == target
+
+        source = "https://github.com/user/repo/blob/feature/branch/src/components/Button.tsx"
+        target = (
+            "https://raw.githubusercontent.com/user/repo/feature/branch/src/components/Button.tsx"
+        )
+        assert convert(source) == target
+
+    def test_github_non_blob_urls(self):
+        self._assert_unchanged(  # repository root, issue page, already-raw URL
+            "https://github.com/volcengine/OpenViking",
+            "https://github.com/volcengine/OpenViking/issues/1",
+            "https://raw.githubusercontent.com/volcengine/OpenViking/main/README.md",
+        )
+
+    def test_gitlab_blob_conversion(self):
+        convert = self.parser._convert_to_raw_url
+
+        source = "https://gitlab.com/gitlab-org/gitlab/-/blob/master/README.md"
+        assert convert(source) == "https://gitlab.com/gitlab-org/gitlab/-/raw/master/README.md"
+        source = "https://gitlab.com/group/project/-/blob/dev/src/main.rs"
+        assert convert(source) == "https://gitlab.com/group/project/-/raw/dev/src/main.rs"
+
+    def test_gitlab_non_blob_urls(self):
+        self._assert_unchanged(  # project root and an issue page
+            "https://gitlab.com/gitlab-org/gitlab",
+            "https://gitlab.com/gitlab-org/gitlab/-/issues/123",
+        )
+
+    def test_other_domains(self):
+        self._assert_unchanged(  # hosts other than github.com / gitlab.com
+            "https://example.com/blob/main/file.txt",
+            "https://bitbucket.org/user/repo/src/master/README.md",
+        )