Skip to content

Commit 68c6688

Browse files
authored
fix: handle GitHub URLs correctly (#180)
1 parent 737befb commit 68c6688

File tree

2 files changed

+74
-7
lines changed

2 files changed

+74
-7
lines changed

openviking/parse/parsers/html.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -153,13 +153,6 @@ def _is_code_repository_url(self, url: str) -> bool:
153153
if re.match(pattern, url):
154154
return True
155155

156-
# Check if it's a GitHub/GitLab URL
157-
parsed = urlparse(url)
158-
if parsed.netloc in ["github.com", "gitlab.com"]:
159-
path_parts = parsed.path.strip("/").split("/")
160-
if len(path_parts) >= 2:
161-
return True
162-
163156
return False
164157

165158

@@ -359,14 +352,17 @@ async def _handle_download_link(
359352
# Get appropriate parser
360353
if file_type == "pdf":
361354
from openviking.parse.parsers.pdf import PDFParser
355+
362356
parser = PDFParser()
363357
result = await parser.parse(temp_path)
364358
elif file_type == "markdown":
365359
from openviking.parse.parsers.markdown import MarkdownParser
360+
366361
parser = MarkdownParser()
367362
result = await parser.parse(temp_path)
368363
elif file_type == "text":
369364
from openviking.parse.parsers.text import TextParser
365+
370366
parser = TextParser()
371367
result = await parser.parse(temp_path)
372368
elif file_type == "html":
@@ -478,6 +474,22 @@ async def _fetch_html(self, url: str) -> str:
478474
response.raise_for_status()
479475
return response.text
480476

477+
def _convert_to_raw_url(self, url: str) -> str:
478+
"""Convert GitHub/GitLab blob URL to raw URL."""
479+
parsed = urlparse(url)
480+
481+
if parsed.netloc == "github.com":
482+
path_parts = parsed.path.strip("/").split("/")
483+
if len(path_parts) >= 4 and path_parts[2] == "blob":
484+
# Remove 'blob'
485+
new_path = "/".join(path_parts[:2] + path_parts[3:])
486+
return f"https://raw.githubusercontent.com/{new_path}"
487+
488+
if parsed.netloc == "gitlab.com" and "/blob/" in parsed.path:
489+
return url.replace("/blob/", "/raw/")
490+
491+
return url
492+
481493
async def _download_file(self, url: str) -> str:
482494
"""
483495
Download file from URL to temporary location.
@@ -493,6 +505,8 @@ async def _download_file(self, url: str) -> str:
493505
"""
494506
httpx = lazy_import("httpx")
495507

508+
url = self._convert_to_raw_url(url)
509+
496510
# Determine file extension from URL
497511
parsed = urlparse(url)
498512
ext = Path(parsed.path).suffix or ".tmp"
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import pytest
2+
from openviking.parse.parsers.html import HTMLParser
3+
4+
5+
class TestHTMLParserRawUrlConversion:
    """Checks for ``HTMLParser._convert_to_raw_url``."""

    def setup_method(self):
        """Create the parser instance used by the assertions below."""
        self.parser = HTMLParser()

    def test_github_blob_conversion(self):
        """GitHub blob URLs map onto raw.githubusercontent.com."""
        cases = [
            (
                "https://github.com/volcengine/OpenViking/blob/main/docs/design.md",
                "https://raw.githubusercontent.com/volcengine/OpenViking/main/docs/design.md",
            ),
            (
                "https://github.com/user/repo/blob/feature/branch/src/components/Button.tsx",
                "https://raw.githubusercontent.com/user/repo/feature/branch/src/components/Button.tsx",
            ),
        ]
        for source, wanted in cases:
            assert self.parser._convert_to_raw_url(source) == wanted

    def test_github_non_blob_urls(self):
        """GitHub URLs that are not blob views come back unchanged."""
        untouched = [
            "https://github.com/volcengine/OpenViking",
            "https://github.com/volcengine/OpenViking/issues/1",
            "https://raw.githubusercontent.com/volcengine/OpenViking/main/README.md",
        ]
        for source in untouched:
            assert self.parser._convert_to_raw_url(source) == source

    def test_gitlab_blob_conversion(self):
        """GitLab blob URLs have their blob segment swapped for raw."""
        cases = [
            (
                "https://gitlab.com/gitlab-org/gitlab/-/blob/master/README.md",
                "https://gitlab.com/gitlab-org/gitlab/-/raw/master/README.md",
            ),
            (
                "https://gitlab.com/group/project/-/blob/dev/src/main.rs",
                "https://gitlab.com/group/project/-/raw/dev/src/main.rs",
            ),
        ]
        for source, wanted in cases:
            assert self.parser._convert_to_raw_url(source) == wanted

    def test_gitlab_non_blob_urls(self):
        """GitLab URLs without a blob segment come back unchanged."""
        untouched = [
            "https://gitlab.com/gitlab-org/gitlab",
            "https://gitlab.com/gitlab-org/gitlab/-/issues/123",
        ]
        for source in untouched:
            assert self.parser._convert_to_raw_url(source) == source

    def test_other_domains(self):
        """URLs on hosts other than GitHub/GitLab are never rewritten."""
        untouched = [
            "https://example.com/blob/main/file.txt",
            "https://bitbucket.org/user/repo/src/master/README.md",
        ]
        for source in untouched:
            assert self.parser._convert_to_raw_url(source) == source

0 commit comments

Comments
 (0)