Skip to content

Commit e4a8eb9

Browse files
XingLiu0923 and Xing Liu authored
fix(repo_name): fix generate error for the same repo name but different owner name (#203)
Co-authored-by: Xing Liu <[email protected]>
1 parent f888426 commit e4a8eb9

9 files changed

+222
-19
lines changed

api/config/embedder.json.bak

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"embedder": {
3+
"client_class": "OpenAIClient",
4+
"batch_size": 500,
5+
"model_kwargs": {
6+
"model": "text-embedding-3-small",
7+
"dimensions": 256,
8+
"encoding_format": "float"
9+
}
10+
},
11+
"retriever": {
12+
"top_k": 20
13+
},
14+
"text_splitter": {
15+
"split_by": "word",
16+
"chunk_size": 350,
17+
"chunk_overlap": 100
18+
}
19+
}
File renamed without changes.

api/data_pipeline.py

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -663,12 +663,27 @@ def reset_database(self):
663663
self.repo_url_or_path = None
664664
self.repo_paths = None
665665

666-
def _create_repo(self, repo_url_or_path: str, type: str = "github", access_token: str = None) -> None:
666+
def _extract_repo_name_from_url(self, repo_url_or_path: str, repo_type: str) -> str:
667+
# Extract owner and repo name to create unique identifier
668+
url_parts = repo_url_or_path.rstrip('/').split('/')
669+
670+
if repo_type in ["github", "gitlab", "bitbucket"] and len(url_parts) >= 5:
671+
# GitHub URL format: https://github.com/owner/repo
672+
# GitLab URL format: https://gitlab.com/owner/repo or https://gitlab.com/group/subgroup/repo
673+
# Bitbucket URL format: https://bitbucket.org/owner/repo
674+
owner = url_parts[-2]
675+
repo = url_parts[-1].replace(".git", "")
676+
repo_name = f"{owner}_{repo}"
677+
else:
678+
repo_name = url_parts[-1].replace(".git", "")
679+
return repo_name
680+
681+
def _create_repo(self, repo_url_or_path: str, repo_type: str = "github", access_token: str = None) -> None:
667682
"""
668683
Download and prepare all paths.
669684
Paths:
670-
~/.adalflow/repos/{repo_name} (for url, local path will be the same)
671-
~/.adalflow/databases/{repo_name}.pkl
685+
~/.adalflow/repos/{owner}_{repo_name} (for url, local path will be the same)
686+
~/.adalflow/databases/{owner}_{repo_name}.pkl
672687
673688
Args:
674689
repo_url_or_path (str): The URL or local path of the repository
@@ -682,27 +697,16 @@ def _create_repo(self, repo_url_or_path: str, type: str = "github", access_token
682697
os.makedirs(root_path, exist_ok=True)
683698
# url
684699
if repo_url_or_path.startswith("https://") or repo_url_or_path.startswith("http://"):
685-
# Extract repo name based on the URL format
686-
if type == "github":
687-
# GitHub URL format: https://github.com/owner/repo
688-
repo_name = repo_url_or_path.split("/")[-1].replace(".git", "")
689-
elif type == "gitlab":
690-
# GitLab URL format: https://gitlab.com/owner/repo or https://gitlab.com/group/subgroup/repo
691-
# Use the last part of the URL as the repo name
692-
repo_name = repo_url_or_path.split("/")[-1].replace(".git", "")
693-
elif type == "bitbucket":
694-
# Bitbucket URL format: https://bitbucket.org/owner/repo
695-
repo_name = repo_url_or_path.split("/")[-1].replace(".git", "")
696-
else:
697-
# Generic handling for other Git URLs
698-
repo_name = repo_url_or_path.split("/")[-1].replace(".git", "")
700+
# Extract the repository name from the URL
701+
repo_name = self._extract_repo_name_from_url(repo_url_or_path, repo_type)
702+
logger.info(f"Extracted repo name: {repo_name}")
699703

700704
save_repo_dir = os.path.join(root_path, "repos", repo_name)
701705

702706
# Check if the repository directory already exists and is not empty
703707
if not (os.path.exists(save_repo_dir) and os.listdir(save_repo_dir)):
704708
# Only download if the repository doesn't exist or is empty
705-
download_repo(repo_url_or_path, save_repo_dir, type, access_token)
709+
download_repo(repo_url_or_path, save_repo_dir, repo_type, access_token)
706710
else:
707711
logger.info(f"Repository already exists at {save_repo_dir}. Using existing repository.")
708712
else: # local path

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,6 @@ dependencies = [
2020
"openai>=1.76.2",
2121
"ollama>=0.4.8",
2222
"aiohttp>=3.8.4",
23-
"boto3>=1.34.0"
23+
"boto3>=1.34.0",
24+
"pytest>=7.0.0"
2425
]

pytest.ini

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# In pytest.ini the section must be named [pytest];
# [tool:pytest] is only recognised inside setup.cfg.
[pytest]
testpaths = test
python_files = test_*.py *_test.py
python_classes = Test*
python_functions = test_*
addopts =
    -v
    --strict-markers
    --disable-warnings
    --tb=short
markers =
    unit: Unit tests
    integration: Integration tests
    slow: Slow tests that take more than a few seconds
    network: Tests that require network access

test/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Test package for deepwiki-open data pipeline

test/test_extract_repo_name.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Focused test script for the _extract_repo_name_from_url method
4+
5+
Run this script to test only the repository name extraction functionality.
6+
Usage: python test_extract_repo_name.py
7+
"""
8+
9+
import pytest
10+
import os
11+
import sys
12+
from unittest.mock import Mock, patch
13+
14+
# Add the parent directory to the path to import the data_pipeline module
15+
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
16+
17+
# Import the modules under test
18+
from api.data_pipeline import DatabaseManager
19+
20+
21+
class TestExtractRepoNameFromUrl:
    """Comprehensive tests for the _extract_repo_name_from_url method"""

    def setup_method(self):
        """Set up test fixtures before each test method."""
        self.db_manager = DatabaseManager()

    def test_extract_repo_name_github_standard_url(self):
        """Test repository name extraction from GitHub URLs"""

        # Test standard GitHub URL
        github_url = "https://github.com/owner/repo"
        result = self.db_manager._extract_repo_name_from_url(github_url, "github")
        assert result == "owner_repo"

        # Test GitHub URL with .git suffix
        github_url_git = "https://github.com/owner/repo.git"
        result = self.db_manager._extract_repo_name_from_url(github_url_git, "github")
        assert result == "owner_repo"

        # Test GitHub URL with trailing slash
        github_url_slash = "https://github.com/owner/repo/"
        result = self.db_manager._extract_repo_name_from_url(github_url_slash, "github")
        assert result == "owner_repo"

        print("✓ GitHub URL tests passed")

    def test_extract_repo_name_gitlab_urls(self):
        """Test repository name extraction from GitLab URLs"""

        # Test standard GitLab URL
        gitlab_url = "https://gitlab.com/owner/repo"
        result = self.db_manager._extract_repo_name_from_url(gitlab_url, "gitlab")
        assert result == "owner_repo"

        # Test GitLab URL with subgroups (only the innermost group is kept)
        gitlab_subgroup = "https://gitlab.com/group/subgroup/repo"
        result = self.db_manager._extract_repo_name_from_url(gitlab_subgroup, "gitlab")
        assert result == "subgroup_repo"

        print("✓ GitLab URL tests passed")

    def test_extract_repo_name_bitbucket_urls(self):
        """Test repository name extraction from Bitbucket URLs"""
        bitbucket_url = "https://bitbucket.org/owner/repo"
        result = self.db_manager._extract_repo_name_from_url(bitbucket_url, "bitbucket")
        assert result == "owner_repo"

        print("✓ Bitbucket URL tests passed")

    def test_extract_repo_name_local_paths(self):
        """Test repository name extraction from local paths"""

        # Test absolute local path
        result = self.db_manager._extract_repo_name_from_url("/home/user/projects/my-repo", "local")
        assert result == "my-repo"

        # Test local path with .git suffix
        result = self.db_manager._extract_repo_name_from_url("/var/repos/project.git", "local")
        assert result == "project"

        print("✓ Local path tests passed")

    def test_extract_repo_name_current_implementation_bug(self):
        """Test that omitting the repo_type argument is rejected.

        repo_type is a required positional parameter, so calling the method
        with only a URL must raise TypeError instead of silently guessing
        a repository type.
        """
        with pytest.raises(TypeError):
            self.db_manager._extract_repo_name_from_url("https://github.com/owner/repo")

    def test_extract_repo_name_edge_cases(self):
        """Test edge cases for repository name extraction"""

        # Test URL with insufficient parts (should use fallback)
        short_url = "https://github.com/repo"
        result = self.db_manager._extract_repo_name_from_url(short_url, "github")
        assert result == "repo"

        # Test single directory name
        single_name = "my-repo"
        result = self.db_manager._extract_repo_name_from_url(single_name, "local")
        assert result == "my-repo"

        print("✓ Edge case tests passed")

uv.lock

Lines changed: 45 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)