Skip to content

Commit ea524ef

Browse files
committed
fix(utils/crawl_github_files): branch names that include "/"
To fix this problem, the branch names are now fetched from the GitHub API and matched against the path in the URL. If no branch matches, the code falls back to checking whether a tree with the given name exists.
1 parent f257d0b commit ea524ef

File tree

1 file changed

+64
-10
lines changed

1 file changed

+64
-10
lines changed

utils/crawl_github_files.py

Lines changed: 64 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -135,22 +135,76 @@ def should_include_file(file_path: str, file_name: str) -> bool:
135135
owner = path_parts[0]
136136
repo = path_parts[1]
137137

138+
# Setup for GitHub API
139+
headers = {"Accept": "application/vnd.github.v3+json"}
140+
if token:
141+
headers["Authorization"] = f"token {token}"
142+
143+
def fetch_branches(owner: str, repo: str):
144+
"""Get branches of the repository"""
145+
146+
url = f"https://api.github.com/repos/{owner}/{repo}/branches"
147+
response = requests.get(url, headers=headers)
148+
149+
if response.status_code == 404:
150+
if not token:
151+
print(f"Error 404: Repository not found or is private.\n"
152+
f"If this is a private repository, please provide a valid GitHub token via the 'token' argument or set the GITHUB_TOKEN environment variable.")
153+
else:
154+
print(f"Error 404: Path '{path}' not found in repository or insufficient permissions with the provided token.\n"
155+
f"Please verify the token has access to this repository and the path exists.")
156+
return []
157+
158+
if response.status_code != 200:
159+
print(f"Error fetching {path}: {response.status_code} - {response.text}")
160+
return []
161+
162+
return response.json()
163+
164+
def check_tree(owner: str, repo: str, tree: str):
165+
"""Check the repository has the given tree"""
166+
167+
url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{tree}"
168+
response = requests.get(url, headers=headers)
169+
170+
return True if response.status_code == 200 else False
171+
138172
# Check if URL contains a specific branch/commit
139-
if 'tree' in path_parts:
140-
tree_index = path_parts.index('tree')
141-
ref = path_parts[tree_index + 1]
173+
if len(path_parts) > 2 and 'tree' == path_parts[2]:
174+
join_parts = lambda i: '/'.join(path_parts[i:])
175+
176+
branches = fetch_branches(owner, repo)
177+
branch_names = map(lambda branch: branch.get("name"), branches)
178+
179+
# Fetching branches was not successful
180+
if len(branches) == 0:
181+
return
182+
183+
# To check branch name
184+
relevant_path = join_parts(3)
185+
186+
# Find a match with relevant path and get the branch name
187+
filter_gen = (name for name in branch_names if relevant_path.startswith(name))
188+
ref = next(filter_gen, None)
189+
190+
# If no match is found, check whether it is a tree
191+
if ref == None:
192+
tree = path_parts[3]
193+
ref = tree if check_tree(owner, repo, tree) else None
194+
195+
# If it is neither a tree nor a branch name
196+
if ref == None:
197+
print(f"The given path does not match with any branch and any tree in the repository.\n"
198+
f"Please verify the path is exists.")
199+
return
200+
142201
# Combine all parts after the ref as the path
143-
path_start = tree_index + 2
144-
specific_path = '/'.join(path_parts[path_start:]) if path_start < len(path_parts) else ""
202+
part_index = 5 if '/' in ref else 4
203+
specific_path = join_parts(part_index) if part_index < len(path_parts) else ""
145204
else:
146205
ref = "main" # Default branch
147206
specific_path = ""
148207

149-
# Setup for GitHub API
150-
headers = {"Accept": "application/vnd.github.v3+json"}
151-
if token:
152-
headers["Authorization"] = f"token {token}"
153-
154208
# Dictionary to store path -> content mapping
155209
files = {}
156210
skipped_files = []

0 commit comments

Comments
 (0)