Skip to content

Commit 98fc7b4

Browse files
authored
Merge pull request #46 from siaeyy/fix/issue-35
Fix for issue #35
2 parents f257d0b + ccd0481 commit 98fc7b4

File tree

1 file changed

+68
-12
lines changed

1 file changed

+68
-12
lines changed

utils/crawl_github_files.py

Lines changed: 68 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -135,21 +135,77 @@ def should_include_file(file_path: str, file_name: str) -> bool:
135135
owner = path_parts[0]
136136
repo = path_parts[1]
137137

138-
# Check if URL contains a specific branch/commit
139-
if 'tree' in path_parts:
140-
tree_index = path_parts.index('tree')
141-
ref = path_parts[tree_index + 1]
142-
# Combine all parts after the ref as the path
143-
path_start = tree_index + 2
144-
specific_path = '/'.join(path_parts[path_start:]) if path_start < len(path_parts) else ""
145-
else:
146-
ref = "main" # Default branch
147-
specific_path = ""
148-
149138
# Setup for GitHub API
150139
headers = {"Accept": "application/vnd.github.v3+json"}
151140
if token:
152141
headers["Authorization"] = f"token {token}"
142+
143+
def fetch_branches(owner: str, repo: str):
    """Get the branches of the repository.

    Sends a GET request to the GitHub ``/repos/{owner}/{repo}/branches``
    endpoint using the ``headers`` (and ``token``) of the enclosing scope.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.

    Returns:
        The decoded JSON body of the response on HTTP 200. On any other
        status an error message is printed and an empty list is returned.
    """
    # NOTE(review): only a single request is made here; if the endpoint
    # paginates its results, branches beyond the first page are not seen.
    url = f"https://api.github.com/repos/{owner}/{repo}/branches"
    response = requests.get(url, headers=headers)

    if response.status_code == 404:
        # Without a token a 404 may simply mean "private repository".
        if not token:
            print("Error 404: Repository not found or is private.\n"
                  "If this is a private repository, please provide a valid GitHub token via the 'token' argument or set the GITHUB_TOKEN environment variable.")
        else:
            print("Error 404: Repository not found or insufficient permissions with the provided token.\n"
                  "Please verify the repository exists and the token has access to this repository.")
        return []

    if response.status_code != 200:
        # Report the repository name. The previous message interpolated
        # `path`, which is not a parameter or local of this function.
        print(f"Error fetching the branches of {owner}/{repo}: {response.status_code} - {response.text}")
        return []

    return response.json()
163+
164+
def check_tree(owner: str, repo: str, tree: str):
    """Check whether the repository has the given tree.

    Queries the GitHub ``git/trees`` endpoint for ``tree`` with the
    ``headers`` of the enclosing scope.

    Returns:
        True when the endpoint answers with HTTP 200, False otherwise.
    """
    tree_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{tree}"
    status = requests.get(tree_url, headers=headers).status_code
    return status == 200
171+
172+
# Check if URL contains a specific branch/commit
173+
if len(path_parts) > 2 and 'tree' == path_parts[2]:
174+
join_parts = lambda i: '/'.join(path_parts[i:])
175+
176+
branches = fetch_branches(owner, repo)
177+
branch_names = map(lambda branch: branch.get("name"), branches)
178+
179+
# Fetching branches is not successfully
180+
if len(branches) == 0:
181+
return
182+
183+
# To check branch name
184+
relevant_path = join_parts(3)
185+
186+
# Find a match with relevant path and get the branch name
187+
filter_gen = (name for name in branch_names if relevant_path.startswith(name))
188+
ref = next(filter_gen, None)
189+
190+
# If match is not found, check for is it a tree
191+
if ref == None:
192+
tree = path_parts[3]
193+
ref = tree if check_tree(owner, repo, tree) else None
194+
195+
# If it is neither a tree nor a branch name
196+
if ref == None:
197+
print(f"The given path does not match with any branch and any tree in the repository.\n"
198+
f"Please verify the path is exists.")
199+
return
200+
201+
# Combine all parts after the ref as the path
202+
part_index = 5 if '/' in ref else 4
203+
specific_path = join_parts(part_index) if part_index < len(path_parts) else ""
204+
else:
205+
# Dont put the ref param to quiery
206+
# and let Github decide default branch
207+
ref = None
208+
specific_path = ""
153209

154210
# Dictionary to store path -> content mapping
155211
files = {}
@@ -158,7 +214,7 @@ def should_include_file(file_path: str, file_name: str) -> bool:
158214
def fetch_contents(path):
159215
"""Fetch contents of the repository at a specific path and commit"""
160216
url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
161-
params = {"ref": ref}
217+
params = {"ref": ref} if ref != None else {}
162218

163219
response = requests.get(url, headers=headers, params=params)
164220

0 commit comments

Comments
 (0)