44import base64
55import re
66import time
7+ from typing import List , Tuple , Optional , Dict , Any
78from sklearn .feature_extraction .text import TfidfVectorizer
89from sklearn .metrics .pairwise import cosine_similarity
910
1516)
1617logger = logging .getLogger (__name__ )
1718
# Runtime configuration read from the environment.
REPO: Optional[str] = os.getenv("GITHUB_REPOSITORY")  # e.g., 'soodoku/bloomjoin'
TOKEN: Optional[str] = os.getenv("GITHUB_TOKEN")

# Headers sent with every GitHub API request made through `requests`.
HEADERS: Dict[str, str] = {"Accept": "application/vnd.github+json"}
if TOKEN:
    # Only attach credentials when a token is actually configured; formatting
    # an unset token would send the literal string "Bearer None".
    HEADERS["Authorization"] = f"Bearer {TOKEN}"
2425
25- def get_topics (owner , repo ) :
26+ def get_topics (owner : str , repo : str ) -> List [ str ] :
2627 logger .info (f"Fetching topics for { owner } /{ repo } " )
2728 url = f"https://api.github.com/repos/{ owner } /{ repo } /topics"
2829 r = requests .get (url , headers = HEADERS )
@@ -31,10 +32,10 @@ def get_topics(owner, repo):
3132 logger .info (f"Found { len (topics )} topics" )
3233 return topics
3334
34- def get_user_repos (owner ) :
35+ def get_user_repos (owner : str ) -> List [ Dict [ str , Any ]] :
3536 logger .info (f"Fetching repositories for { owner } " )
3637 url = f"https://api.github.com/users/{ owner } /repos?per_page=100&type=owner"
37- repos = []
38+ repos : List [ Dict [ str , Any ]] = []
3839 while url :
3940 r = requests .get (url , headers = HEADERS )
4041 time .sleep (1 ) # More cautious rate limit handling
@@ -44,7 +45,7 @@ def get_user_repos(owner):
4445 else :
4546 logger .warning (f"Unexpected response when fetching repos: { page_repos } " )
4647 break
47- link_header = r .headers .get ('Link' , '' )
48+ link_header : str = r .headers .get ('Link' , '' )
4849 url = None
4950 for link in link_header .split (',' ):
5051 if 'rel="next"' in link :
@@ -53,7 +54,7 @@ def get_user_repos(owner):
5354 logger .info (f"Total repositories found: { len (repos )} " )
5455 return repos
5556
56- def get_readme_content (owner , repo ) :
57+ def get_readme_content (owner : str , repo : str ) -> str :
5758 logger .info (f"Fetching README for { owner } /{ repo } " )
5859 url = f"https://api.github.com/repos/{ owner } /{ repo } /readme"
5960 r = requests .get (url , headers = HEADERS )
@@ -71,7 +72,7 @@ def get_readme_content(owner, repo):
7172 logger .info ("No README content found" )
7273 return ""
7374
74- def clean_markdown (text ) :
75+ def clean_markdown (text : str ) -> str :
7576 text = re .sub (r'```.*?```' , '' , text , flags = re .DOTALL )
7677 text = re .sub (r'`.*?`' , '' , text )
7778 text = re .sub (r'\[([^\]]+)\]\([^)]+\)' , r'\1' , text )
@@ -84,7 +85,7 @@ def clean_markdown(text):
8485 text = re .sub (r'\s+' , ' ' , text ).strip ()
8586 return text
8687
87- def compute_readme_similarity (text1 , text2 ) :
88+ def compute_readme_similarity (text1 : str , text2 : str ) -> float :
8889 if not text1 or not text2 :
8990 return 0.0
9091
@@ -98,69 +99,72 @@ def compute_readme_similarity(text1, text2):
9899 logger .warning (f"Error computing README similarity: { e } " )
99100 return 0.0
100101
def find_adjacent_by_topics(owner: str, repo_name: str, topics: List[str], exclude_repos: Optional[List[str]] = None) -> List[Tuple[str, str, List[str], float]]:
    """Find adjacent repositories based on common topics.

    Every repository returned by ``get_user_repos(owner)`` is compared with
    ``topics``; a repository is kept when it shares at least one topic.

    Args:
        owner: Account whose repositories are scanned.
        repo_name: Name of the current repository; it is always skipped
            (compared case-insensitively).
        topics: Topics of the current repository.
        exclude_repos: Optional repository names to skip. Compared
            case-insensitively, like ``repo_name``.

    Returns:
        A list of ``(full_name, description, common_topics, score)`` tuples
        sorted by descending score. ``score`` is the number of shared topics
        divided by the number of distinct topics across both repositories.
        ``common_topics`` is sorted alphabetically so the output is stable.
    """
    own_topics = set(topics)  # built once; invariant across the loop

    # One lower-cased set covers both the current repo and the exclusions,
    # so both checks use the same case-insensitive rule.
    skipped = {name.lower() for name in (exclude_repos or [])}
    skipped.add(repo_name.lower())

    related: List[Tuple[str, str, List[str], float]] = []
    for r in get_user_repos(owner):
        if r["name"].lower() in skipped:
            continue
        other_topics = set(get_topics(r["owner"]["login"], r["name"]))
        common = own_topics & other_topics
        if not common:
            continue
        # A non-empty intersection guarantees a non-empty union below.
        score = len(common) / len(own_topics | other_topics)
        # `.get(key, "")` only covers a missing key; `or ""` also covers a
        # key that is present with a None value, keeping the element a str.
        description = r.get("description") or ""
        related.append((r["full_name"], description, sorted(common), score))
    return sorted(related, key=lambda x: -x[3])
113115
def find_adjacent_by_readme(owner: str, repo_name: str, readme_content: str, exclude_repos: Optional[List[str]] = None, min_similarity: float = 0.1) -> List[Tuple[str, str, List[str], float]]:
    """Find adjacent repositories based on README content similarity.

    For every repository returned by ``get_user_repos(owner)`` the README is
    fetched with ``get_readme_content`` and scored against ``readme_content``
    with ``compute_readme_similarity``.

    Args:
        owner: Account whose repositories are scanned.
        repo_name: Name of the current repository; it is always skipped
            (compared case-insensitively).
        readme_content: README text of the current repository.
        exclude_repos: Optional repository names to skip. Compared
            case-insensitively, like ``repo_name``.
        min_similarity: A repository is kept only when its score is strictly
            greater than this value. Defaults to 0.1.

    Returns:
        A list of ``(full_name, description, [], similarity)`` tuples sorted
        by descending similarity. The third element is always an empty list,
        matching the tuple shape of the topic-based search.
    """
    # One lower-cased set covers both the current repo and the exclusions,
    # so both checks use the same case-insensitive rule.
    skipped = {name.lower() for name in (exclude_repos or [])}
    skipped.add(repo_name.lower())

    related: List[Tuple[str, str, List[str], float]] = []
    for r in get_user_repos(owner):
        if r["name"].lower() in skipped:
            continue
        other_readme: str = get_readme_content(r["owner"]["login"], r["name"])
        similarity: float = compute_readme_similarity(readme_content, other_readme)
        if similarity > min_similarity:
            # `.get(key, "")` only covers a missing key; `or ""` also covers
            # a key present with a None value, keeping the element a str.
            description = r.get("description") or ""
            related.append((r["full_name"], description, [], similarity))
    return sorted(related, key=lambda x: -x[3])
126129
127- def find_adjacent_combined (owner , repo_name , topics , readme_content , weight_topics = 0.5 ) :
130+ def find_adjacent_combined (owner : str , repo_name : str , topics : List [ str ] , readme_content : str , weight_topics : float = 0.5 , exclude_repos : Optional [ List [ str ]] = None ) -> List [ Tuple [ str , str , List [ str ], float ]] :
128131 """Find adjacent repositories using a weighted combination of topics and README similarity"""
129- repos = get_user_repos (owner )
130- related = []
132+ repos : List [Dict [str , Any ]] = get_user_repos (owner )
133+ related : List [Tuple [str , str , List [str ], float ]] = []
134+ exclude_list = exclude_repos or []
131135
132136 # Check if we have topics and README content
133- has_topics = len (topics ) > 0
134- has_readme = len (readme_content ) > 0
137+ has_topics : bool = len (topics ) > 0
138+ has_readme : bool = len (readme_content ) > 0
135139
136140 # Adjust weights if one source is missing
137- effective_weight_topics = weight_topics
141+ effective_weight_topics : float = weight_topics
138142 if not has_topics :
139143 effective_weight_topics = 0
140144 if not has_readme :
141145 effective_weight_topics = 1
142146
143147 # Collect similarity scores for normalization if needed
144- all_topic_sims = []
145- all_readme_sims = []
146- repo_data = []
148+ all_topic_sims : List [ float ] = []
149+ all_readme_sims : List [ float ] = []
150+ repo_data : List [ Tuple [ str , str , List [ str ], float , float ]] = []
147151
148152 # First pass to collect all scores
149153 for r in repos :
150- if r ["name" ].lower () == repo_name .lower ():
154+ if r ["name" ].lower () == repo_name .lower () or r [ "name" ] in exclude_list :
151155 continue
152156
153157 # Get topic similarity
154- t = get_topics (r ["owner" ]["login" ], r ["name" ])
155- common = set (t ) & set (topics )
156- topic_sim = 0
158+ t : List [ str ] = get_topics (r ["owner" ]["login" ], r ["name" ])
159+ common : set = set (t ) & set (topics )
160+ topic_sim : float = 0
157161 if has_topics and t :
158162 topic_sim = len (common )/ max (1 , len (set (t ) | set (topics )))
159163 all_topic_sims .append (topic_sim )
160164
161165 # Get README similarity
162- other_readme = ""
163- readme_sim = 0
166+ other_readme : str = ""
167+ readme_sim : float = 0
164168 if has_readme :
165169 other_readme = get_readme_content (r ["owner" ]["login" ], r ["name" ])
166170 readme_sim = compute_readme_similarity (readme_content , other_readme )
@@ -169,17 +173,17 @@ def find_adjacent_combined(owner, repo_name, topics, readme_content, weight_topi
169173 repo_data .append ((r ["full_name" ], r .get ("description" , "" ), list (common ), topic_sim , readme_sim ))
170174
171175 # Normalize scores if we have data
172- topic_max = max (all_topic_sims ) if all_topic_sims else 1
173- readme_max = max (all_readme_sims ) if all_readme_sims else 1
176+ topic_max : float = max (all_topic_sims ) if all_topic_sims else 1
177+ readme_max : float = max (all_readme_sims ) if all_readme_sims else 1
174178
175179 # Second pass to calculate combined scores
176180 for full_name , desc , common , topic_sim , readme_sim in repo_data :
177181 # Normalize if we have non-zero maximums
178- norm_topic_sim = topic_sim / topic_max if topic_max > 0 else 0
179- norm_readme_sim = readme_sim / readme_max if readme_max > 0 else 0
182+ norm_topic_sim : float = topic_sim / topic_max if topic_max > 0 else 0
183+ norm_readme_sim : float = readme_sim / readme_max if readme_max > 0 else 0
180184
181185 # Combined score
182- combined_score = (
186+ combined_score : float = (
183187 effective_weight_topics * norm_topic_sim +
184188 (1 - effective_weight_topics ) * norm_readme_sim
185189 )
@@ -189,7 +193,7 @@ def find_adjacent_combined(owner, repo_name, topics, readme_content, weight_topi
189193
190194 return sorted (related , key = lambda x : - x [3 ])
191195
192- def update_readme (related ) :
196+ def update_readme (related : List [ Tuple [ str , str , List [ str ], float ]], max_repos : int = 5 ) -> None :
193197 logger .info ("Updating README with adjacent repositories" )
194198
195199 try :
@@ -201,7 +205,7 @@ def update_readme(related):
201205 header = "## 🔗 Adjacent Repositories"
202206 block = [f"{ header } \n \n " ]
203207
204- for full_name , desc , tags , score in related [:5 ]:
208+ for full_name , desc , tags , score in related [:max_repos ]:
205209 url = f"https://github.com/{ full_name } "
206210 clean_desc = desc .strip () if desc else ""
207211 desc_str = f" — { clean_desc } " if clean_desc else ""
@@ -240,11 +244,18 @@ def update_readme(related):
240244 logger .info ("README update complete" )
241245
242246if __name__ == "__main__" :
247+ if not REPO :
248+ raise ValueError ("GITHUB_REPOSITORY environment variable not set" )
243249 owner , repo = REPO .split ("/" )
244- topics = get_topics (owner , repo )
245- method = os .getenv ("SIMILARITY_METHOD" , "topics" ).lower ()
250+ topics : List [str ] = get_topics (owner , repo )
251+ method : str = os .getenv ("SIMILARITY_METHOD" , "topics" ).lower ()
252+ exclude_repos_str : str = os .getenv ("EXCLUDE_REPOS" , "" )
253+ exclude_repos : List [str ] = [r .strip () for r in exclude_repos_str .split ("," ) if r .strip ()]
254+ max_repos : int = int (os .getenv ("MAX_REPOS" , "5" ))
246255
247256 print (f"Finding adjacent repositories using method: { method } " )
257+ if exclude_repos :
258+ print (f"Excluding repositories: { ', ' .join (exclude_repos )} " )
248259
249260 # Check if we have topics
250261 has_topics = len (topics ) > 0
@@ -258,47 +269,48 @@ def update_readme(related):
258269 print ("Warning: No README content found or failed to parse" )
259270
260271 # Determine which method to use, with fallbacks if necessary
272+ related : List [Tuple [str , str , List [str ], float ]] = []
261273 if method == "topics" :
262274 if has_topics :
263- related = find_adjacent_by_topics (owner , repo , topics )
275+ related = find_adjacent_by_topics (owner , repo , topics , exclude_repos )
264276 else :
265277 print ("Falling back to README similarity since no topics are available" )
266278 if has_readme :
267- related = find_adjacent_by_readme (owner , repo , readme_content )
279+ related = find_adjacent_by_readme (owner , repo , readme_content , exclude_repos )
268280 else :
269281 print ("No viable similarity method available. Both topics and README are missing." )
270282 related = []
271283 elif method == "readme" :
272284 if has_readme :
273- related = find_adjacent_by_readme (owner , repo , readme_content )
285+ related = find_adjacent_by_readme (owner , repo , readme_content , exclude_repos )
274286 else :
275287 print ("Falling back to topic similarity since README is not available" )
276288 if has_topics :
277- related = find_adjacent_by_topics (owner , repo , topics )
289+ related = find_adjacent_by_topics (owner , repo , topics , exclude_repos )
278290 else :
279291 print ("No viable similarity method available. Both topics and README are missing." )
280292 related = []
281293 elif method == "combined" :
282- weight = float (os .getenv ("TOPIC_WEIGHT" , "0.5" ))
283- related = find_adjacent_combined (owner , repo , topics , readme_content , weight )
294+ weight : float = float (os .getenv ("TOPIC_WEIGHT" , "0.5" ))
295+ related = find_adjacent_combined (owner , repo , topics , readme_content , weight , exclude_repos )
284296 else :
285297 print (f"Unrecognized method '{ method } ', using best available method" )
286298 if has_topics and has_readme :
287299 print ("Using combined similarity" )
288300 weight = float (os .getenv ("TOPIC_WEIGHT" , "0.5" ))
289- related = find_adjacent_combined (owner , repo , topics , readme_content , weight )
301+ related = find_adjacent_combined (owner , repo , topics , readme_content , weight , exclude_repos )
290302 elif has_topics :
291303 print ("Using topic similarity" )
292- related = find_adjacent_by_topics (owner , repo , topics )
304+ related = find_adjacent_by_topics (owner , repo , topics , exclude_repos )
293305 elif has_readme :
294306 print ("Using README similarity" )
295- related = find_adjacent_by_readme (owner , repo , readme_content )
307+ related = find_adjacent_by_readme (owner , repo , readme_content , exclude_repos )
296308 else :
297309 print ("No viable similarity method available. Both topics and README are missing." )
298310 related = []
299311
300312 if related :
301- update_readme (related )
302- print ("README updated with adjacent repositories." )
313+ update_readme (related , max_repos )
314+ print (f "README updated with { min ( len ( related ), max_repos ) } adjacent repositories." )
303315 else :
304316 print ("No adjacent repos found." )
0 commit comments