Skip to content

Commit d71b79b

Browse files
authored
Update adjacent.py
1 parent 5073ed5 commit d71b79b

File tree

1 file changed

+31
-20
lines changed

1 file changed

+31
-20
lines changed

.github/scripts/adjacent.py

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,20 @@
11
import os
2+
import logging
23
import requests
34
import base64
45
import re
6+
import time
57
from sklearn.feature_extraction.text import TfidfVectorizer
68
from sklearn.metrics.pairwise import cosine_similarity
79

# Configure logging: timestamped INFO-level messages for the whole script.
_LOG_FORMAT = '%(asctime)s - %(levelname)s: %(message)s'
_LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_DATEFMT)

# Module-level logger used by every helper below.
logger = logging.getLogger(__name__)
818
REPO = os.getenv("GITHUB_REPOSITORY") # e.g., 'soodoku/bloomjoin'
919
TOKEN = os.getenv("GITHUB_TOKEN")
1020
HEADERS = {
@@ -13,74 +23,75 @@
1323
}
1424

1525
def get_topics(owner, repo):
    """Return the list of topic names for ``owner/repo`` from the GitHub API.

    Returns an empty list on any HTTP or JSON failure so callers can
    iterate the result unconditionally.
    """
    logger.info(f"Fetching topics for {owner}/{repo}")
    url = f"https://api.github.com/repos/{owner}/{repo}/topics"
    # timeout so a stalled connection cannot hang the workflow forever
    r = requests.get(url, headers=HEADERS, timeout=30)
    time.sleep(0.5)  # Rate limit handling
    topics = []
    if r.status_code == 200:
        try:
            topics = r.json().get("names", [])
        except ValueError:
            # 200 with a non-JSON body (proxy/error page) — treat as no topics.
            logger.warning(f"Non-JSON topics response for {owner}/{repo}")
    logger.info(f"Found {len(topics)} topics")
    return topics
1933

2034
def get_user_repos(owner):
    """Return all repositories owned by *owner*, following API pagination.

    Stops paginating (rather than corrupting the result) on any non-200
    response, so the list returned always contains only repo objects.
    """
    logger.info(f"Fetching repositories for {owner}")
    url = f"https://api.github.com/users/{owner}/repos?per_page=100&type=owner"
    repos = []
    while url:
        # timeout so a stalled connection cannot hang the workflow forever
        r = requests.get(url, headers=HEADERS, timeout=30)
        time.sleep(1)  # More cautious rate limit handling
        if r.status_code != 200:
            # An error payload is a dict, not a list; extending with it
            # would push its keys (strings) into `repos`. Stop instead.
            logger.warning(f"Repo listing failed with HTTP {r.status_code}")
            break
        repos.extend(r.json())
        # requests parses the RFC 5988 Link header into r.links for us;
        # missing "next" yields None and ends the loop.
        url = r.links.get("next", {}).get("url")
    logger.info(f"Total repositories found: {len(repos)}")
    return repos
3351

3452
def get_readme_content(owner, repo):
    """Fetch, base64-decode, and clean the README of ``owner/repo``.

    Returns the cleaned plain text, or "" when the repo has no README,
    the request fails, or the content cannot be decoded.
    """
    logger.info(f"Fetching README for {owner}/{repo}")
    url = f"https://api.github.com/repos/{owner}/{repo}/readme"
    # timeout so a stalled connection cannot hang the workflow forever
    r = requests.get(url, headers=HEADERS, timeout=30)
    time.sleep(0.5)  # Rate limit handling
    if r.status_code == 200:
        content = r.json().get("content", "")
        if content:
            try:
                # Keep the try body minimal: only the decode can fail here.
                decoded = base64.b64decode(content).decode('utf-8')
            except (ValueError, UnicodeDecodeError) as e:
                # binascii.Error (malformed base64) is a ValueError subclass.
                logger.warning(f"Error decoding README: {e}")
            else:
                cleaned = clean_markdown(decoded)
                logger.info(f"README successfully retrieved and cleaned (length: {len(cleaned)} chars)")
                return cleaned
    logger.info("No README content found")
    return ""
4969

5070
def clean_markdown(text):
    """Strip markdown markup from *text* so only prose remains.

    Removes fenced code blocks, inline code, images, links (keeping the
    link text), headers, HTML tags, list bullets, table rows, and
    horizontal rules, then collapses all whitespace to single spaces.
    """
    # Fenced code blocks first (DOTALL so they may span lines), then inline code.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    text = re.sub(r'`.*?`', '', text)
    # Images must be removed BEFORE links: the link pattern also matches
    # the "[alt](url)" tail of "![alt](url)", which would leave a stray
    # "!alt" behind and let the image text survive the later image pass.
    text = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', text)
    # Links: keep the visible text, drop the URL.
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # Headers, HTML tags, list bullets, table rows, horizontal rules.
    text = re.sub(r'#+\s+', '', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'^[-*]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\|.*?\|', '', text)
    text = re.sub(r'---+', '', text)
    # Normalize all runs of whitespace to single spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
6782

6883
def compute_readme_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of two README texts.

    Yields a float in [0, 1]; 0.0 when either text is empty or the
    vectorizer cannot build a vocabulary (e.g. stop words only).
    """
    # Nothing meaningful to compare if either side is empty.
    if not text1 or not text2:
        return 0.0

    tfidf = TfidfVectorizer(stop_words='english')
    try:
        matrix = tfidf.fit_transform([text1, text2])
        score = cosine_similarity(matrix[0:1], matrix[1:2])[0][0]
    except Exception as e:
        # e.g. "empty vocabulary" when both texts contain only stop words.
        logger.warning(f"Error computing README similarity: {e}")
        return 0.0
    logger.info(f"README similarity computed: {score}")
    return score
8596

8697
def find_adjacent_by_topics(owner, repo_name, topics):

0 commit comments

Comments
 (0)