|
1 | 1 | import os |
| 2 | +import logging |
2 | 3 | import requests |
3 | 4 | import base64 |
4 | 5 | import re |
| 6 | +import time |
5 | 7 | from sklearn.feature_extraction.text import TfidfVectorizer |
6 | 8 | from sklearn.metrics.pairwise import cosine_similarity |
7 | 9 |
|
# Module-wide logging: timestamped INFO-level messages for progress tracing.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
| 17 | + |
8 | 18 | REPO = os.getenv("GITHUB_REPOSITORY") # e.g., 'soodoku/bloomjoin' |
9 | 19 | TOKEN = os.getenv("GITHUB_TOKEN") |
10 | 20 | HEADERS = { |
|
13 | 23 | } |
14 | 24 |
|
def get_topics(owner, repo):
    """Return the list of topic names for ``owner/repo``.

    Args:
        owner: Repository owner login.
        repo: Repository name.

    Returns:
        List of topic strings; empty list when the request fails.
    """
    logger.info("Fetching topics for %s/%s", owner, repo)
    url = f"https://api.github.com/repos/{owner}/{repo}/topics"
    # timeout prevents an unresponsive API from hanging the whole run
    r = requests.get(url, headers=HEADERS, timeout=30)
    time.sleep(0.5)  # rate-limit courtesy pause
    if r.status_code == 200:
        topics = r.json().get("names", [])
    else:
        # Surface failures (404, 403 rate limit, ...) instead of failing silently.
        logger.warning("Topics request for %s/%s failed: HTTP %s", owner, repo, r.status_code)
        topics = []
    logger.info("Found %d topics", len(topics))
    return topics
19 | 33 |
|
def get_user_repos(owner):
    """Return all repositories owned by ``owner``, following Link-header pagination.

    Args:
        owner: GitHub user login.

    Returns:
        List of repository JSON objects. If a page request fails (e.g. a
        rate-limit 403), pagination stops and the repos collected so far
        are returned.
    """
    logger.info("Fetching repositories for %s", owner)
    url = f"https://api.github.com/users/{owner}/repos?per_page=100&type=owner"
    repos = []
    while url:
        # timeout prevents an unresponsive API from hanging the whole run
        r = requests.get(url, headers=HEADERS, timeout=30)
        time.sleep(1)  # more cautious rate-limit pause for this heavier endpoint
        if r.status_code != 200:
            # Bug fix: the previous code called repos.extend() on the JSON
            # error payload (a dict), which appended its *keys* to the list.
            logger.warning("Repo listing for %s failed: HTTP %s", owner, r.status_code)
            break
        repos.extend(r.json())
        # RFC 5988 Link header: follow rel="next" until no next page remains.
        link_header = r.headers.get('Link', '')
        url = None
        for link in link_header.split(','):
            if 'rel="next"' in link:
                url = link.split(';')[0].strip('<>')
                break
    logger.info("Total repositories found: %d", len(repos))
    return repos
33 | 51 |
|
def get_readme_content(owner, repo):
    """Fetch, base64-decode, and clean the README of ``owner/repo``.

    Args:
        owner: Repository owner login.
        repo: Repository name.

    Returns:
        Cleaned plain-text README content, or "" when the repository has no
        README or the content cannot be decoded.
    """
    logger.info("Fetching README for %s/%s", owner, repo)
    url = f"https://api.github.com/repos/{owner}/{repo}/readme"
    # timeout prevents an unresponsive API from hanging the whole run
    r = requests.get(url, headers=HEADERS, timeout=30)
    time.sleep(0.5)  # rate-limit courtesy pause
    if r.status_code == 200:
        content = r.json().get("content", "")
        if content:
            try:
                # The API returns file content base64-encoded.
                decoded = base64.b64decode(content).decode('utf-8')
            except (ValueError, UnicodeDecodeError) as e:
                # Narrowed from bare Exception: only decode errors are expected here.
                logger.warning("Error decoding README: %s", e)
                return ""
            cleaned = clean_markdown(decoded)
            logger.info("README successfully retrieved and cleaned (length: %d chars)", len(cleaned))
            return cleaned
    logger.info("No README content found")
    return ""
49 | 69 |
|
def clean_markdown(text):
    """Strip markdown/HTML syntax from ``text`` for plain-text similarity comparison.

    Removes code blocks, inline code, images, links (keeping the anchor
    text), ATX headers, HTML tags, list bullets, table rows, and horizontal
    rules, then collapses all whitespace runs to single spaces.

    Args:
        text: Raw markdown source.

    Returns:
        Cleaned plain-text string.
    """
    # Fenced code blocks and inline code first, so their contents never
    # match the later patterns.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    text = re.sub(r'`.*?`', '', text)
    # Bug fix: images must be removed BEFORE links. "![alt](url)" also
    # matches the link pattern, which previously left a stray "!alt" behind.
    text = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', text)
    # Links: keep the anchor text, drop the URL.
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # ATX headers ("# Title" -> "Title").
    text = re.sub(r'#+\s+', '', text)
    # Raw HTML tags.
    text = re.sub(r'<[^>]+>', '', text)
    # List bullets at line start.
    text = re.sub(r'^[-*]\s+', '', text, flags=re.MULTILINE)
    # Table rows and horizontal rules.
    text = re.sub(r'\|.*?\|', '', text)
    text = re.sub(r'---+', '', text)
    # Collapse whitespace.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
67 | 82 |
|
def compute_readme_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two README texts.

    Args:
        text1: First cleaned README text.
        text2: Second cleaned README text.

    Returns:
        Similarity in [0.0, 1.0] as a plain Python float; 0.0 when either
        text is empty or vectorization fails (e.g. both texts consist only
        of English stop words, which raises an empty-vocabulary ValueError).
    """
    if not text1 or not text2:
        return 0.0

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    except Exception as e:
        # Most commonly ValueError from an empty vocabulary after stop-word removal.
        logger.warning("Error computing README similarity: %s", e)
        return 0.0
    logger.info("README similarity computed: %s", similarity)
    # Cast the numpy scalar to a plain float so callers can serialize it cleanly.
    return float(similarity)
85 | 96 |
|
86 | 97 | def find_adjacent_by_topics(owner, repo_name, topics): |
|
0 commit comments