-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
211 lines (171 loc) · 7.83 KB
/
scrape.py
File metadata and controls
211 lines (171 loc) · 7.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import sys
import time
import requests
from requests.exceptions import HTTPError
from http import HTTPStatus
from bs4 import BeautifulSoup, Tag
import os
import json
import google.generativeai as genai
# Google Generative AI key, read from the environment; classification is
# skipped entirely when this is unset.  Replace with your actual Google
# Cloud API Key via the API_KEY environment variable.
API_KEY = os.environ.get("API_KEY")
# Star threshold: scraping stops once a page's top result falls below this.
MIN_STARS = 20
# Repository owners to skip (matched case-insensitively against the link).
EXCLUDED_USERS = ["nesttyy"] # Add more users to this list if needed
def scrape_github_page(query, page):
    """Scrape a single page of GitHub repository search results.

    Args:
        query (str): The search query.
        page (int): The 1-based results page number to fetch.

    Returns:
        list: Dicts with keys "link", "description" and "stars" for each
        repository on the page.  Returns an empty list when every retry
        attempt fails (previously this fell through and returned None,
        which crashed the caller's ``extend``).
    """
    retries = 3
    # Transient server-side statuses that are worth retrying.
    retry_codes = [
        HTTPStatus.TOO_MANY_REQUESTS,
        HTTPStatus.INTERNAL_SERVER_ERROR,
        HTTPStatus.BAD_GATEWAY,
        HTTPStatus.SERVICE_UNAVAILABLE,
        HTTPStatus.GATEWAY_TIMEOUT,
    ]
    url = f"https://github.com/search?q={query}&type=repositories&s=stars&o=desc&p={page}"
    # Hoist the case-insensitive exclusion set out of the result loop.
    excluded = {user.lower() for user in EXCLUDED_USERS}
    for n in range(retries):
        response = None  # lets the generic handler detect "request never completed"
        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise for any HTTP errors
            soup = BeautifulSoup(response.content, "html.parser")
            results_list = soup.find("div", {"data-testid": "results-list"})
            if results_list is None:
                sys.exit("Seems the html structure has changed since last time, time to code")
            results = results_list.find_all("div", recursive=False) if isinstance(results_list, Tag) else []
            repositories = []
            for result in results:
                link_element = result.find("a", href=True)
                if not link_element:
                    continue
                link = link_element['href']
                # The repository owner is the first path segment of the link.
                username = link.split("/")[1].lower()
                if username in excluded:
                    print(f"Skipping result: {link} (user: {username})")
                    continue
                # The description lives in a sibling <div> of the link's wrapper;
                # fall back to the link itself when no description is present.
                # NOTE(review): the .children truthiness test in the previous
                # version was dead code — generators are always truthy.
                wrapper = link_element.parent.parent.parent
                description_element = wrapper.find_next_sibling("div")
                description = (
                    next(iter(description_element.children)).text.strip()
                    if description_element
                    else link
                )
                stars_element = wrapper.find_next_sibling("ul").find("a")
                stars = int(stars_element.text.strip().replace(',', '')) if stars_element else None
                repositories.append({"link": link, "description": description, "stars": stars})
            return repositories
        except HTTPError as exc:
            code = exc.response.status_code
            print(exc.response.headers)
            if code in retry_codes:
                # Back off a little longer on each attempt (previously slept
                # 0 seconds on the first retry, defeating the backoff).
                time.sleep(n + 1)
                continue
            raise
        except Exception:
            # GitHub sometimes answers 200 with a secondary-rate-limit page
            # that breaks parsing; detect it from the body and back off.
            if response is None:
                raise  # the request itself failed; there is no body to inspect
            soup = BeautifulSoup(response.content, "html.parser")
            container = soup.find("div", class_="container")
            error_message = container.text.strip() if container else ""
            if error_message == "Whoa there! You have exceeded a secondary rate limit. Please wait a few minutes before you try again; in some cases this may take up to an hour.":
                print(f"Rate limit exceeded: {error_message}")
                time.sleep(60 * (n + 1))  # linear backoff with a minimum of 1 minute
                continue
            raise
    # All retries exhausted without a successful parse.
    return []
def classify_repositories(repositories):
    """Classify repositories by description with Google Generative AI.

    All repositories are sent in a single model call and come back grouped
    into a fixed set of categories.

    Args:
        repositories (list): Dicts with "link", "description" and "stars".

    Returns:
        dict: Category name -> list of repository dicts.  Empty dict when
        the API key is missing, the request fails, or the response is not
        valid JSON.
    """
    if API_KEY is None:
        print("Please set API_KEY environment variable.")
        return {} # Return empty dict if key isn't provided
    genai.configure(api_key=API_KEY)
    model = genai.GenerativeModel("gemini-2.0-flash")
    # Build the prompt with the full JSON structure and output format.
    prompt = f"""
You will be provided with a JSON structure representing GitHub repositories.
Your task is to classify each repository into one of these categories:
- **Finanzas**: Anything related to price APIs, banks, and similar things
- **Mapas**: Postal Codes, City names, geographic data, etc
- **Identificación**: Anything related to goverment ID (Cedula), passport, RIF, etc
- **Comunidades**: Social network groups
- **Paquetes**: Tech stack - specific software that is related to Venezuela, i.e: Odoo, wordpress, woocommerce, shopify, etc.
- **Otros:** Anything else that doesn't fit into the above categories.
You must return a JSON object in the following format:
```json
{{
"Finanzas": [
{{ "link": "...", "description": "...", "stars": "..." }},
{{ "link": "...", "description": "...", "stars": "..." }},
...
],
"Mapas": [
...
],
"Identificación": [
...
],
"Comunidades": [
...
],
"Paquetes": [
...
],
"Otros": [
...
]
}}
```
Repositories:
{json.dumps(repositories, indent=4)}
"""
    # Bug fix: define response_text before the try so the except handler can
    # report it even when generate_content itself raises (previously that
    # path raised NameError, masking the real error).
    response_text = None
    try:
        response = model.generate_content(prompt)
        response_text = response.text.strip()
        # The model often wraps its JSON output in markdown fences; strip them.
        if response_text.startswith("```json") and response_text.endswith("```"):
            response_text = response_text[7:-3].strip()
        # Now try parsing the response as JSON.
        categorized_repositories = json.loads(response_text)
        return categorized_repositories
    except Exception as e:
        print(f"Google Generative AI request failed or returned invalid JSON: {e}")
        print(f"Raw Gemini response: {response_text}")
        return {}
def write_markdown(categorized_repositories, filename="README.md"):
    """Write the categorized repositories to a markdown file.

    Args:
        categorized_repositories (dict): Category name -> list of repo dicts
            with "link", "description" and "stars" keys.  "link" is a
            GitHub-relative path like "/owner/repo".
        filename (str, optional): Output path. Defaults to "README.md".
    """
    # utf-8 is forced because category names (e.g. "Identificación") are
    # non-ASCII and the platform default encoding may not handle them.
    with open(filename, "w", encoding="utf-8") as f:
        f.write("# Awesome Venezuela\n")
        f.write("Recursos para desarrolladores  !\n\n")
        for category, repos in categorized_repositories.items():
            f.write(f"## {category}\n\n")
            for repo in repos:
                link = repo["link"]
                description = repo["description"]
                parts = link.split('/')
                owner, name = parts[1], parts[2]
                # Repo line plus stars / last-commit shields.io badges.
                # NOTE(review): the badge f-strings were corrupted in the
                # previous version ("[[1]}/..."); reconstructed from the
                # surviving `link.split('/')[1]}/{link.split('/')[2]}` and
                # `]({link})` fragments — confirm badge choice against the
                # rendered README.
                f.write(
                    f"- **[{link[1:]}](https://github.com{link})**"
                    f"{': ' + description if description != link else ''} "
                    f"[![GitHub stars](https://img.shields.io/github/stars/{owner}/{name})]({link}) "
                    f"[![Last commit](https://img.shields.io/github/last-commit/{owner}/{name})]({link})\n\n"
                )
def main():
    """Scrape GitHub for Venezuela-related repositories, classify them,
    and write the categorized list to README.md."""
    query = "venezuela"
    repositories = []
    for page in range(1, 6):
        page_results = scrape_github_page(query, page)
        # Skip empty or failed pages — previously a non-list result would
        # crash on extend().
        if not page_results:
            continue
        # Results are sorted by stars descending, so once a page's top
        # result drops below MIN_STARS, later pages can only be worse.
        # Guard against stars == None (unparseable star count).
        first_stars = page_results[0]["stars"]
        if first_stars is not None and first_stars < MIN_STARS:
            break
        repositories.extend(page_results)
    # Classify all repositories at once.
    categorized_repositories = classify_repositories(repositories)
    print(json.dumps(categorized_repositories, indent=4))
    write_markdown(categorized_repositories)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()