Skip to content

Commit 2780486

Browse files
Merge pull request #2 from PoCInnovation/scrape/github-hf-scrapers
Scrape GitHub & Hugging Face
2 parents 768b030 + abe6be6 commit 2780486

File tree

2 files changed

+360
-0
lines changed

2 files changed

+360
-0
lines changed

scrape_github.py

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
#!/usr/bin/env python3
"""
scrape_github.py

AI-oriented GitHub theme watcher — searches projects by theme (e.g. "LLM",
"diffusion", "RAG", ...) and stores summarized results in a SQLite database
for consumption by a dashboard / newsletter / alerts.

Usage:
    python scrape_github.py          # runs continuously (sleeps INTERVAL between passes)
    python scrape_github.py --once   # performs a single iteration (useful for cron/tests)

Configured via the variables at the top of this file or via environment
variables:
    - GITHUB_TOKEN: API token (optional but recommended — raises rate limits)
"""

import os
import sys
import time
import sqlite3
import requests
import argparse
from datetime import datetime
from typing import List
24+
25+
26+
# Themes (topics/keywords) that drive the GitHub search queries below.
THEMES = [
    "large-language-model",
    "llm",
    "transformer",
    "text-generation",
    "retrieval-augmented-generation",
    "rag",
    "agents",
    "chatbot",
    "fine-tuning",
    "quantization",
    "lora",
    "peft",
    "diffusion",
    "stable-diffusion",
    "image-generation",
    "multimodal",
    "speech-to-text",
    "speech-synthesis",
    "audio",
    "reinforcement-learning",
    "computer-vision",
]

# Max repositories requested per theme per iteration (GitHub caps per_page at 100).
RESULTS_PER_THEME = 20

# Seconds between iterations; default 21600 s = 6 h. Overridable via env var.
INTERVAL = int(os.getenv("GITHUB_WATCHER_INTERVAL", 21600))

# SQLite database file stored next to this script.
DB_FILE = os.path.join(os.path.dirname(__file__), "github_ai_trending.db")

# Optional token; when set, it is sent as a Bearer token on every request.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

HEADERS = {
    "Accept": "application/vnd.github+json",
    "User-Agent": "github-ai-theme-watcher/1.0"
}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"

# Module-level connection/cursor shared by all helpers in this script.
conn = sqlite3.connect(DB_FILE)
cur = conn.cursor()

# Current snapshot: one row per repository, refreshed via INSERT OR REPLACE.
cur.execute("""
CREATE TABLE IF NOT EXISTS trending_ai_projects (
    full_name TEXT PRIMARY KEY,
    name TEXT,
    description TEXT,
    stars INTEGER,
    language TEXT,
    theme TEXT,
    updated_at TEXT,
    html_url TEXT,
    last_seen TIMESTAMP
)
""")
conn.commit()

# Append-only star-count history, for trend analysis over time.
cur.execute("""
CREATE TABLE IF NOT EXISTS project_history (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    full_name TEXT,
    stars INTEGER,
    updated_at TEXT,
    captured_at TIMESTAMP
)
""")
conn.commit()
93+
94+
95+
def search_github_repos(query: str, per_page: int = RESULTS_PER_THEME) -> List[dict]:
    """Search GitHub repositories via the Search API.

    Args:
        query: full search expression (e.g. "transformer language:python").
        per_page: number of results to request (GitHub caps this at 100).

    Returns:
        The list of repository dicts from the response's "items" field
        (possibly empty).

    Raises:
        RateLimitError: when GitHub signals rate limiting (HTTP 403 or 429).
        requests.HTTPError: for any other non-2xx response.
    """
    url = "https://api.github.com/search/repositories"
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": per_page
    }
    resp = requests.get(url, headers=HEADERS, params=params, timeout=20)
    # GitHub reports primary rate limiting with 403 and secondary/abuse
    # limiting with 403 *or* 429; the original code only handled 403.
    if resp.status_code in (403, 429):
        retry_after = resp.headers.get("Retry-After")
        if retry_after and retry_after.isdigit():
            wait = int(retry_after)
        else:
            # Fall back to X-RateLimit-Reset (epoch seconds) when present.
            reset = resp.headers.get("X-RateLimit-Reset")
            if reset and reset.isdigit():
                wait = max(0, int(reset) - int(time.time()))
            else:
                wait = None
        raise RateLimitError(retry_after=wait)
    resp.raise_for_status()
    data = resp.json()
    return data.get("items", [])
114+
115+
def sanitize_text(s):
    """Coerce *s* to a string, mapping None to the empty string."""
    return "" if s is None else str(s)
119+
120+
def save_project(repo: dict, theme: str):
    """Upsert the repository snapshot and append a star-count history row.

    Args:
        repo: repository dict as returned by the GitHub Search API.
        theme: the watch theme this repository was found under.

    Both statements are committed together so that a crash between them
    cannot leave a snapshot row without its matching history entry (the
    original committed after each statement separately).
    """
    full_name = repo.get("full_name")
    name = repo.get("name")
    desc = sanitize_text(repo.get("description"))
    stars = repo.get("stargazers_count", 0)
    language = repo.get("language") or ""
    # Prefer the API's timestamps; fall back to "now" if both are absent.
    updated_at = repo.get("updated_at") or repo.get("pushed_at") or datetime.utcnow().isoformat()
    html_url = repo.get("html_url") or f"https://github.com/{full_name}"
    now = datetime.utcnow().isoformat()

    cur.execute("""
        INSERT OR REPLACE INTO trending_ai_projects
        (full_name, name, description, stars, language, theme, updated_at, html_url, last_seen)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (full_name, name, desc, stars, language, theme, updated_at, html_url, now))
    cur.execute("""
        INSERT INTO project_history (full_name, stars, updated_at, captured_at)
        VALUES (?, ?, ?, ?)
    """, (full_name, stars, updated_at, now))
    # Single commit: both rows land atomically.
    conn.commit()
143+
144+
145+
class RateLimitError(Exception):
    """Raised when the GitHub API reports a rate limit.

    Attributes:
        retry_after: suggested wait in seconds, or None when unknown.
    """

    def __init__(self, retry_after=None):
        self.retry_after = retry_after
        message = f"Rate limit hit on GitHub API. Retry after: {retry_after}"
        super().__init__(message)
149+
150+
151+
def build_query_for_theme(theme: str) -> str:
    """Build the GitHub search query string for *theme*.

    Spaces in the theme are replaced by '+', and the search is restricted
    to name/description/readme of repositories with more than 50 stars.
    """
    return f'{theme.replace(" ", "+")} in:name,description,readme stars:>50'
156+
157+
def run_once(themes=None):
    """Run one watch iteration over *themes* and persist the results.

    Args:
        themes: iterable of theme strings; defaults to the module-level
            THEMES, resolved at call time. (The original used
            ``themes=THEMES`` which bound the *original* list object at
            definition time, so the ``--themes`` command-line override —
            which rebinds the module-level name — was silently ignored.)

    Returns:
        The number of repository records saved during this iteration.
    """
    if themes is None:
        themes = THEMES  # late lookup so runtime reassignment is honored
    print(f"[{datetime.utcnow().isoformat()}] Démarrage d'une itération de veille (thèmes: {len(themes)})")
    total_saved = 0
    for theme in themes:
        try:
            q = build_query_for_theme(theme)
            print(f"-> Recherche thème '{theme}' (q={q})")
            items = search_github_repos(q)
            print(f" ↳ {len(items)} résultats récupérés pour '{theme}'")
            for repo in items:
                save_project(repo, theme)
                total_saved += 1
        except RateLimitError as rle:
            # Back off, then continue with the next theme; the current
            # theme is not retried within this iteration.
            wait = rle.retry_after or 60
            print(f"[RATE LIMIT] Limit atteint. Pause {wait} secondes.")
            time.sleep(wait)
        except Exception as e:
            # Best-effort: one failing theme must not abort the whole sweep.
            print(f"[ERREUR] thème '{theme}': {e}")
    print(f"[{datetime.utcnow().isoformat()}] Itération terminée — {total_saved} enregistrements traités.")
    return total_saved
177+
178+
def main_loop(interval=INTERVAL, once=False):
    """Drive the watcher: one iteration when *once* is True, else loop forever.

    Args:
        interval: seconds to sleep between iterations.
        once: run a single iteration and return (for cron/tests).

    The DB connection is closed on every exit path. (In the original, the
    ``once`` branch returned *before* the try/finally, so ``conn.close()``
    was skipped for ``--once`` runs.)
    """
    try:
        if once:
            run_once()
            return
        while True:
            run_once()
            print(f"Attente {interval} secondes avant la prochaine vérification...")
            time.sleep(interval)
    except KeyboardInterrupt:
        print("")
    finally:
        conn.close()
192+
193+
def parse_args():
    """Parse and return the command-line arguments for the watcher."""
    parser = argparse.ArgumentParser(description="Veille thématique GitHub orientée IA")
    parser.add_argument("--once", action="store_true",
                        help="Exécuter une unique itération et quitter")
    parser.add_argument("--interval", type=int, default=INTERVAL,
                        help="Intervalle entre itérations (secondes)")
    parser.add_argument("--themes", type=str,
                        help="Liste de thèmes séparés par des virgules (remplace la config)")
    return parser.parse_args()
199+
200+
if __name__ == "__main__":
    args = parse_args()
    if args.themes:
        # Rebinds the module-level THEMES list from the CLI override.
        # NOTE(review): run_once's default parameter binds the *original*
        # THEMES list at definition time, so verify this override actually
        # reaches run_once when it is called without arguments.
        THEMES = [t.strip() for t in args.themes.split(",") if t.strip()]
        print(f"Themes remplacés: {THEMES}")

    INTERVAL = args.interval

    print("Github AI Theme Watcher démarré.")
    # NOTE(review): both branches print an empty line — these look like
    # placeholder messages (e.g. "token found" / "no token"); confirm intent.
    if GITHUB_TOKEN:
        print("")
    else:
        print("")

    main_loop(interval=INTERVAL, once=args.once)

scrape_hf.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
"""Hugging Face Hub watcher — polls the public HF API for newly published
models/datasets/spaces/collections/organizations/papers and stores them in
a local SQLite database."""

import os
import time
import sqlite3
import requests
from datetime import datetime

# Seconds between polling iterations.
INTERVAL = 300

# SQLite database file stored next to this script.
DB_FILE = os.path.join(os.path.dirname(__file__), "huggingface_hub.db")
conn = sqlite3.connect(DB_FILE)
cur = conn.cursor()

# Single table for every kind of Hub item, discriminated by the `type` column.
# NOTE(review): `id` alone is the PRIMARY KEY — two items of *different*
# types sharing an id would collide; confirm Hub ids are globally unique.
cur.execute("""
CREATE TABLE IF NOT EXISTS hubs (
    id TEXT PRIMARY KEY,
    name TEXT,
    author TEXT,
    likes INTEGER,
    downloads INTEGER,
    task TEXT,
    last_modified TEXT,
    type TEXT,
    url TEXT
)
""")
conn.commit()
27+
28+
29+
def fetch_models():
    """Fetch the most recently modified models from the Hugging Face API."""
    endpoint = "https://huggingface.co/api/models?sort=lastModified&direction=-1&limit=20"
    response = requests.get(endpoint, timeout=20)
    response.raise_for_status()
    return response.json()
35+
36+
def fetch_datasets():
    """Fetch the most recently modified datasets from the Hugging Face API."""
    endpoint = "https://huggingface.co/api/datasets?sort=lastModified&direction=-1&limit=20"
    response = requests.get(endpoint, timeout=20)
    response.raise_for_status()
    return response.json()
42+
43+
def fetch_spaces():
    """Fetch the most recently modified Spaces from the Hugging Face API."""
    endpoint = "https://huggingface.co/api/spaces?sort=lastModified&direction=-1&limit=20"
    response = requests.get(endpoint, timeout=20)
    response.raise_for_status()
    return response.json()
49+
50+
def fetch_collections():
    """Fetch recent collections; an absent endpoint (404) yields an empty list."""
    endpoint = "https://huggingface.co/api/collections?sort=lastModified&direction=-1&limit=20"
    response = requests.get(endpoint, timeout=20)
    if response.status_code == 404:
        return []
    response.raise_for_status()
    return response.json()
58+
59+
def fetch_organizations():
    """Fetch organizations from the Hugging Face API (no sort parameter)."""
    endpoint = "https://huggingface.co/api/organizations?limit=20"
    response = requests.get(endpoint, timeout=20)
    response.raise_for_status()
    return response.json()
65+
66+
def fetch_papers():
    """Fetch recent research papers; best-effort — returns [] on any failure."""
    endpoint = "https://huggingface.co/api/papers?sort=lastModified&direction=-1&limit=20"
    try:
        response = requests.get(endpoint, timeout=20)
        if response.status_code == 404:
            return []
        response.raise_for_status()
        return response.json()
    except Exception:
        # Deliberate best-effort: the papers endpoint may not be reachable.
        return []
77+
78+
79+
def save_item(item, item_type):
    """Insert one Hub item into the `hubs` table, skipping existing ids.

    Args:
        item: raw item dict as returned by the Hugging Face API.
        item_type: one of "model", "dataset", "space", "collection",
            "organization", "paper" — stored in the `type` column.

    NOTE(review): the table's PRIMARY KEY is `id` alone, so items of
    different types that share an id silently collide (INSERT OR IGNORE) —
    confirm ids are globally unique across types.
    """
    # Normalize pipeline_tag: it may be absent, None, a string, or a list.
    # (The original's item.get("pipeline_tag", "") returns None — not "" —
    # when the key is present with a None value, storing NULL inconsistently.)
    tag = item.get("pipeline_tag")
    if isinstance(tag, list):
        task = ", ".join(tag)
    else:
        task = tag or ""
    cur.execute("""
        INSERT OR IGNORE INTO hubs (id, name, author, likes, downloads, task, last_modified, type, url)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        item.get("id"),
        item.get("name") or item.get("modelId") or item.get("id"),
        item.get("author") or item.get("organization", ""),
        item.get("likes", 0),
        item.get("downloads", 0),
        task,
        item.get("lastModified") or item.get("last_modified") or datetime.utcnow().isoformat(),
        item_type,
        build_url(item, item_type)
    ))
    conn.commit()
95+
96+
def build_url(item, item_type):
    """Return the public huggingface.co URL for *item* of kind *item_type*."""
    base = "https://huggingface.co"
    item_id = item.get("id")
    if item_type == "paper":
        return f"{base}/papers/{item_id}"
    if item_type in ("model", "dataset", "space", "collection", "organization"):
        return f"{base}/{item_id}"
    return base
104+
105+
def load_seen_ids():
    """Return the set of item ids already stored in the `hubs` table."""
    cur.execute("SELECT id FROM hubs")
    return {row[0] for row in cur.fetchall()}
108+
109+
110+
def main():
    """Poll the Hugging Face Hub forever, persisting newly seen items."""
    print("Initialisation Hugging Face Hub Watcher...")
    seen_ids = load_seen_ids()
    print(f"{len(seen_ids)} éléments déjà enregistrés.")

    # (kind, fetcher) pairs, polled in order on every pass.
    sources = [
        ("model", fetch_models),
        ("dataset", fetch_datasets),
        ("space", fetch_spaces),
        ("collection", fetch_collections),
        ("organization", fetch_organizations),
        ("paper", fetch_papers),
    ]

    try:
        while True:
            for item_type, fetch_func in sources:
                try:
                    for entry in fetch_func():
                        entry_id = entry.get("id")
                        if not entry_id or entry_id in seen_ids:
                            continue
                        print(f"[NOUVEAU {item_type.upper()}] {entry_id}")
                        save_item(entry, item_type)
                        seen_ids.add(entry_id)
                except Exception as e:
                    # One failing endpoint must not stop the whole sweep.
                    print(f"[ERREUR] {item_type}: {e}")

            print(f"Attente {INTERVAL}s avant prochaine vérification...\n")
            time.sleep(INTERVAL)
    except KeyboardInterrupt:
        print("Arrêt manuel.")
    finally:
        conn.close()

if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)