Skip to content

Commit 2780486

Browse files
Merge pull request #2 from PoCInnovation/scrape/github-hf-scrapers
Scrape GitHub & Hugging Face
2 parents 768b030 + abe6be6 commit 2780486

File tree

2 files changed

+360
-0
lines changed

2 files changed

+360
-0
lines changed

scrape_github.py

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
#!/usr/bin/env python3
"""
scrape_github.py

AI-oriented GitHub theme watcher — searches projects by theme (e.g. "LLM",
"diffusion", "RAG", ...) and stores summarized results in a SQLite database
for consumption by a dashboard / newsletter / alerts.

Usage:
    python scrape_github.py          # runs continuously (sleeps INTERVAL between passes)
    python scrape_github.py --once   # performs a single iteration (useful for cron/tests)

Configured via the variables at the top of this file or via environment
variables:
    - GITHUB_TOKEN: API token (optional but recommended — raises rate limits)
"""

import os
import sys
import time
import sqlite3
import requests
import argparse
from datetime import datetime
from typing import List
24+
25+
26+
# Themes (topics/keywords) that drive the GitHub search queries below.
THEMES = [
    "large-language-model",
    "llm",
    "transformer",
    "text-generation",
    "retrieval-augmented-generation",
    "rag",
    "agents",
    "chatbot",
    "fine-tuning",
    "quantization",
    "lora",
    "peft",
    "diffusion",
    "stable-diffusion",
    "image-generation",
    "multimodal",
    "speech-to-text",
    "speech-synthesis",
    "audio",
    "reinforcement-learning",
    "computer-vision",
]

# Max repositories requested per theme per iteration (GitHub caps per_page at 100).
RESULTS_PER_THEME = 20

# Seconds between iterations; default 21600 s = 6 h. Overridable via env var.
INTERVAL = int(os.getenv("GITHUB_WATCHER_INTERVAL", 21600))

# SQLite database file stored next to this script.
DB_FILE = os.path.join(os.path.dirname(__file__), "github_ai_trending.db")

# Optional token; when set, it is sent as a Bearer token on every request.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

HEADERS = {
    "Accept": "application/vnd.github+json",
    "User-Agent": "github-ai-theme-watcher/1.0"
}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"

# Module-level connection/cursor shared by all helpers in this script.
conn = sqlite3.connect(DB_FILE)
cur = conn.cursor()

# Current snapshot: one row per repository, refreshed via INSERT OR REPLACE.
cur.execute("""
CREATE TABLE IF NOT EXISTS trending_ai_projects (
    full_name TEXT PRIMARY KEY,
    name TEXT,
    description TEXT,
    stars INTEGER,
    language TEXT,
    theme TEXT,
    updated_at TEXT,
    html_url TEXT,
    last_seen TIMESTAMP
)
""")
conn.commit()

# Append-only star-count history, for trend analysis over time.
cur.execute("""
CREATE TABLE IF NOT EXISTS project_history (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    full_name TEXT,
    stars INTEGER,
    updated_at TEXT,
    captured_at TIMESTAMP
)
""")
conn.commit()
93+
94+
95+
def search_github_repos(query: str, per_page: int = RESULTS_PER_THEME) -> List[dict]:
    """Search GitHub repositories via the Search API.

    Args:
        query: full search expression (e.g. "transformer language:python").
        per_page: number of results to request (GitHub caps this at 100).

    Returns:
        The list of repository dicts from the response's "items" field
        (possibly empty).

    Raises:
        RateLimitError: when GitHub signals rate limiting (HTTP 403 or 429).
        requests.HTTPError: for any other non-2xx response.
    """
    url = "https://api.github.com/search/repositories"
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": per_page
    }
    resp = requests.get(url, headers=HEADERS, params=params, timeout=20)
    # GitHub reports primary rate limiting with 403 and secondary/abuse
    # limiting with 403 *or* 429; the original code only handled 403.
    if resp.status_code in (403, 429):
        retry_after = resp.headers.get("Retry-After")
        if retry_after and retry_after.isdigit():
            wait = int(retry_after)
        else:
            # Fall back to X-RateLimit-Reset (epoch seconds) when present.
            reset = resp.headers.get("X-RateLimit-Reset")
            if reset and reset.isdigit():
                wait = max(0, int(reset) - int(time.time()))
            else:
                wait = None
        raise RateLimitError(retry_after=wait)
    resp.raise_for_status()
    data = resp.json()
    return data.get("items", [])
114+
115+
def sanitize_text(s):
    """Coerce *s* to a string, mapping None to the empty string."""
    return "" if s is None else str(s)
119+
120+
def save_project(repo: dict, theme: str):
    """Upsert the repository snapshot and append a star-count history row.

    Args:
        repo: repository dict as returned by the GitHub Search API.
        theme: the watch theme this repository was found under.

    Both statements are committed together so that a crash between them
    cannot leave a snapshot row without its matching history entry (the
    original committed after each statement separately).
    """
    full_name = repo.get("full_name")
    name = repo.get("name")
    desc = sanitize_text(repo.get("description"))
    stars = repo.get("stargazers_count", 0)
    language = repo.get("language") or ""
    # Prefer the API's timestamps; fall back to "now" if both are absent.
    updated_at = repo.get("updated_at") or repo.get("pushed_at") or datetime.utcnow().isoformat()
    html_url = repo.get("html_url") or f"https://github.com/{full_name}"
    now = datetime.utcnow().isoformat()

    cur.execute("""
        INSERT OR REPLACE INTO trending_ai_projects
        (full_name, name, description, stars, language, theme, updated_at, html_url, last_seen)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (full_name, name, desc, stars, language, theme, updated_at, html_url, now))
    cur.execute("""
        INSERT INTO project_history (full_name, stars, updated_at, captured_at)
        VALUES (?, ?, ?, ?)
    """, (full_name, stars, updated_at, now))
    # Single commit: both rows land atomically.
    conn.commit()
143+
144+
145+
class RateLimitError(Exception):
    """Raised when the GitHub API reports a rate limit.

    Attributes:
        retry_after: suggested wait in seconds, or None when unknown.
    """

    def __init__(self, retry_after=None):
        self.retry_after = retry_after
        message = f"Rate limit hit on GitHub API. Retry after: {retry_after}"
        super().__init__(message)
149+
150+
151+
def build_query_for_theme(theme: str) -> str:
    """Build the GitHub search query string for *theme*.

    Spaces in the theme are replaced by '+', and the search is restricted
    to name/description/readme of repositories with more than 50 stars.
    """
    return f'{theme.replace(" ", "+")} in:name,description,readme stars:>50'
156+
157+
def run_once(themes=None):
    """Run one watch iteration over *themes* and persist the results.

    Args:
        themes: iterable of theme strings; defaults to the module-level
            THEMES, resolved at call time. (The original used
            ``themes=THEMES`` which bound the *original* list object at
            definition time, so the ``--themes`` command-line override —
            which rebinds the module-level name — was silently ignored.)

    Returns:
        The number of repository records saved during this iteration.
    """
    if themes is None:
        themes = THEMES  # late lookup so runtime reassignment is honored
    print(f"[{datetime.utcnow().isoformat()}] Démarrage d'une itération de veille (thèmes: {len(themes)})")
    total_saved = 0
    for theme in themes:
        try:
            q = build_query_for_theme(theme)
            print(f"-> Recherche thème '{theme}' (q={q})")
            items = search_github_repos(q)
            print(f" ↳ {len(items)} résultats récupérés pour '{theme}'")
            for repo in items:
                save_project(repo, theme)
                total_saved += 1
        except RateLimitError as rle:
            # Back off, then continue with the next theme; the current
            # theme is not retried within this iteration.
            wait = rle.retry_after or 60
            print(f"[RATE LIMIT] Limit atteint. Pause {wait} secondes.")
            time.sleep(wait)
        except Exception as e:
            # Best-effort: one failing theme must not abort the whole sweep.
            print(f"[ERREUR] thème '{theme}': {e}")
    print(f"[{datetime.utcnow().isoformat()}] Itération terminée — {total_saved} enregistrements traités.")
    return total_saved
177+
178+
def main_loop(interval=INTERVAL, once=False):
    """Drive the watcher: one iteration when *once* is True, else loop forever.

    Args:
        interval: seconds to sleep between iterations.
        once: run a single iteration and return (for cron/tests).

    The DB connection is closed on every exit path. (In the original, the
    ``once`` branch returned *before* the try/finally, so ``conn.close()``
    was skipped for ``--once`` runs.)
    """
    try:
        if once:
            run_once()
            return
        while True:
            run_once()
            print(f"Attente {interval} secondes avant la prochaine vérification...")
            time.sleep(interval)
    except KeyboardInterrupt:
        print("")
    finally:
        conn.close()
192+
193+
def parse_args():
    """Parse and return the command-line arguments for the watcher."""
    parser = argparse.ArgumentParser(description="Veille thématique GitHub orientée IA")
    parser.add_argument("--once", action="store_true",
                        help="Exécuter une unique itération et quitter")
    parser.add_argument("--interval", type=int, default=INTERVAL,
                        help="Intervalle entre itérations (secondes)")
    parser.add_argument("--themes", type=str,
                        help="Liste de thèmes séparés par des virgules (remplace la config)")
    return parser.parse_args()
199+
200+
if __name__ == "__main__":
    args = parse_args()
    if args.themes:
        # Rebinds the module-level THEMES list from the CLI override.
        # NOTE(review): run_once's default parameter binds the *original*
        # THEMES list at definition time, so verify this override actually
        # reaches run_once when it is called without arguments.
        THEMES = [t.strip() for t in args.themes.split(",") if t.strip()]
        print(f"Themes remplacés: {THEMES}")

    INTERVAL = args.interval

    print("Github AI Theme Watcher démarré.")
    # NOTE(review): both branches print an empty line — these look like
    # placeholder messages (e.g. "token found" / "no token"); confirm intent.
    if GITHUB_TOKEN:
        print("")
    else:
        print("")

    main_loop(interval=INTERVAL, once=args.once)

scrape_hf.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
"""Hugging Face Hub watcher — polls the public HF API for newly published
models/datasets/spaces/collections/organizations/papers and stores them in
a local SQLite database."""

import os
import time
import sqlite3
import requests
from datetime import datetime

# Seconds between polling iterations.
INTERVAL = 300

# SQLite database file stored next to this script.
DB_FILE = os.path.join(os.path.dirname(__file__), "huggingface_hub.db")
conn = sqlite3.connect(DB_FILE)
cur = conn.cursor()

# Single table for every kind of Hub item, discriminated by the `type` column.
# NOTE(review): `id` alone is the PRIMARY KEY — two items of *different*
# types sharing an id would collide; confirm Hub ids are globally unique.
cur.execute("""
CREATE TABLE IF NOT EXISTS hubs (
    id TEXT PRIMARY KEY,
    name TEXT,
    author TEXT,
    likes INTEGER,
    downloads INTEGER,
    task TEXT,
    last_modified TEXT,
    type TEXT,
    url TEXT
)
""")
conn.commit()
27+
28+
29+
def fetch_models():
    """Fetch the most recently modified models from the Hugging Face API."""
    endpoint = "https://huggingface.co/api/models?sort=lastModified&direction=-1&limit=20"
    response = requests.get(endpoint, timeout=20)
    response.raise_for_status()
    return response.json()
35+
36+
def fetch_datasets():
    """Fetch the most recently modified datasets from the Hugging Face API."""
    endpoint = "https://huggingface.co/api/datasets?sort=lastModified&direction=-1&limit=20"
    response = requests.get(endpoint, timeout=20)
    response.raise_for_status()
    return response.json()
42+
43+
def fetch_spaces():
    """Fetch the most recently modified Spaces from the Hugging Face API."""
    endpoint = "https://huggingface.co/api/spaces?sort=lastModified&direction=-1&limit=20"
    response = requests.get(endpoint, timeout=20)
    response.raise_for_status()
    return response.json()
49+
50+
def fetch_collections():
    """Fetch recent collections; an absent endpoint (404) yields an empty list."""
    endpoint = "https://huggingface.co/api/collections?sort=lastModified&direction=-1&limit=20"
    response = requests.get(endpoint, timeout=20)
    if response.status_code == 404:
        return []
    response.raise_for_status()
    return response.json()
58+
59+
def fetch_organizations():
    """Fetch organizations from the Hugging Face API (no sort parameter)."""
    endpoint = "https://huggingface.co/api/organizations?limit=20"
    response = requests.get(endpoint, timeout=20)
    response.raise_for_status()
    return response.json()
65+
66+
def fetch_papers():
    """Fetch recent research papers; best-effort — returns [] on any failure."""
    endpoint = "https://huggingface.co/api/papers?sort=lastModified&direction=-1&limit=20"
    try:
        response = requests.get(endpoint, timeout=20)
        if response.status_code == 404:
            return []
        response.raise_for_status()
        return response.json()
    except Exception:
        # Deliberate best-effort: the papers endpoint may not be reachable.
        return []
77+
78+
79+
def save_item(item, item_type):
    """Insert one Hub item into the `hubs` table, skipping existing ids.

    Args:
        item: raw item dict as returned by the Hugging Face API.
        item_type: one of "model", "dataset", "space", "collection",
            "organization", "paper" — stored in the `type` column.

    NOTE(review): the table's PRIMARY KEY is `id` alone, so items of
    different types that share an id silently collide (INSERT OR IGNORE) —
    confirm ids are globally unique across types.
    """
    # Normalize pipeline_tag: it may be absent, None, a string, or a list.
    # (The original's item.get("pipeline_tag", "") returns None — not "" —
    # when the key is present with a None value, storing NULL inconsistently.)
    tag = item.get("pipeline_tag")
    if isinstance(tag, list):
        task = ", ".join(tag)
    else:
        task = tag or ""
    cur.execute("""
        INSERT OR IGNORE INTO hubs (id, name, author, likes, downloads, task, last_modified, type, url)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        item.get("id"),
        item.get("name") or item.get("modelId") or item.get("id"),
        item.get("author") or item.get("organization", ""),
        item.get("likes", 0),
        item.get("downloads", 0),
        task,
        item.get("lastModified") or item.get("last_modified") or datetime.utcnow().isoformat(),
        item_type,
        build_url(item, item_type)
    ))
    conn.commit()
95+
96+
def build_url(item, item_type):
    """Return the public huggingface.co URL for *item* of kind *item_type*."""
    base = "https://huggingface.co"
    item_id = item.get("id")
    if item_type == "paper":
        return f"{base}/papers/{item_id}"
    if item_type in ("model", "dataset", "space", "collection", "organization"):
        return f"{base}/{item_id}"
    return base
104+
105+
def load_seen_ids():
    """Return the set of item ids already stored in the `hubs` table."""
    cur.execute("SELECT id FROM hubs")
    return {row[0] for row in cur.fetchall()}
108+
109+
110+
def main():
    """Poll the Hugging Face Hub forever, persisting newly seen items."""
    print("Initialisation Hugging Face Hub Watcher...")
    seen_ids = load_seen_ids()
    print(f"{len(seen_ids)} éléments déjà enregistrés.")

    # (kind, fetcher) pairs, polled in order on every pass.
    sources = [
        ("model", fetch_models),
        ("dataset", fetch_datasets),
        ("space", fetch_spaces),
        ("collection", fetch_collections),
        ("organization", fetch_organizations),
        ("paper", fetch_papers),
    ]

    try:
        while True:
            for item_type, fetch_func in sources:
                try:
                    for entry in fetch_func():
                        entry_id = entry.get("id")
                        if not entry_id or entry_id in seen_ids:
                            continue
                        print(f"[NOUVEAU {item_type.upper()}] {entry_id}")
                        save_item(entry, item_type)
                        seen_ids.add(entry_id)
                except Exception as e:
                    # One failing endpoint must not stop the whole sweep.
                    print(f"[ERREUR] {item_type}: {e}")

            print(f"Attente {INTERVAL}s avant prochaine vérification...\n")
            time.sleep(INTERVAL)
    except KeyboardInterrupt:
        print("Arrêt manuel.")
    finally:
        conn.close()

if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)