1+ import os
2+ import requests
3+ from datetime import datetime , UTC
4+ from typing import List , Dict
5+
SOURCE_SITE = "github"
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# AI/ML topic keywords that drive the GitHub repository search.
THEMES = [
    "large-language-model", "llm", "transformer", "text-generation", "retrieval-augmented-generation",
    "rag", "agents", "chatbot", "fine-tuning", "quantization", "lora", "peft",
    "diffusion", "stable-diffusion", "image-generation", "multimodal",
    "speech-to-text", "speech-synthesis", "audio", "reinforcement-learning",
    "computer-vision",
]

# Base request headers for the GitHub REST API; the bearer token is
# attached only when GITHUB_TOKEN is present in the environment.
HEADERS = {
    "Accept": "application/vnd.github+json",
    "User-Agent": "github-ai-theme-watcher/1.0",
}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"
24+
class RateLimitError(Exception):
    """Raised when the GitHub API reports that the rate limit was exceeded.

    Attributes:
        retry_after: Seconds to wait before retrying, or None when unknown.
    """

    def __init__(self, retry_after=None):
        self.retry_after = retry_after
        super().__init__(f"Rate limit hit on GitHub API. Retry after: {retry_after}")
29+
def sanitize_text(s):
    """Coerce *s* to a string, mapping None to the empty string."""
    return "" if s is None else str(s)
32+
def normalize_github_repo(repo: Dict, theme: str) -> Dict:
    """Map a raw GitHub search-API repository dict onto the unified item schema.

    Args:
        repo: One item from the GitHub /search/repositories response
            (assumed keys: full_name, name, description, owner, language,
            topics, html_url, updated_at, pushed_at — all optional).
        theme: The search theme that surfaced this repository; folded into
            the keywords field.

    Returns:
        Dict with the unified keys: id, source_site, title, description,
        author_info, keywords, content_url, published_date, item_type.
    """
    full_name = repo.get("full_name")

    # Keywords: the theme, the primary language, plus any repo topics;
    # empty entries are filtered out when joining below.
    keywords_list = [theme, repo.get("language") or ""]
    topics = repo.get("topics")
    if topics:
        keywords_list.extend(topics)

    # Prefer the last-update timestamp; fall back to push time, then "now" (UTC).
    updated_at = repo.get("updated_at") or repo.get("pushed_at") or datetime.now(UTC).isoformat()

    # Fix: only synthesize a fallback URL when full_name is actually known —
    # the previous code produced "https://github.com/None" for missing names.
    content_url = repo.get("html_url") or (f"https://github.com/{full_name}" if full_name else "")

    return {
        "id": full_name,
        "source_site": SOURCE_SITE,
        "title": repo.get("name"),
        "description": sanitize_text(repo.get("description")),
        "author_info": repo.get("owner", {}).get("login", ""),
        "keywords": ", ".join(filter(None, keywords_list)),
        "content_url": content_url,
        "published_date": updated_at,
        "item_type": "repository",
    }
45+
def build_query_for_theme(theme: str) -> str:
    """Build a GitHub search query string for *theme*.

    Spaces in the theme are replaced with '+', and the search is restricted
    to repositories matching in name/description/readme with more than 50 stars.
    """
    token = theme.replace(" ", "+")
    return f"{token} in:name,description,readme stars:>50"
50+
51+
def search_github_repos(query: str, per_page: int = 20) -> List[Dict]:
    """Search GitHub repositories for *query*, sorted by stars (descending).

    Args:
        query: A GitHub search-qualifier string (see build_query_for_theme).
        per_page: Maximum number of results to request (GitHub caps this at 100).

    Returns:
        The "items" list from the search response, or [] on any HTTP,
        connection, or JSON-parsing error.

    Raises:
        RateLimitError: When GitHub answers 403 (primary rate limit) or
            429 (secondary rate limit), carrying the Retry-After delay in
            seconds when that header is present and numeric.
    """
    url = "https://api.github.com/search/repositories"
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": per_page,
    }

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=20)

        # Fix: GitHub signals secondary rate limiting with HTTP 429 (and
        # primary with 403); the old code let 429 fall through to the
        # generic warning path and silently returned [].
        if resp.status_code in (403, 429):
            retry_after = resp.headers.get("Retry-After")
            raise RateLimitError(
                retry_after=int(retry_after) if retry_after and retry_after.isdigit() else None
            )

        if resp.status_code != 200:
            print(f"[WARN] HTTP Status {resp.status_code} for query: {query}")
            return []

        data = resp.json()
        return data.get("items", [])

    except RateLimitError:
        # Propagate rate-limit signals to the caller untouched.
        raise
    except requests.exceptions.RequestException as e:
        print(f"[ERREUR CONNEXION/HTTP] GitHub Search: {e}")
        return []
    except Exception as e:  # e.g. malformed JSON body
        print(f"[ERREUR INCONNUE/JSON] GitHub Search: {e}")
        return []
87+
88+
def scrape_github(themes: List[str] = THEMES, limit_per_theme: int = 20) -> List[Dict]:
    """Scrape GitHub for the given themes and return the unified items.

    Iterates over *themes*, searching repositories for each one and
    normalizing every hit. Stops early when the GitHub rate limit is hit
    or when the search helper returns a non-list; per-theme errors are
    logged and that theme is skipped.

    Args:
        themes: Search themes to process (defaults to the module-level THEMES).
        limit_per_theme: Maximum repositories fetched per theme.

    Returns:
        List of unified item dicts (possibly empty).
    """
    all_items = []

    # Fix: the previous version wrapped this loop in `try/finally: return all_items`.
    # A `return` inside `finally` silently swallows ANY in-flight exception —
    # including ones raised by build_query_for_theme (which sits outside the
    # inner try) and even KeyboardInterrupt/SystemExit. Plain break/return
    # keeps the same happy-path behavior without hiding failures.
    for theme in themes:
        q = build_query_for_theme(theme)
        print(f"-> Recherche thème '{theme}' (q={q})")

        try:
            items = search_github_repos(q, limit_per_theme)

            # Defensive guard: a non-list result means the helper misbehaved.
            if not isinstance(items, list):
                print(f"[FATAL WARN] search_github_repos a retourné {type(items)} au lieu de list. Arrêt.")
                break

            all_items.extend(normalize_github_repo(repo, theme) for repo in items)

        except RateLimitError:
            print("[RATE LIMIT] Limite atteinte. Arrêt de la veille GitHub pour cette itération.")
            break
        except Exception as e:
            # A failure on one theme should not abort the whole run.
            print(f"[ERREUR THÈME] '{theme}': {e}")

    return all_items
123+
if __name__ == "__main__":
    # Smoke run: scrape a single theme with a small page size and show a sample.
    demo_items = scrape_github(themes=["llm"], limit_per_theme=5)
    print(f"Total GitHub items scraped: {len(demo_items)}")
    if demo_items:
        import json

        print("\n Exemple d'élément unifié:")
        print(json.dumps(demo_items[0], indent=2))