Commit c4c5d94

Merge pull request #4 from PoCInnovation/scrapper_formating
Scrapper formating
2 parents 2780486 + 95d9b1d commit c4c5d94

File tree

6 files changed: +563 −0 lines changed


scrap/medium_scraping.py

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
import feedparser
from datetime import datetime
from typing import List, Dict, Optional
import time

SOURCE_SITE = "medium"

RSS_FEEDS = [
    "https://medium.com/feed/tag/artificial-intelligence",
    "https://medium.com/feed/tag/machine-learning",
    "https://medium.com/feed/tag/deep-learning",
    "https://medium.com/feed/tag/ai",
]

def normalize_medium_entry(entry: feedparser.FeedParserDict) -> Dict:
    """Normalize a Medium RSS entry into the unified format."""
    entry_id = entry.get('link', '')

    published_date = datetime.utcnow().isoformat()
    if getattr(entry, "published_parsed", None):
        published_date = datetime.fromtimestamp(time.mktime(entry.published_parsed)).isoformat()

    keywords = [tag.term for tag in entry.get('tags', [])] if 'tags' in entry else []

    return {
        "id": entry_id,
        "source_site": SOURCE_SITE,
        "title": entry.get('title', 'N/A'),
        "description": entry.get('summary', 'N/A'),
        "author_info": entry.get('author', 'N/A'),
        "keywords": ", ".join(keywords),
        "content_url": entry_id,
        "published_date": published_date,
        "item_type": "article",
    }

def scrape_medium(max_articles_per_feed: int = 10) -> List[Dict]:
    """Scrape the Medium RSS feeds and return the unified items."""
    all_items = []
    unique_links = set()

    for feed_url in RSS_FEEDS:
        print(f"📡 Fetching RSS: {feed_url}")
        try:
            feed = feedparser.parse(feed_url)

            for entry in feed.entries[:max_articles_per_feed]:
                link = entry.get('link')
                if link and link not in unique_links:
                    all_items.append(normalize_medium_entry(entry))
                    unique_links.add(link)

        except Exception as e:
            print(f"❌ Error fetching {feed_url}: {e}")
        time.sleep(1)

    return all_items

if __name__ == "__main__":
    results = scrape_medium(max_articles_per_feed=2)
    print(f"Total Medium items scraped: {len(results)}")
    if results:
        print("\nExample unified item:")
        import json
        print(json.dumps(results[0], indent=2))

scrap/scrap_arxiv.py

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
import arxiv
from datetime import datetime
from typing import List, Dict

# Constants for the monitoring tool
SOURCE_SITE = "arxiv"
CATEGORY = "cs.LG"

def normalize_arxiv_result(paper: arxiv.Result) -> Dict:
    """Normalize an arXiv result into the unified format."""

    authors = ", ".join([a.name for a in paper.authors])

    link = paper.entry_id

    keywords_list = [paper.primary_category]
    if paper.categories:
        keywords_list.extend(paper.categories)

    return {
        "id": link,
        "source_site": SOURCE_SITE,
        "title": paper.title.replace('\n', ' '),
        "description": paper.summary.replace('\n', ' '),
        "author_info": authors,
        "keywords": ", ".join(keywords_list),
        "content_url": link,
        "published_date": paper.published.isoformat(),
        "item_type": "paper",
    }

def scrape_arxiv(category: str = CATEGORY, max_results: int = 10) -> List[Dict]:
    """Scrape arXiv for one category and return the unified items."""

    try:
        search = arxiv.Search(
            query=f"cat:{category}",
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending
        )

        normalized_results = []
        for result in search.results():
            normalized_results.append(normalize_arxiv_result(result))

        return normalized_results

    except Exception as e:
        print(f"[ERROR] arXiv Search: {e}")
        return []

if __name__ == "__main__":
    results = scrape_arxiv(max_results=5)
    print(f"Total arXiv items scraped: {len(results)}")
    if results:
        print("\nExample unified item:")
        import json
        print(json.dumps(results[0], indent=2))

scrap/scrap_le_monde.py

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
import feedparser
import time
from datetime import datetime
from typing import List, Dict

SOURCE_SITE = "le_monde"

FEEDS = [
    "https://www.lemonde.fr/international/rss_full.xml",
    "https://www.lemonde.fr/actualite-medias/rss_full.xml",
    "https://www.lemonde.fr/en_continu/rss_full.xml"
]

def normalize_lemonde_entry(entry: feedparser.FeedParserDict, feed_url: str) -> Dict:
    """Normalize a Le Monde RSS entry into the unified format."""
    entry_id = getattr(entry, "id", None) or getattr(entry, "link", None)

    published_date = datetime.utcnow().isoformat()
    if getattr(entry, "published_parsed", None):
        published_date = datetime.fromtimestamp(time.mktime(entry.published_parsed)).isoformat()
    elif getattr(entry, "updated_parsed", None):
        published_date = datetime.fromtimestamp(time.mktime(entry.updated_parsed)).isoformat()

    category = "actualité générale"
    if "international" in feed_url:
        category = "international"
    elif "medias" in feed_url:
        category = "actualité médias"
    elif "continu" in feed_url:
        category = "en continu"

    return {
        "id": entry_id,
        "source_site": SOURCE_SITE,
        "title": getattr(entry, "title", ""),
        "description": getattr(entry, "summary", ""),
        "author_info": getattr(entry, "author", "Le Monde"),
        "keywords": category,
        "content_url": getattr(entry, "link", ""),
        "published_date": published_date,
        "item_type": "article",
    }

def scrape_lemonde(feeds: List[str] = FEEDS) -> List[Dict]:
    """Scrape the Le Monde RSS feeds and return the unified items."""
    all_items = []
    unique_ids = set()

    for feed_url in feeds:
        try:
            d = feedparser.parse(feed_url)

            for entry in d.entries:
                entry_id = getattr(entry, "id", None) or getattr(entry, "link", None)
                if entry_id and entry_id not in unique_ids:
                    all_items.append(normalize_lemonde_entry(entry, feed_url))
                    unique_ids.add(entry_id)

        except Exception as e:
            print(f"[ERROR] fetching feed {feed_url}: {e}")
        time.sleep(1)

    return all_items

if __name__ == "__main__":
    results = scrape_lemonde(feeds=FEEDS[:1])
    print(f"Total Le Monde items scraped: {len(results)}")
    if results:
        print("\nExample unified item:")
        import json
        print(json.dumps(results[0], indent=2))

scrap/scrape_github.py

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
import os
import requests
from datetime import datetime, UTC
from typing import List, Dict

SOURCE_SITE = "github"
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")


THEMES = [
    "large-language-model", "llm", "transformer", "text-generation", "retrieval-augmented-generation",
    "rag", "agents", "chatbot", "fine-tuning", "quantization", "lora", "peft",
    "diffusion", "stable-diffusion", "image-generation", "multimodal",
    "speech-to-text", "speech-synthesis", "audio", "reinforcement-learning",
    "computer-vision",
]

HEADERS = {
    "Accept": "application/vnd.github+json",
    "User-Agent": "github-ai-theme-watcher/1.0"
}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"

class RateLimitError(Exception):
    def __init__(self, retry_after=None):
        self.retry_after = retry_after
        super().__init__("Rate limit hit on GitHub API. Retry after: {}".format(retry_after))

def sanitize_text(s):
    return str(s) if s is not None else ""

def normalize_github_repo(repo: Dict, theme: str) -> Dict:
    full_name = repo.get("full_name")
    keywords_list = [theme, repo.get("language") or ""]
    if repo.get("topics"):
        keywords_list.extend(repo.get("topics"))
    updated_at = repo.get("updated_at") or repo.get("pushed_at") or datetime.now(UTC).isoformat()
    return {
        "id": full_name, "source_site": SOURCE_SITE, "title": repo.get("name"),
        "description": sanitize_text(repo.get("description")), "author_info": repo.get("owner", {}).get("login", ""),
        "keywords": ", ".join(filter(None, keywords_list)), "content_url": repo.get("html_url") or f"https://github.com/{full_name}",
        "published_date": updated_at, "item_type": "repository",
    }

def build_query_for_theme(theme: str) -> str:
    theme_token = theme.replace(" ", "+")
    q = f"{theme_token} in:name,description,readme stars:>50"
    return q


def search_github_repos(query: str, per_page: int = 20) -> List[Dict]:
    """
    Search GitHub repositories.
    Raises RateLimitError or returns a List[Dict] (empty or not).
    """
    url = "https://api.github.com/search/repositories"
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": per_page
    }

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=20)

        if resp.status_code == 403:
            retry_after = resp.headers.get("Retry-After")
            raise RateLimitError(retry_after=int(retry_after) if retry_after and retry_after.isdigit() else None)

        if resp.status_code != 200:
            print(f"[WARN] HTTP Status {resp.status_code} for query: {query}")
            return []

        data = resp.json()
        return data.get("items", [])

    except RateLimitError:
        raise
    except requests.exceptions.RequestException as e:
        print(f"[CONNECTION/HTTP ERROR] GitHub Search: {e}")
        return []
    except Exception as e:
        print(f"[UNKNOWN/JSON ERROR] GitHub Search: {e}")
        return []


def scrape_github(themes: List[str] = THEMES, limit_per_theme: int = 20) -> List[Dict]:
    """Scrape GitHub for the given themes and return the unified items."""

    all_items = []
    stop_scraping = False

    try:
        for theme in themes:
            if stop_scraping:
                break

            q = build_query_for_theme(theme)
            print(f"-> Searching theme '{theme}' (q={q})")

            try:
                items = search_github_repos(q, limit_per_theme)

                if not isinstance(items, list):
                    print(f"[FATAL WARN] search_github_repos returned {type(items)} instead of list. Stopping.")
                    stop_scraping = True
                    continue

                normalized_items = [normalize_github_repo(repo, theme) for repo in items]
                all_items.extend(normalized_items)

            except RateLimitError:
                print("[RATE LIMIT] Limit reached. Stopping the GitHub watch for this run.")
                stop_scraping = True
            except Exception as e:
                print(f"[THEME ERROR] '{theme}': {e}")
                continue

    finally:
        return all_items

if __name__ == "__main__":
    results = scrape_github(themes=["llm"], limit_per_theme=5)
    print(f"Total GitHub items scraped: {len(results)}")
    if results:
        import json
        print("\nExample unified item:")
        print(json.dumps(results[0], indent=2))
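
All four scrapers return items with the same unified keys (id, source_site, title, description, author_info, keywords, content_url, published_date, item_type), so their outputs can be merged directly. Below is a minimal aggregation sketch, assuming the scrap directory is importable as a package; the aggregate_all helper is hypothetical and not part of this commit.

from typing import Dict, List

# Hypothetical aggregator (not in this diff): illustrates how the shared
# unified schema lets the four scrapers be combined and deduplicated.
from scrap.medium_scraping import scrape_medium
from scrap.scrap_arxiv import scrape_arxiv
from scrap.scrap_le_monde import scrape_lemonde
from scrap.scrape_github import scrape_github

def aggregate_all() -> List[Dict]:
    """Run every scraper and deduplicate on the unified 'id' field."""
    items: List[Dict] = []
    seen = set()
    for scraper in (scrape_medium, scrape_arxiv, scrape_lemonde, scrape_github):
        for item in scraper():
            if item["id"] not in seen:
                seen.add(item["id"])
                items.append(item)
    return items

if __name__ == "__main__":
    print(f"Total items from all sources: {len(aggregate_all())}")

Deduplication reuses the unified id field, mirroring the per-source dedup each scraper already performs on links or entry ids.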
