diff --git a/scrap/arxiv/arxiv_papers.db b/scrap/arxiv/arxiv_papers.db
new file mode 100644
index 0000000..526e51f
Binary files /dev/null and b/scrap/arxiv/arxiv_papers.db differ
diff --git a/scrap/arxiv/category.md b/scrap/arxiv/category.md
new file mode 100644
index 0000000..ccdc1db
--- /dev/null
+++ b/scrap/arxiv/category.md
@@ -0,0 +1,8 @@
+| Code           | Domain                                   |
+| -------------- | ---------------------------------------- |
+| cs.AI          | Artificial Intelligence                  |
+| cs.CL          | Computation and Language                 |
+| cs.CV          | Computer Vision and Pattern Recognition  |
+| math.PR        | Probability                              |
+| stat.ML        | Machine Learning (Statistics)            |
+| physics.gen-ph | General Physics                          |
diff --git a/scrap/arxiv/scrap_arxiv.py b/scrap/arxiv/scrap_arxiv.py
new file mode 100644
index 0000000..3a10db7
--- /dev/null
+++ b/scrap/arxiv/scrap_arxiv.py
@@ -0,0 +1,61 @@
+import time
+import os
+import sqlite3
+import arxiv
+
+CATEGORY = "cs.LG"  # check in category.md
+INTERVAL = 300  # seconds
+DB_FILE = os.path.join(os.path.dirname(__file__), "arxiv_papers.db")
+
+conn = sqlite3.connect(DB_FILE)
+cursor = conn.cursor()
+cursor.execute("""
+CREATE TABLE IF NOT EXISTS papers (
+    id TEXT PRIMARY KEY,
+    title TEXT,
+    authors TEXT,
+    published TEXT,
+    summary TEXT,
+    link TEXT
+)
+""")
+conn.commit()
+
+def save_paper(paper):
+    cursor.execute("""
+        INSERT OR IGNORE INTO papers (id, title, authors, published, summary, link)
+        VALUES (?, ?, ?, ?, ?, ?)
+    """, (
+        paper.entry_id,
+        paper.title,
+        ", ".join([a.name for a in paper.authors]),
+        paper.published.isoformat(),
+        paper.summary,
+        paper.entry_id
+    ))
+    conn.commit()
+
+cursor.execute("SELECT id FROM papers")
+seen_ids = set(row[0] for row in cursor.fetchall())  # ids already stored from previous runs
+
+while True:
+    search = arxiv.Search(
+        query=f"cat:{CATEGORY}",
+        max_results=10,
+        sort_by=arxiv.SortCriterion.SubmittedDate,
+        sort_order=arxiv.SortOrder.Descending
+    )
+
+    for result in search.results():  # deprecated in recent arxiv releases in favour of arxiv.Client().results(search)
+        if result.entry_id not in seen_ids:
+            print("NEW PAPER!")
+            print("Title:", result.title)
+            print("Authors:", ", ".join([author.name for author in result.authors]))
+            print("Published:", result.published)
+            print("Link:", result.entry_id)
+            print("="*80)
+
+            save_paper(result)
+            seen_ids.add(result.entry_id)
+
+    time.sleep(INTERVAL)
diff --git a/scrap/le_monde/lemonde_articles.db b/scrap/le_monde/lemonde_articles.db
new file mode 100644
index 0000000..a04385a
Binary files /dev/null and b/scrap/le_monde/lemonde_articles.db differ
diff --git a/scrap/le_monde/scrap_le_monde.py b/scrap/le_monde/scrap_le_monde.py
new file mode 100644
index 0000000..beae010
--- /dev/null
+++ b/scrap/le_monde/scrap_le_monde.py
@@ -0,0 +1,95 @@
+import time
+import os
+import sqlite3
+import feedparser
+from datetime import datetime
+
+FEEDS = [
+    # Per-category feeds
+    "https://www.lemonde.fr/bresil/rss_full.xml",
+    "https://www.lemonde.fr/international/rss_full.xml",
+    "https://www.lemonde.fr/actualite-medias/rss_full.xml",
+    # Continuous "live news" feed
+    "https://www.lemonde.fr/en_continu/rss_full.xml"
+]
+
+DB_FILE = os.path.join(os.path.dirname(__file__), "lemonde_articles.db")
+INTERVAL = 300  # seconds
+
+conn = sqlite3.connect(DB_FILE, detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES)
+cur = conn.cursor()
+cur.execute("""
+CREATE TABLE IF NOT EXISTS articles (
+    id TEXT PRIMARY KEY,
+    title TEXT,
+    published TIMESTAMP,
+    summary TEXT,
+    link TEXT,
+    feed TEXT
+)
+""")
+conn.commit()
+
+def save_article(entry, feed_url):
+    """
+    entry: feedparser entry object
+    """
+    entry_id = getattr(entry, "id", None) or getattr(entry, "link", None)
+    title = getattr(entry, "title", "")
+    link = getattr(entry, "link", "")
+    summary = getattr(entry, "summary", "")
+    published = None
+    if getattr(entry, "published_parsed", None):
+        published = datetime.fromtimestamp(time.mktime(entry.published_parsed))
+    elif getattr(entry, "updated_parsed", None):
+        published = datetime.fromtimestamp(time.mktime(entry.updated_parsed))
+    else:
+        published = datetime.utcnow()  # fallback when the feed entry carries no date
+
+    cur.execute("""
+        INSERT OR IGNORE INTO articles (id, title, published, summary, link, feed)
+        VALUES (?, ?, ?, ?, ?, ?)
+    """, (entry_id, title, published, summary, link, feed_url))
+    conn.commit()
+
+def load_seen_ids():
+    cur.execute("SELECT id FROM articles")
+    return set(row[0] for row in cur.fetchall())
+
+def fetch_feed(feed_url):
+    return feedparser.parse(feed_url)
+
+def main():
+    print("Initializing...")
+    seen_ids = load_seen_ids()
+    print(f"{len(seen_ids)} articles already in the database.")
+    try:
+        while True:
+            for feed in FEEDS:
+                try:
+                    d = fetch_feed(feed)
+                    if d.bozo:  # feedparser sets the bozo bit when a feed is malformed
+                        print(f"[WARN] Problem reading feed {feed}: {getattr(d, 'bozo_exception', '')}")
+                        continue
+
+                    for entry in d.entries:
+                        entry_id = getattr(entry, "id", None) or getattr(entry, "link", None)
+                        if entry_id is None:
+                            continue
+                        if entry_id not in seen_ids:
+                            print(f"[NEW] {entry.get('title','(no title)')}")
+                            print(" ->", entry.get("link",""))
+                            save_article(entry, feed)
+                            seen_ids.add(entry_id)
+                except Exception as e:
+                    print(f"[ERROR] fetching feed {feed}: {e}")
+
+            print(f"Waiting {INTERVAL}s before the next check...")
+            time.sleep(INTERVAL)
+    except KeyboardInterrupt:
+        print("Stopped by user.")
+    finally:
+        conn.close()
+
+if __name__ == "__main__":
+    main()
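
Note on the arXiv loop in scrap_arxiv.py: Search.results() still works but is deprecated in recent releases of the arxiv package in favour of an explicit client. A minimal sketch of the replacement call, assuming a 2.x version of the package is installed; the page_size, delay_seconds and num_retries values are illustrative, not taken from the diff:

import arxiv

CATEGORY = "cs.LG"

# A single shared client handles paging, rate limiting and retries.
client = arxiv.Client(page_size=10, delay_seconds=3.0, num_retries=3)

search = arxiv.Search(
    query=f"cat:{CATEGORY}",
    max_results=10,
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending,
)

for result in client.results(search):
    print(result.entry_id, result.title)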
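
Both scripts only ever write to SQLite; nothing in the diff reads the data back. A minimal sketch of querying the most recent rows from arxiv_papers.db, assuming the path below relative to the repository root; lemonde_articles.db can be read the same way, ordering by its TIMESTAMP published column:

import sqlite3

# Assumed location of the database created by scrap_arxiv.py; adjust as needed.
conn = sqlite3.connect("scrap/arxiv/arxiv_papers.db")
cur = conn.cursor()

# `published` is stored as an ISO-8601 string, so lexical ordering is chronological.
cur.execute("""
    SELECT published, title, link
    FROM papers
    ORDER BY published DESC
    LIMIT 5
""")
for published, title, link in cur.fetchall():
    print(published, title)
    print("   ", link)

conn.close()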