+import asyncio
+import datetime
+import time
+from typing import List, Optional
+
+import aiohttp
+import click
+import structlog
+import tqdm
+from html2text import HTML2Text
+
+from clients import supabase_client
+from data_types import Post
+
+# Set up logging
+logger = structlog.get_logger()
+
+# Initialize HTML to text converter
+h = HTML2Text()
+h.ignore_links = True
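+# For reference, with ignore_links=True the converter keeps anchor text but
+# drops the link markup, e.g. (illustrative):
+#     h.handle("<p>Ask <a href='https://example.com'>HN</a></p>")  # -> "Ask HN\n\n"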
+
+
+async def fetch_item(session: aiohttp.ClientSession, item_id: int) -> Optional[Post]:
+    """Fetch a single HN item and convert it to a Post."""
+    async with session.get(f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json") as response:
+        if response.status != 200:
+            logger.error("Failed to fetch item", item_id=item_id, status=response.status, body=await response.text())
+            return None
+
+        item = await response.json()
+        # Skip missing, dead, or deleted items
+        if not item or item.get("dead") or item.get("deleted"):
+            return None
+
+        return {
+            "url": item.get("url", f"https://news.ycombinator.com/item?id={item_id}"),
+            "time_added": item["time"],
+            "source": "hackernews",
+            "tags": ["hackernews:top"],
+            "title": item["title"],
+            "abstract": h.handle(item.get("text", "")),
+            "attrs": {
+                "hn_id": item_id,
+                "score": item.get("score", 0),
+                "by": item.get("by", ""),
+                "descendants": item.get("descendants", 0),
+            },
+            "links": {},
+        }
+
+
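+# A minimal retry sketch around fetch_item, assuming transient aiohttp
+# failures are worth retrying with exponential backoff. This helper is
+# hypothetical and not wired into the commands below.
+async def fetch_item_with_retry(
+    session: aiohttp.ClientSession, item_id: int, attempts: int = 3
+) -> Optional[Post]:
+    for attempt in range(attempts):
+        try:
+            return await fetch_item(session, item_id)
+        except aiohttp.ClientError:
+            if attempt == attempts - 1:
+                raise
+            await asyncio.sleep(2 ** attempt)  # back off: 1s, then 2s
+    return None
+
+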
+async def fetch_top_stories(limit: int) -> List[Post]:
+    """Fetch the top `limit` stories from the HN Firebase API."""
+    async with aiohttp.ClientSession() as session:
+        # First get the list of top story IDs
+        async with session.get("https://hacker-news.firebaseio.com/v0/topstories.json") as response:
+            if response.status != 200:
+                logger.error("Failed to fetch top stories", status=response.status, body=await response.text())
+                return []
+
+            story_ids = await response.json()
+            story_ids = story_ids[:limit]
+
+        # Fetch each story concurrently
+        tasks = [fetch_item(session, story_id) for story_id in story_ids]
+        posts = await asyncio.gather(*tasks)
+
+    # Filter out items that were dead, deleted, or failed to fetch
+    return [post for post in posts if post is not None]
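+# Example usage (illustrative):
+#     posts = asyncio.run(fetch_top_stories(30))
+#     print(posts[0]["title"])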
+
+
+async def fetch_stories_by_date(date_str: str) -> List[Post]:
+    """Fetch stories from the HN Algolia API for a specific date.
+
+    Args:
+        date_str: Date in YYYY-MM-DD format (interpreted as UTC).
+    """
+    # Convert the date string to a UTC day window; Algolia's created_at_i
+    # field is a Unix timestamp, so a naive local-time parse would shift
+    # the window by the local UTC offset.
+    date = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
+    start_timestamp = int(date.timestamp())
+    end_timestamp = start_timestamp + 86400  # 24 hours in seconds
+
+    # Construct the Algolia API URL; note this fetches only the first
+    # page of results (up to 100 hits)
+    url = "https://hn.algolia.com/api/v1/search_by_date"
+    params = {
+        "tags": "story",
+        "numericFilters": f"created_at_i>={start_timestamp},created_at_i<{end_timestamp}",
+        "hitsPerPage": 100,
+    }
| 87 | + |
| 88 | + async with aiohttp.ClientSession() as session: |
| 89 | + async with session.get(url, params=params) as response: |
| 90 | + if response.status != 200: |
| 91 | + logger.error("Failed to fetch stories: %s", await response.text()) |
| 92 | + return [] |
| 93 | + |
| 94 | + data = await response.json() |
| 95 | + hits = data.get("hits", []) |
| 96 | + |
| 97 | + posts = [] |
| 98 | + for hit in hits: |
| 99 | + if hit.get("dead") or hit.get("deleted"): |
| 100 | + continue |
| 101 | + |
| 102 | + posts.append({ |
| 103 | + "url": hit.get("url", f"https://news.ycombinator.com/item?id={hit['objectID']}"), |
| 104 | + "time_added": hit["created_at_i"], |
| 105 | + "source": "hackernews", |
| 106 | + "tags": ["hackernews:top"], |
| 107 | + "title": hit["title"], |
| 108 | + "abstract": h.handle(hit.get("story_text", "")), |
| 109 | + "attrs": { |
| 110 | + "hn_id": int(hit["objectID"]), |
| 111 | + "score": hit.get("points", 0), |
| 112 | + "by": hit.get("author", ""), |
| 113 | + "descendants": hit.get("num_comments", 0), |
| 114 | + }, |
| 115 | + "links": {}, |
| 116 | + }) |
| 117 | + |
| 118 | + return posts |
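+# The query above returns only the first page of results (hitsPerPage=100).
+# A minimal sketch of walking all pages via Algolia's `page` parameter and
+# `nbPages` response field, should a day exceed 100 stories; this helper is
+# hypothetical and not used by the commands below.
+async def fetch_paged_hits(session: aiohttp.ClientSession, params: dict) -> list:
+    hits: list = []
+    page = 0
+    while True:
+        paged = {**params, "page": page}
+        async with session.get("https://hn.algolia.com/api/v1/search_by_date", params=paged) as response:
+            if response.status != 200:
+                break
+            data = await response.json()
+        hits.extend(data.get("hits", []))
+        if page + 1 >= data.get("nbPages", 1):
+            break
+        page += 1
+    return hits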
+
+
+@click.command()
+@click.argument('limit', type=int)
+def crawl_hn(limit: int):
+    """Crawl top stories from Hacker News."""
+    logger.info("Starting HN crawl", limit=limit)
+
+    # Run the async function
+    posts = asyncio.run(fetch_top_stories(limit))
+    logger.info("Fetched posts from HN", count=len(posts))
+
+    # Save to database
+    if posts:
+        supabase_client.table("Post").upsert(posts, on_conflict="source,url").execute()
+        logger.info("Saved posts to database", count=len(posts))
+
+
+@click.command()
+@click.option('--date_start', type=str, required=True, help="Start date (YYYY-MM-DD), inclusive.")
+@click.option('--date_end', type=str, required=True, help="End date (YYYY-MM-DD), exclusive.")
+def crawl_hn_by_date(date_start: str, date_end: str):
+    """Crawl stories from Hacker News for a range of dates."""
+    logger.info("Starting HN crawl", date_start=date_start, date_end=date_end)
+
+    start = datetime.datetime.strptime(date_start, "%Y-%m-%d")
+    end = datetime.datetime.strptime(date_end, "%Y-%m-%d")
+
+    def date_range(start: datetime.datetime, end: datetime.datetime):
+        """Yield YYYY-MM-DD strings from start (inclusive) to end (exclusive)."""
+        for n in range((end - start).days):
+            yield (start + datetime.timedelta(n)).strftime("%Y-%m-%d")
+
+    for date in tqdm.tqdm(date_range(start, end)):
+        # Throttle requests to stay well under Algolia's rate limits
+        time.sleep(5)
+        posts = asyncio.run(fetch_stories_by_date(date))
+        logger.info("Fetched posts from HN", date=date, count=len(posts))
+
+        # Save to database
+        if posts:
+            supabase_client.table("Post").upsert(posts, on_conflict="source,url").execute()
+            logger.info("Saved posts to database", count=len(posts))
+
+
+if __name__ == "__main__":
+    crawl_hn_by_date()
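+
+# Usage (assuming this module is saved as crawl_hn.py):
+#     python crawl_hn.py --date_start 2024-01-01 --date_end 2024-01-07
+# crawl_hn is a separate click command; it would need its own entry point
+# (or a click.Group) to be invoked from the command line.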