Commit 959ea8f ("backup")

1 parent 6738cfd · commit 959ea8f

31 files changed: +2832 −99 lines

backend/.python-version

Lines changed: 1 addition & 0 deletions
3.13

backend/README.md

Whitespace-only changes.

backend/app.py

Lines changed: 25 additions & 0 deletions
import time
import os
import fastapi
from supabase import create_client
from fastapi.responses import HTMLResponse

app = fastapi.FastAPI()
supabase = create_client(
    os.environ["SUPABASE_URL"], os.environ["SUPABASE_SERVICE_ROLE_KEY"]
)


@app.get("/save")
async def save_post(url: str):
    supabase.table("Post").upsert(
        {
            "url": url,
            "source": "pocket",
            "time_added": int(time.time()),
            "tags": ["pocket:unread"],
            "attrs": {},
            "links": {},
        },
        on_conflict="source,url",
    ).execute()

    return HTMLResponse(content="Saved", status_code=200)
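
As a side note, here is a minimal sketch of how this /save endpoint could be exercised locally, assuming the app is served with uvicorn (a declared dependency) and the two Supabase environment variables are set; the port and test URL below are illustrative, not part of this commit:

# Illustrative only: assumes `uvicorn app:app --port 8000` is running from backend/
# with SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY exported in that environment.
import requests

resp = requests.get(
    "http://localhost:8000/save",
    params={"url": "https://example.com/some-article"},  # placeholder URL
)
print(resp.status_code, resp.text)  # expected: 200 Saved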

backend/clients.py

Lines changed: 29 additions & 0 deletions
import os
import re
from typing import Dict, Any
from supabase import create_client
import typesense

supabase_client = create_client(
    os.environ["SUPABASE_URL"], os.environ["SUPABASE_SERVICE_ROLE_KEY"]
)


def parse_typesense_addr(addr: str) -> Dict[str, Any]:
    match = re.match(r"([a-z]+)://([^:]+):([0-9]+)(.*)", addr)
    if not match:
        raise Exception(f"invalid typesense addr {addr}")
    return {
        'protocol': match.group(1),
        'host': match.group(2),
        'port': int(match.group(3)),
        'path': match.group(4)
    }

# Client initialization functions
addr = parse_typesense_addr(os.getenv('TYPESENSE_ADDR'))
typesense_client = typesense.Client({
    'api_key': os.getenv('TYPESENSE_API_KEY'),
    'nodes': [addr],
    'cache_search_results_for_seconds': 0
})
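
For clarity, a small example of the node dict that parse_typesense_addr yields; the address is hypothetical and only illustrates the shape fed into the typesense.Client nodes list:

# Hypothetical address, shown only to illustrate the parsed structure.
# (Importing clients requires the Supabase and Typesense env vars to be set.)
node = parse_typesense_addr("http://localhost:8108")
# -> {'protocol': 'http', 'host': 'localhost', 'port': 8108, 'path': ''}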

backend/data_types.py

Lines changed: 14 additions & 0 deletions
from typing import TypedDict, Optional, List, Dict, Any


class Post(TypedDict):
    url: str
    time_added: int
    time_added_as_date: Optional[str]
    source: str
    tags: List[str]
    attrs: Dict[str, Any]
    links: Dict[str, str]
    title: Optional[str]
    abstract: Optional[str]
    content: Optional[str]
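
For reference, a sketch of a record that satisfies this TypedDict; the values are placeholders, not data from the commit:

from data_types import Post

example_post: Post = {
    "url": "https://example.com/article",  # placeholder URL
    "time_added": 1700000000,              # Unix timestamp, placeholder
    "time_added_as_date": None,
    "source": "pocket",
    "tags": ["pocket:unread"],
    "attrs": {},
    "links": {},
    "title": None,
    "abstract": None,
    "content": None,
}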

backend/pyproject.toml

Lines changed: 67 additions & 0 deletions
[project]
name = "backend"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "structlog",
    "tqdm",
    "requests",
    "supabase",
    "html2text",
    "ruff",
    "uvicorn>=0.34.2",
    "fastapi>=0.115.12",
    "typesense>=1.1.1",
    "trafilatura>=2.0.0",
    "bs4>=0.0.2",
    "click>=8.2.1",
    "feedparser>=6.0.11",
    "opml>=0.5",
]

[tool.ruff]
# Same as Black.
line-length = 88

# Assume Python 3.13
target-version = "py313"

# Exclude a variety of commonly ignored directories.
exclude = [
    ".bzr",
    ".direnv",
    ".eggs",
    ".git",
    ".git-rewrite",
    ".hg",
    ".mypy_cache",
    ".nox",
    ".pants.d",
    ".pytype",
    ".ruff_cache",
    ".svn",
    ".tox",
    ".venv",
    "__pypackages__",
    "_build",
    "buck-out",
    "build",
    "dist",
    "node_modules",
    "venv",
]

[tool.ruff.lint]
# Enable pycodestyle ('E'), Pyflakes ('F'), and isort ('I') rules
select = ["E", "F", "I"]
ignore = []

# Allow autofix for all enabled rules (when `--fix` is provided).
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
unfixable = []

# Allow unused variables when underscore-prefixed.
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
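
To illustrate the dummy-variable-rgx setting above: with the Pyflakes ('F') rules enabled, a plain unused local is reported as F841, while an underscore-prefixed name matching the regex is allowed. A minimal, hypothetical example:

def head(items):
    _rest = items[1:]  # underscore prefix matches dummy-variable-rgx, so not flagged
    rest = items[1:]   # would be reported as F841 (local variable assigned but never used)
    return items[0]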

backend/scripts/crawl_hn.py

Lines changed: 162 additions & 0 deletions
import asyncio
import datetime
import os
from typing import List, Optional
import aiohttp
import click
import structlog
from html2text import HTML2Text

from data_types import Post
from clients import supabase_client
import tqdm

# Set up logging
logger = structlog.get_logger()

# Initialize HTML to text converter
h = HTML2Text()
h.ignore_links = True


async def fetch_item(session: aiohttp.ClientSession, item_id: int) -> Optional[Post]:
    """Fetch a single HN item and convert it to a Post."""
    async with session.get(f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json") as response:
        if response.status != 200:
            logger.error("Failed to fetch item %d: %s", item_id, await response.text())
            return None

        item = await response.json()
        if not item or item.get("dead") or item.get("deleted"):
            return None

        return {
            "url": item.get("url", f"https://news.ycombinator.com/item?id={item_id}"),
            "time_added": item["time"],
            "source": "hackernews",
            "tags": ["hackernews:top"],
            "title": item["title"],
            "abstract": h.handle(item.get("text", "")),
            "attrs": {
                "hn_id": item_id,
                "score": item.get("score", 0),
                "by": item.get("by", ""),
                "descendants": item.get("descendants", 0),
            },
            "links": {},
        }


async def fetch_top_stories(limit: int) -> List[Post]:
    """Fetch top stories from HN API."""
    async with aiohttp.ClientSession() as session:
        # First get the list of top story IDs
        async with session.get("https://hacker-news.firebaseio.com/v0/topstories.json") as response:
            if response.status != 200:
                logger.error("Failed to fetch top stories: %s", await response.text())
                return []

            story_ids = await response.json()
            story_ids = story_ids[:limit]

        # Fetch each story concurrently
        tasks = [fetch_item(session, story_id) for story_id in story_ids]
        posts = await asyncio.gather(*tasks)

        # Filter out None values
        return [post for post in posts if post is not None]


async def fetch_stories_by_date(date_str: str) -> List[Post]:
    """Fetch stories from HN Algolia API for a specific date.

    Args:
        date_str: Date in YYYY-MM-DD format
    """
    # Convert date string to timestamp
    from datetime import datetime
    date = datetime.strptime(date_str, "%Y-%m-%d")
    start_timestamp = int(date.timestamp())
    end_timestamp = start_timestamp + 86400  # Add 24 hours in seconds

    # Construct Algolia API URL
    url = "https://hn.algolia.com/api/v1/search_by_date"
    params = {
        "tags": "story",
        "numericFilters": f"created_at_i>={start_timestamp},created_at_i<{end_timestamp}",
        "hitsPerPage": 100
    }

    async with aiohttp.ClientSession() as session:
        async with session.get(url, params=params) as response:
            if response.status != 200:
                logger.error("Failed to fetch stories: %s", await response.text())
                return []

            data = await response.json()
            hits = data.get("hits", [])

    posts = []
    for hit in hits:
        if hit.get("dead") or hit.get("deleted"):
            continue

        posts.append({
            "url": hit.get("url", f"https://news.ycombinator.com/item?id={hit['objectID']}"),
            "time_added": hit["created_at_i"],
            "source": "hackernews",
            "tags": ["hackernews:top"],
            "title": hit["title"],
            "abstract": h.handle(hit.get("story_text", "")),
            "attrs": {
                "hn_id": int(hit["objectID"]),
                "score": hit.get("points", 0),
                "by": hit.get("author", ""),
                "descendants": hit.get("num_comments", 0),
            },
            "links": {},
        })

    return posts


@click.command()
@click.argument('limit', type=int)
def crawl_hn(limit: int):
    """Crawl top stories from Hacker News."""
    logger.info("Starting HN crawl with limit %d", limit)

    # Run the async function
    posts = asyncio.run(fetch_top_stories(limit))
    logger.info("Fetched %d posts from HN", len(posts))

    # Save to database
    if posts:
        supabase_client.table("Post").upsert(posts, on_conflict="source,url").execute()
        logger.info("Successfully saved %d posts to database", len(posts))


@click.command()
@click.option('--date_start', type=str)
@click.option('--date_end', type=str)
def crawl_hn_by_date(date_start: str, date_end: str):
    """Crawl stories from Hacker News for a specific date."""
    logger.info("Starting HN crawl for date %s to %s", date_start, date_end)

    date_start = datetime.datetime.strptime(date_start, "%Y-%m-%d")
    date_end = datetime.datetime.strptime(date_end, "%Y-%m-%d")

    def date_range(start: datetime.datetime, end: datetime.datetime):
        for n in range(int((end - start).days)):
            yield (start + datetime.timedelta(n)).strftime("%Y-%m-%d")

    for date in tqdm.tqdm(date_range(date_start, date_end)):
        # Run the async function
        import time
        time.sleep(5)
        posts = asyncio.run(fetch_stories_by_date(date))
        logger.info("Fetched %d posts from HN", len(posts))

        # Save to database
        if posts:
            supabase_client.table("Post").upsert(posts, on_conflict="source,url").execute()
            logger.info("Successfully saved %d posts to database", len(posts))


if __name__ == "__main__":
    crawl_hn_by_date()
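
For context, a sketch of how this crawler might be invoked; the dates are examples, and the import path assumes the script is run from backend/ with the Supabase environment variables set:

# Illustrative invocation of the date-range command (matches the __main__ entry point):
#   python scripts/crawl_hn.py --date_start 2025-01-01 --date_end 2025-01-07
#
# Or call the Algolia fetcher directly for a single day, bypassing the CLI wrapper.
import asyncio
from crawl_hn import fetch_stories_by_date  # assumes backend/scripts is on sys.path

posts = asyncio.run(fetch_stories_by_date("2025-01-01"))  # example date
print(f"fetched {len(posts)} stories")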

backend/scripts/crawl_reddit.py

Whitespace-only changes.

backend/scripts/index_posts.py

Lines changed: 51 additions & 0 deletions
import asyncio
import os
import re
import json
import logging
from typing import List, Dict, Optional, Any, Tuple, TypedDict
import requests
from bs4 import BeautifulSoup
# from readability import Document
import trafilatura
import typesense
from clients import supabase_client, typesense_client
from data_types import Post
import tqdm
import click

async def index_post(post: Post) -> None:
    collection = typesense_client.collections[os.getenv('TYPESENSE_INDEX_NAME')]
    post['id'] = str(post['id'])
    post['links'] = json.dumps(post['links'])
    post['content'] = post['content'] or ''
    collection.documents.upsert(post)
    print(f"Indexed post {post['id']}")

async def run_indexing(limit, page_size=100, concurrency=10):
    semaphore = asyncio.Semaphore(concurrency)
    async def index_post_with_semaphore(post: Post) -> None:
        async with semaphore:
            await index_post(post)

    for begin in tqdm.trange(0, limit, page_size):
        statement = supabase_client.table("Post").select("*")
        statement = statement.order("time_added", desc=True)
        posts = statement.range(begin, begin + page_size).execute().data
        if not posts:
            break
        await asyncio.gather(*[index_post_with_semaphore(post) for post in posts])

@click.command()
@click.option("--limit", default=10)
@click.option("--page-size", default=100)
@click.option("--concurrency", default=10)
def main(limit, page_size, concurrency):
    asyncio.run(run_indexing(limit, page_size, concurrency))

if __name__ == "__main__":
    main()

# async def search_post(query: Dict[str, Any]) -> Post:
#     collection = get_typesense_client().collections[os.getenv('TYPESENSE_INDEX_NAME')]
#     return await collection.documents.search(query)
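
Relatedly, the commented-out search_post hints at querying the same collection; a minimal sketch of a synchronous search against typesense_client is shown below, where the query text and the query_by fields are assumptions about the index schema rather than anything defined in this commit:

# Illustrative only: `q` and `query_by` values are assumed, not defined in this commit.
import os
from clients import typesense_client

results = typesense_client.collections[os.getenv('TYPESENSE_INDEX_NAME')].documents.search({
    'q': 'fastapi',               # example query text
    'query_by': 'title,content',  # assumed searchable fields
})
print(results.get('found', 0), "hits")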
