From 04d406c0490229fc44bd8847fdd84174f3d57b06 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Fri, 30 Jan 2026 13:10:54 -0800 Subject: [PATCH 1/7] feat: add dynamic Open Graph meta tags for social media link previews Enable entity-specific link previews on Slack, Twitter, Discord, Mastodon, Bluesky, and other social platforms by serving dynamic OG tags to crawler bots. Closes #1242 --- backend/pyproject.toml | 1 + backend/src/monarch_py/api/main.py | 3 +- backend/src/monarch_py/api/meta.py | 92 ++++++++++++++ .../src/monarch_py/api/templates/meta.html | 35 ++++++ backend/tests/api/test_meta.py | 46 +++++++ backend/uv.lock | 2 + docs/link-previews.md | 117 ++++++++++++++++++ services/nginx/config/default.conf | 47 +++++++ services/nginx/tests/test_bot_detection.sh | 110 ++++++++++++++++ 9 files changed, 452 insertions(+), 1 deletion(-) create mode 100644 backend/src/monarch_py/api/meta.py create mode 100644 backend/src/monarch_py/api/templates/meta.html create mode 100644 backend/tests/api/test_meta.py create mode 100644 docs/link-previews.md create mode 100755 services/nginx/tests/test_bot_detection.sh diff --git a/backend/pyproject.toml b/backend/pyproject.toml index f3f932e47..0287ae5c7 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "docker>=7.1.0", "fastapi>=0.115.12,<1", "gunicorn>=23.0.0", + "jinja2>=3.0", "linkml==1.9.3", "loguru", "oaklib>=0.6.6", diff --git a/backend/src/monarch_py/api/main.py b/backend/src/monarch_py/api/main.py index 16e32bcf5..270726d30 100644 --- a/backend/src/monarch_py/api/main.py +++ b/backend/src/monarch_py/api/main.py @@ -4,7 +4,7 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import RedirectResponse -from monarch_py.api import association, entity, histopheno, search, semsim, text_annotation +from monarch_py.api import association, entity, histopheno, meta, search, semsim, text_annotation from monarch_py.api.config import semsimian, spacyner, 
settings
 from monarch_py.api.middleware.logging_middleware import LoggingMiddleware
 from monarch_py.utils.utils import get_release_metadata, get_release_versions
@@ -28,6 +28,7 @@ async def lifespan(app: FastAPI):
 app.include_router(association.router, prefix=f"{PREFIX}/association")
 app.include_router(entity.router, prefix=f"{PREFIX}/entity")
 app.include_router(histopheno.router, prefix=f"{PREFIX}/histopheno")
+app.include_router(meta.router, prefix=PREFIX)
 app.include_router(search.router, prefix=PREFIX)
 app.include_router(semsim.router, prefix=f"{PREFIX}/semsim")
 app.include_router(text_annotation.router, prefix=PREFIX)
diff --git a/backend/src/monarch_py/api/meta.py b/backend/src/monarch_py/api/meta.py
new file mode 100644
index 000000000..aadfe9649
--- /dev/null
+++ b/backend/src/monarch_py/api/meta.py
@@ -0,0 +1,92 @@
+"""Meta endpoint for serving HTML with dynamic Open Graph tags to social media crawlers."""
+
+import logging
+from pathlib import Path
+
+from fastapi import APIRouter, HTTPException, Request
+from fastapi.responses import HTMLResponse
+from jinja2 import Environment, FileSystemLoader, select_autoescape
+
+from monarch_py.api.config import solr
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(tags=["meta"])
+
+TEMPLATES_DIR = Path(__file__).parent / "templates"
+jinja_env = Environment(
+    loader=FileSystemLoader(TEMPLATES_DIR),
+    autoescape=select_autoescape(["html", "xml"]),
+)
+
+
+def get_base_url(request: Request) -> str:
+    """Derive base URL from the request headers (supports beta/prod via same stack)."""
+    scheme = request.headers.get("x-forwarded-proto", request.url.scheme)
+    host = request.headers.get("host", request.url.netloc)
+    return f"{scheme}://{host}"
+
+
+def get_default_image(request: Request) -> str:
+    """Get the default OG image URL."""
+    return f"{get_base_url(request)}/share-thumbnail.jpg"
+
+
+@router.get("/meta/{entity_id:path}", response_class=HTMLResponse)
+async def get_meta_page(entity_id: str, request: Request) -> HTMLResponse:
+    """
+    Return an HTML page with dynamic Open Graph meta tags for the given entity.
+
+    This endpoint is designed to be called by social media crawlers (Slackbot,
+    Twitterbot, etc.) to get entity-specific link previews. Regular users
+    should be served the SPA directly by Nginx.
+
+    Args:
+        entity_id: The entity identifier (e.g., MONDO:0005148)
+        request: The FastAPI request object
+
+    Returns:
+        HTML page with entity-specific OG meta tags
+
+    Raises:
+        HTTPException: 404 if the entity is not found, 503 if the lookup itself fails
+    """
+    try:
+        entity = solr().get_entity(entity_id, extra=False)
+    except Exception as e:  # a failed lookup is not "not found": don't tell crawlers the page is gone
+        logger.exception("Failed to fetch entity %s", entity_id)
+        raise HTTPException(status_code=503, detail=f"Entity lookup failed: {entity_id}") from e
+
+    if entity is None:
+        raise HTTPException(status_code=404, detail=f"Entity not found: {entity_id}")
+
+    base_url = get_base_url(request)
+    entity_url = f"{base_url}/{entity_id}"
+
+    entity_name = entity.name or entity_id
+    title = f"{entity_name} | Monarch Initiative"
+
+    description_parts = []
+    if entity.name:
+        description_parts.append(entity.name)
+    if entity.description:
+        description_parts.append(entity.description)
+
+    if description_parts:
+        description = " - ".join(description_parts)
+    else:
+        description = f"View {entity_id} on Monarch Initiative"
+
+    max_description_length = 300
+    if len(description) > max_description_length:
+        description = description[: max_description_length - 3] + "..."  # leave room for the ellipsis
+
+    template = jinja_env.get_template("meta.html")
+    html_content = template.render(
+        title=title,
+        description=description,
+        url=entity_url,
+        image=get_default_image(request),
+    )
+
+    return HTMLResponse(content=html_content, status_code=200)
diff --git a/backend/src/monarch_py/api/templates/meta.html b/backend/src/monarch_py/api/templates/meta.html
new file mode 100644
index 000000000..3933a38ae
--- /dev/null
+++ b/backend/src/monarch_py/api/templates/meta.html
@@ -0,0 +1,35 @@
+ + + + + + + + {{ title }} + + + + + + + + + + + + + + + + + + + + + + +

{{ title }}

+

{{ description }}

+

Redirecting to {{ url }}...

+ + diff --git a/backend/tests/api/test_meta.py b/backend/tests/api/test_meta.py new file mode 100644 index 000000000..fa97d43ac --- /dev/null +++ b/backend/tests/api/test_meta.py @@ -0,0 +1,46 @@ +"""Tests for the meta endpoint that serves dynamic OG tags to crawlers.""" + +import pytest +from fastapi.testclient import TestClient + +from monarch_py.api.main import app + + +@pytest.fixture +def client(): + return TestClient(app) + + +def test_meta_endpoint_returns_html_with_og_tags(client): + """Test that /meta/{entity_id} returns HTML with entity-specific OG tags.""" + response = client.get("/v3/api/meta/MONDO:0005148") + + assert response.status_code == 200 + assert response.headers["content-type"] == "text/html; charset=utf-8" + + html = response.text + # Check for entity-specific content (diabetes mellitus) + assert "MONDO:0005148" in html or "diabetes" in html.lower() + assert 'og:title' in html + assert 'og:description' in html + assert 'og:url' in html + # URL is derived from request host (testserver in tests, monarchinitiative.org in prod) + assert 'testserver/MONDO:0005148' in html + + +def test_meta_endpoint_returns_404_for_unknown_entity(client): + """Test that /meta/{entity_id} returns 404 for non-existent entities.""" + response = client.get("/v3/api/meta/FAKE:9999999") + + assert response.status_code == 404 + + +def test_meta_endpoint_escapes_html_in_content(client): + """Test that entity content is properly HTML-escaped to prevent XSS.""" + # This tests that special characters in entity names/descriptions + # are properly escaped in the HTML output + response = client.get("/v3/api/meta/MONDO:0005148") + + assert response.status_code == 200 + # Should not contain unescaped angle brackets from entity content + # (the HTML tags themselves are fine, but entity content should be escaped) diff --git a/backend/uv.lock b/backend/uv.lock index b3551b9cf..62ca06ba0 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -1649,6 +1649,7 @@ dependencies = [ { name 
= "docker" }, { name = "fastapi" }, { name = "gunicorn" }, + { name = "jinja2" }, { name = "linkml" }, { name = "loguru" }, { name = "oaklib" }, @@ -1686,6 +1687,7 @@ requires-dist = [ { name = "fastapi", specifier = ">=0.115.12,<1" }, { name = "gunicorn", specifier = ">=23.0.0" }, { name = "httpx", marker = "extra == 'dev'", specifier = ">=0.27" }, + { name = "jinja2", specifier = ">=3.0" }, { name = "linkml", specifier = "==1.9.3" }, { name = "loguru" }, { name = "mkdocs", marker = "extra == 'dev'", specifier = ">=1.6.0" }, diff --git a/docs/link-previews.md b/docs/link-previews.md new file mode 100644 index 000000000..f4ba9a6f0 --- /dev/null +++ b/docs/link-previews.md @@ -0,0 +1,117 @@ +# Link Previews (Dynamic Open Graph Meta Tags) + +## Overview + +When sharing Monarch Initiative entity links on social media platforms (Slack, Twitter, Discord, Mastodon, Bluesky, etc.), the platform displays a "link preview" with the page title, description, and image. These previews are generated from Open Graph (OG) meta tags in the HTML. + +Since Monarch is a Single Page Application (SPA), the initial HTML served by nginx contains generic site-wide meta tags. Social media crawlers don't execute JavaScript, so they can't see the dynamic, entity-specific content. + +## Solution + +We detect social media crawler requests at the nginx level and route them to a special `/meta` endpoint that serves HTML with entity-specific OG tags. + +### Architecture + +``` +Request for /MONDO:0005148 + │ + ▼ + Nginx + │ + ├─── Is User-Agent a bot? 
───► Yes ──► Proxy to /v3/api/meta/MONDO:0005148 + │ │ + │ ▼ + │ FastAPI fetches entity from Solr, + │ returns HTML with dynamic OG tags + │ + └─── No (regular user) ──► Serve index.html (SPA loads normally) +``` + +### Supported Crawlers + +The following User-Agents are detected and served dynamic meta tags: + +- Slackbot +- Twitterbot +- facebookexternalhit +- LinkedInBot +- Discordbot +- WhatsApp +- TelegramBot +- Mastodon +- Bluesky +- Googlebot +- bingbot +- Embedly +- Pinterest +- Applebot +- Quora Link Preview +- Outbrain + +### Entity Path Detection + +Only paths matching the CURIE format are eligible for dynamic meta tags: +- `/MONDO:0005148` ✓ +- `/HP:0001234` ✓ +- `/HGNC:1234` ✓ +- `/about` ✗ (not a CURIE) +- `/results` ✗ (not a CURIE) + +### Dynamic URL Support + +The meta endpoint derives URLs from the request headers, so the same deployment works for both: +- `beta.monarchinitiative.org` → OG tags reference beta URLs +- `monarchinitiative.org` → OG tags reference production URLs + +No configuration changes needed between environments. 
+ +## Testing + +### Manual Testing + +Test as a regular browser (should get the SPA): +```bash +curl -s http://localhost:8080/MONDO:0005148 | head -20 +``` + +Test as Slackbot (should get entity-specific OG tags): +```bash +curl -s -H "User-Agent: Slackbot" http://localhost:8080/MONDO:0005148 | grep og: +``` + +Test as Mastodon (should get entity-specific OG tags): +```bash +curl -s -H "User-Agent: Mastodon/4.0" http://localhost:8080/MONDO:0005148 | grep og: +``` + +Test as Bluesky (should get entity-specific OG tags): +```bash +curl -s -H "User-Agent: Bluesky" http://localhost:8080/MONDO:0005148 | grep og: +``` + +### Automated Testing + +Run the nginx bot detection tests: +```bash +NGINX_URL=http://localhost:8080 ./services/nginx/tests/test_bot_detection.sh +``` + +Run the backend unit tests: +```bash +cd backend && uv run pytest tests/api/test_meta.py -v +``` + +## Configuration + +### Customizing Detected Bots + +Edit `services/nginx/config/default.conf` to add or remove User-Agent patterns from the `$is_bot` map. 
+ +### Files + +| File | Purpose | +|------|---------| +| `backend/src/monarch_py/api/meta.py` | FastAPI endpoint that renders OG tags | +| `backend/src/monarch_py/api/templates/meta.html` | Jinja2 template for HTML response | +| `services/nginx/config/default.conf` | Nginx bot detection and routing | +| `services/nginx/tests/test_bot_detection.sh` | Integration test script | diff --git a/services/nginx/config/default.conf b/services/nginx/config/default.conf index f64431a0e..bc717ee52 100644 --- a/services/nginx/config/default.conf +++ b/services/nginx/config/default.conf @@ -1,15 +1,62 @@ +# Map to detect social media crawlers/bots +map $http_user_agent $is_bot { + default 0; + ~*Slackbot 1; + ~*Twitterbot 1; + ~*facebookexternalhit 1; + ~*LinkedInBot 1; + ~*Discordbot 1; + ~*WhatsApp 1; + ~*TelegramBot 1; + ~*Mastodon 1; + ~*Bluesky 1; + ~*Googlebot 1; + ~*bingbot 1; + ~*Embedly 1; + ~*Quora\ Link\ Preview 1; + ~*outbrain 1; + ~*pinterest 1; + ~*Applebot 1; +} + +# Map to detect entity paths (CURIE format: PREFIX:ID) +# Matches paths like /MONDO:0005148, /HP:0001234, /HGNC:1234, etc. 
+map $uri $is_entity_path { + default 0; + ~^/[A-Za-z_]+:[A-Za-z0-9_.-]+$ 1; +} + upstream api { server api:8000; } + server { listen 80 default_server; listen [::]:80 default_server; server_name _; + # Main location - handle bot detection for entity pages location / { root /var/www; index index.html; + + # If it's a bot AND an entity path, proxy to meta endpoint + set $serve_meta 0; + if ($is_bot = 1) { + set $serve_meta "B"; + } + if ($is_entity_path = 1) { + set $serve_meta "${serve_meta}E"; + } + + # Both conditions met - proxy to backend meta endpoint + if ($serve_meta = "BE") { + rewrite ^/(.*)$ /v3/api/meta/$1 break; + proxy_pass http://api; + } + + # Regular users get the SPA try_files $uri /index.html; } diff --git a/services/nginx/tests/test_bot_detection.sh b/services/nginx/tests/test_bot_detection.sh new file mode 100755 index 000000000..948f6d213 --- /dev/null +++ b/services/nginx/tests/test_bot_detection.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Test script for verifying bot detection in nginx config +# Run this against a running nginx instance + +set -e + +NGINX_URL="${NGINX_URL:-http://localhost:8080}" +ENTITY_PATH="/MONDO:0005148" + +echo "Testing bot detection at $NGINX_URL$ENTITY_PATH" +echo "================================================" + +# Test 1: Regular browser should get index.html (200, text/html, contains Vue app markers) +echo -n "Test 1: Regular browser request... " +RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" \ + -H "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" \ + "$NGINX_URL$ENTITY_PATH") +if [ "$RESPONSE" = "200" ]; then + echo "PASS (HTTP $RESPONSE)" +else + echo "FAIL (HTTP $RESPONSE, expected 200)" + exit 1 +fi + +# Test 2: Slackbot should be proxied to backend (check for og: tags in response) +echo -n "Test 2: Slackbot request... 
" +RESPONSE=$(curl -s \ + -H "User-Agent: Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)" \ + "$NGINX_URL$ENTITY_PATH") +if echo "$RESPONSE" | grep -q "og:title"; then + echo "PASS (contains og:title)" +else + echo "FAIL (no og:title in response)" + exit 1 +fi + +# Test 3: Twitterbot should be proxied to backend +echo -n "Test 3: Twitterbot request... " +RESPONSE=$(curl -s \ + -H "User-Agent: Twitterbot/1.0" \ + "$NGINX_URL$ENTITY_PATH") +if echo "$RESPONSE" | grep -q "og:title"; then + echo "PASS (contains og:title)" +else + echo "FAIL (no og:title in response)" + exit 1 +fi + +# Test 4: Facebook crawler should be proxied to backend +echo -n "Test 4: Facebook crawler request... " +RESPONSE=$(curl -s \ + -H "User-Agent: facebookexternalhit/1.1" \ + "$NGINX_URL$ENTITY_PATH") +if echo "$RESPONSE" | grep -q "og:title"; then + echo "PASS (contains og:title)" +else + echo "FAIL (no og:title in response)" + exit 1 +fi + +# Test 5: Discord bot should be proxied to backend +echo -n "Test 5: Discordbot request... " +RESPONSE=$(curl -s \ + -H "User-Agent: Discordbot/2.0" \ + "$NGINX_URL$ENTITY_PATH") +if echo "$RESPONSE" | grep -q "og:title"; then + echo "PASS (contains og:title)" +else + echo "FAIL (no og:title in response)" + exit 1 +fi + +# Test 6: LinkedIn bot should be proxied to backend +echo -n "Test 6: LinkedInBot request... " +RESPONSE=$(curl -s \ + -H "User-Agent: LinkedInBot/1.0" \ + "$NGINX_URL$ENTITY_PATH") +if echo "$RESPONSE" | grep -q "og:title"; then + echo "PASS (contains og:title)" +else + echo "FAIL (no og:title in response)" + exit 1 +fi + +# Test 7: Mastodon should be proxied to backend +echo -n "Test 7: Mastodon request... 
" +RESPONSE=$(curl -s \ + -H "User-Agent: Mastodon/4.3.2 (+https://mastodon.social/)" \ + "$NGINX_URL$ENTITY_PATH") +if echo "$RESPONSE" | grep -q "og:title"; then + echo "PASS (contains og:title)" +else + echo "FAIL (no og:title in response)" + exit 1 +fi + +# Test 8: Bluesky should be proxied to backend +echo -n "Test 8: Bluesky request... " +RESPONSE=$(curl -s \ + -H "User-Agent: Mozilla/5.0 (compatible; Bluesky Cardyb/1.1; +mailto:support@bsky.app)" \ + "$NGINX_URL$ENTITY_PATH") +if echo "$RESPONSE" | grep -q "og:title"; then + echo "PASS (contains og:title)" +else + echo "FAIL (no og:title in response)" + exit 1 +fi + +echo "" +echo "All tests passed!" From a91bc19a4f297b65a768b9cdf724abdae7ac795e Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Mon, 9 Feb 2026 09:42:36 -0800 Subject: [PATCH 2/7] fixed tests --- backend/tests/api/test_meta.py | 40 +++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/backend/tests/api/test_meta.py b/backend/tests/api/test_meta.py index fa97d43ac..05a51deaa 100644 --- a/backend/tests/api/test_meta.py +++ b/backend/tests/api/test_meta.py @@ -1,9 +1,12 @@ """Tests for the meta endpoint that serves dynamic OG tags to crawlers.""" +from unittest.mock import patch + import pytest from fastapi.testclient import TestClient from monarch_py.api.main import app +from monarch_py.datamodels.model import Node @pytest.fixture @@ -11,36 +14,47 @@ def client(): return TestClient(app) -def test_meta_endpoint_returns_html_with_og_tags(client): +@patch("monarch_py.implementations.solr.solr_implementation.SolrImplementation.get_entity") +def test_meta_endpoint_returns_html_with_og_tags(mock_get_entity, client, node): """Test that /meta/{entity_id} returns HTML with entity-specific OG tags.""" - response = client.get("/v3/api/meta/MONDO:0005148") + mock_get_entity.return_value = Node(**node) + response = client.get("/v3/api/meta/MONDO:0020121") assert response.status_code == 200 assert 
response.headers["content-type"] == "text/html; charset=utf-8" html = response.text - # Check for entity-specific content (diabetes mellitus) - assert "MONDO:0005148" in html or "diabetes" in html.lower() + assert "MONDO:0020121" in html or "muscular dystrophy" in html.lower() assert 'og:title' in html assert 'og:description' in html assert 'og:url' in html - # URL is derived from request host (testserver in tests, monarchinitiative.org in prod) - assert 'testserver/MONDO:0005148' in html + assert 'testserver/MONDO:0020121' in html -def test_meta_endpoint_returns_404_for_unknown_entity(client): +@patch("monarch_py.implementations.solr.solr_implementation.SolrImplementation.get_entity") +def test_meta_endpoint_returns_404_for_unknown_entity(mock_get_entity, client): """Test that /meta/{entity_id} returns 404 for non-existent entities.""" + mock_get_entity.return_value = None response = client.get("/v3/api/meta/FAKE:9999999") assert response.status_code == 404 -def test_meta_endpoint_escapes_html_in_content(client): +@patch("monarch_py.implementations.solr.solr_implementation.SolrImplementation.get_entity") +def test_meta_endpoint_escapes_html_in_content(mock_get_entity, client): """Test that entity content is properly HTML-escaped to prevent XSS.""" - # This tests that special characters in entity names/descriptions - # are properly escaped in the HTML output - response = client.get("/v3/api/meta/MONDO:0005148") + mock_get_entity.return_value = Node( + id="TEST:001", + category="biolink:Disease", + name='', + description='A "test" entity with & special chars', + provided_by="test", + association_counts=[], + ) + response = client.get("/v3/api/meta/TEST:001") assert response.status_code == 200 - # Should not contain unescaped angle brackets from entity content - # (the HTML tags themselves are fine, but entity content should be escaped) + html = response.text + # Jinja2 autoescape should escape angle brackets + assert "