redd-archiver/search_server.py at main · 19-84/redd-archiver · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
#!/usr/bin/env python
# ABOUTME: Flask search server for PostgreSQL full-text search with Google-style operators
# ABOUTME: Secure, Alpine-ready, with rate limiting and HTML rendering via Jinja2

import os
import time
from datetime import datetime, timezone

from flask import Flask, jsonify, render_template, request
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
from flask_wtf.csrf import CSRFProtect
from markupsafe import escape

from core.postgres_search import PostgresSearch, SearchQuery
from utils.console_output import print_error, print_info, print_success
from utils.error_handling import format_user_error
from utils.input_validation import validator
from utils.search_operators import format_search_breadcrumb, parse_search_operators

# ============================================================================
# FLASK APPLICATION SETUP
# ============================================================================

# Module-level application object. The routes, error handlers and template
# filters defined further down attach to it through decorators.
app = Flask(__name__)

# Security configuration
# NOTE(review): when FLASK_SECRET_KEY is unset the fallback is a fresh
# os.urandom(24) value generated at import time, so every process that imports
# this module on its own ends up with a different key. Presumably fine for the
# single-process development server; with several independently started
# workers, signed data issued by one worker would not validate on another —
# confirm that real deployments always set FLASK_SECRET_KEY.
app.config["SECRET_KEY"] = os.environ.get("FLASK_SECRET_KEY", os.urandom(24))
app.config["MAX_CONTENT_LENGTH"] = 1024 * 1024  # 1MB max request size

# Enforce SECRET_KEY in production
# Raises at import time, so a process started with FLASK_ENV=production cannot
# come up using the random fallback key assigned above.
if os.environ.get("FLASK_ENV") == "production" and not os.environ.get("FLASK_SECRET_KEY"):
    raise ValueError(
        "FLASK_SECRET_KEY must be set in production! "
        "Generate with: python3 -c 'import secrets; print(secrets.token_urlsafe(32))'"
    )

# Site configuration
# Both values are passed to every rendered template as `site_name` and
# `url_project`; each can be overridden through the environment.
SITE_NAME = os.environ.get("REDDARCHIVER_SITE_NAME", "Redd Archive")
PROJECT_URL = os.environ.get("REDDARCHIVER_PROJECT_URL", "https://github.com/19-84/redd-archiver")

# CSRF protection configuration
# Note: Search uses GET method (inherently safe from CSRF), but we add
# CSRF protection for consistency and future POST endpoints
app.config["WTF_CSRF_ENABLED"] = True
app.config["WTF_CSRF_CHECK_DEFAULT"] = True
app.config["WTF_CSRF_METHODS"] = ["POST", "PUT", "PATCH", "DELETE"]  # Not GET
app.config["WTF_CSRF_TIME_LIMIT"] = None  # No expiry for tokens
csrf = CSRFProtect(app)

# Rate limiting configuration
# 30 requests per minute per IP (generous for search)
# NOTE(review): storage_uri="memory://" keeps the counters in this process's
# memory, so with more than one worker process the limit is presumably applied
# per worker rather than globally — confirm this is acceptable.
limiter = Limiter(get_remote_address, app=app, default_limits=["30 per minute"], storage_uri="memory://")

# ============================================================================
# API REGISTRATION
# ============================================================================

# Register REST API v1 blueprint at /api/v1
# Imported here, after the application objects exist, rather than with the
# other imports at the top of the file.
# NOTE(review): presumably deferred because the api package needs `app` (or
# other module-level objects) to be defined first — confirm before moving it.
from api import api_v1, register_api

# Hand the application to the API package so it can attach its routes.
register_api(app)

# Exempt API from CSRF protection (API uses CORS and rate limiting instead)
csrf.exempt(api_v1)

# ============================================================================
# GLOBAL SEARCH ENGINE (reused across all requests)
# ============================================================================

# Initialize search engine once at startup (not per-request)
# This reuses the database connection pool and avoids schema recreation
_search_engine = None


def get_search_engine():
    """Return the shared ``PostgresSearch`` instance, building it on first use.

    The instance is cached in the module-level ``_search_engine`` variable so
    that later calls in the same process reuse it instead of constructing a
    new one. Progress messages are printed only when the instance is created.
    """
    global _search_engine

    # Fast path: already built by an earlier call in this process.
    if _search_engine is not None:
        return _search_engine

    print_info("Initializing global search engine...")
    _search_engine = PostgresSearch()
    print_success("Search engine ready for requests")
    return _search_engine


# ============================================================================
# HELPER FUNCTIONS
# ============================================================================


def sanitize_query(query: str) -> str:
    """
    Normalize a raw search query before it is used.

    Args:
        query: Raw query string (may be empty or None)

    Returns:
        The query with surrounding whitespace removed and cut to at most
        500 characters; an empty string when no query was given
    """
    # Upper bound on query length to prevent abuse
    max_query_length = 500

    trimmed = query.strip() if query else ""

    # Slicing is a no-op for strings already within the limit
    return trimmed[:max_query_length]


def format_date(timestamp: int) -> str:
    """
    Format Unix timestamp to human-readable date.

    The timestamp is interpreted as seconds since the epoch in UTC.

    Args:
        timestamp: Unix timestamp

    Returns:
        Formatted date string (e.g., "2024-01-15"), or "Unknown date" when the
        timestamp is missing, not numeric, or outside the range the platform
        can represent
    """
    try:
        # Timezone-aware replacement for the deprecated utcfromtimestamp();
        # the formatted UTC date is the same.
        moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
        return moment.strftime("%Y-%m-%d")
    except (TypeError, ValueError, OverflowError, OSError):
        # TypeError: None / non-numeric value; OverflowError: integer too
        # large for the platform time type; ValueError/OSError: out-of-range
        # year or failing C time conversion.
        return "Unknown date"


def format_relative_date(timestamp: int) -> str:
    """
    Format Unix timestamp to relative date (e.g., "2 days ago").

    The age is measured against the current UTC time. A year is counted as
    365 days and a month as 30 days. Timestamps less than a minute old, and
    timestamps in the future (e.g. caused by clock skew), are reported as
    "just now".

    Args:
        timestamp: Unix timestamp

    Returns:
        Relative date string, or "Unknown" when the timestamp is missing,
        not numeric, or outside the range the platform can represent
    """
    try:
        then = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        return "Unknown"

    # Work on total elapsed seconds. timedelta.days/.seconds are misleading
    # for negative deltas (days=-1 with a large positive seconds part), which
    # would make a timestamp slightly in the future look like "23 hours ago".
    elapsed = int((datetime.now(tz=timezone.utc) - then).total_seconds())

    # Largest unit first; the first unit that fits at least once wins.
    units = (
        (365 * 86400, "year"),
        (30 * 86400, "month"),
        (86400, "day"),
        (3600, "hour"),
        (60, "minute"),
    )
    for unit_seconds, unit_name in units:
        if elapsed >= unit_seconds:
            count = elapsed // unit_seconds
            return f"{count} {unit_name}{'s' if count > 1 else ''} ago"

    return "just now"


def get_score_badge_class(score: int) -> str:
    """
    Map a score to the CSS badge class used to display it.

    Args:
        score: Post/comment score

    Returns:
        CSS class name for badge
    """
    # Negative and exactly-zero scores have dedicated classes.
    if score < 0:
        return "badge-danger"
    if score == 0:
        return "badge-warning-orange"

    # Positive scores: first exclusive upper bound that the score is below.
    tiers = (
        (10, "badge-warning"),
        (100, "badge-light"),
        (1000, "badge-success"),
    )
    for upper_bound, css_class in tiers:
        if score < upper_bound:
            return css_class

    return "badge-success-bright"


# ============================================================================
# ROUTES
# ============================================================================


@app.route("/")
def index():
    """Render the search form as the landing page."""
    template_context = {"site_name": SITE_NAME, "url_project": PROJECT_URL}
    return render_template("pages/search_form.html", **template_context)


def _comment_permalink(permalink, comment_id):
    """Rewrite a stored comment permalink into a link to the comment's anchor on its post page.

    Comments don't have separate HTML files, they're anchored in post pages as
    ``#comment-<id>``. Empty permalinks and permalinks without a known
    platform prefix (``/r/``, ``/v/``, ``/g/``) are returned unchanged.
    """
    if not permalink:
        return permalink

    # Anchor uses the prefixed ID from the database.
    anchor = f"#comment-{comment_id}"

    if permalink.startswith("/r/"):
        # Reddit: /r/sub/comments/post_id/slug/comment_id/
        parts = permalink.split("/")
        if len(parts) >= 7 and parts[-2]:
            # Remove comment_id directory and trailing slash
            return "/".join(parts[:-2]) + "/" + anchor
        return permalink

    if permalink.startswith("/v/"):
        # Voat: /v/subverse/comments/post_id#raw_comment_id
        # Replace the raw anchor (if any) with the prefixed format.
        return permalink.partition("#")[0] + anchor

    if permalink.startswith("/g/"):
        # Ruqqus: /g/guild/post/post_id/slug/raw_comment_id
        parts = permalink.split("/")
        if len(parts) >= 6 and parts[-1]:
            # Last part is raw comment ID, remove it and add as anchor with prefix
            return "/".join(parts[:-1]) + anchor
        return permalink + anchor

    return permalink


def _format_search_result(result):
    """Convert one search result object into the dict rendered by the results template."""
    formatted = {
        "type": result.result_type,
        "id": result.id,
        "subreddit": result.subreddit,
        "platform": result.platform,
        "author": result.author,
        "score": result.score,
        "score_badge_class": get_score_badge_class(result.score),
        "date": format_date(result.created_utc),
        "relative_date": format_relative_date(result.created_utc),
        "rank": result.rank,
    }

    if result.result_type == "post":
        selftext = result.selftext
        formatted.update(
            {
                "title": result.title,
                "num_comments": result.num_comments,
                # Use permalink from database (preserves original case)
                "permalink": result.permalink,
                "url": result.url,
                # Always a real bool (previously leaked "" / None to the template)
                "is_self": bool(selftext and selftext.strip()),
                "excerpt": result.headline or (selftext[:200] if selftext else ""),
            }
        )
    else:  # comment
        body = result.body
        formatted.update(
            {
                "title": result.post_title or "Comment",
                "body": body,
                "permalink": _comment_permalink(result.permalink, result.id),
                "post_id": result.post_id,
                "excerpt": result.headline or (body[:200] if body else ""),
            }
        )

    return formatted


@app.route("/search")
@limiter.limit("30 per minute")
def search():
    """
    Main search endpoint.

    Query parameters:
        q: Search query (with optional operators)
        page: Page number (default: 1)
        limit: Results per page (default: 25, max: 100)

    Returns:
        Rendered HTML with search results. Non-integer pagination parameters
        or failed validation re-render the search form with status 400; an
        exception raised while searching renders the error page with status 500.
    """
    # Get raw parameters
    raw_query = request.args.get("q", "").strip()
    raw_page = request.args.get("page", "1")
    raw_limit = request.args.get("limit", "25")

    # Convert pagination parameters to integers
    try:
        page = int(raw_page)
        limit = int(raw_limit)
    except (ValueError, TypeError):
        return render_template(
            "pages/search_form.html",
            error="Invalid pagination parameters",
            site_name=SITE_NAME,
            url_project=PROJECT_URL,
        ), 400

    # Parse search operators first (extracts filters from query text)
    parsed_query = parse_search_operators(raw_query)

    # Validate all parameters comprehensively
    # Use parsed query text (may be empty after operator extraction) or empty string
    # Don't fall back to raw_query as it includes operators we already extracted
    validation_result = validator.validate_all(
        query=parsed_query.query_text or "",
        subreddit=parsed_query.subreddit,
        author=parsed_query.author,
        min_score=parsed_query.min_score,
        limit=limit,
        page=page,
        result_type=parsed_query.result_type,
        sort_by=parsed_query.sort_by,
    )

    # Check validation result
    if not validation_result.is_valid:
        error_messages = validation_result.get_error_messages()
        return render_template(
            "pages/search_form.html", error="; ".join(error_messages), site_name=SITE_NAME, url_project=PROJECT_URL
        ), 400

    # Use sanitized values
    sanitized = validation_result.sanitized_values
    query_text = sanitized["query"]
    # Page size the search actually runs with; also used for pagination below
    # so the page count always matches the executed query.
    page_size = sanitized["limit"]

    # Check if query is empty after operator extraction
    if not query_text or not query_text.strip():
        # User only provided operators (e.g., "sub:example" with no search terms)
        # Use wildcard search to show all results matching filters
        query_text = "*"

    # Build search query with validated/sanitized parameters
    search_query = SearchQuery(
        query_text=query_text,
        subreddit=sanitized["subreddit"],
        author=sanitized["author"],
        min_score=sanitized["min_score"],
        result_type=sanitized["result_type"],
        limit=page_size,
        offset=sanitized["offset"],
        order_by=sanitized["sort_by"],
    )

    # Debug-level logging instead of unconditional prints, so user-supplied
    # query text is only written out when debug logging is enabled.
    app.logger.debug(
        "Executing search: query_text=%r, subreddit=%s, limit=%s",
        search_query.query_text,
        search_query.subreddit,
        search_query.limit,
    )

    # Execute search (perf_counter is monotonic, so the duration cannot go negative)
    start_time = time.perf_counter()
    try:
        search_engine = get_search_engine()
        results, total_count = search_engine.search(search_query)
        # Don't cleanup - reuse connection pool
    except Exception as e:
        # Use safe error handler - prevents information disclosure
        safe_error = format_user_error(e, "search")
        return render_template(
            "pages/search_error.html", error=safe_error, query=raw_query, site_name=SITE_NAME, url_project=PROJECT_URL
        ), 500
    search_time = time.perf_counter() - start_time

    app.logger.debug("Search returned: %s results, %s total", len(results), total_count)

    # Format results for display
    formatted_results = [_format_search_result(result) for result in results]

    # Calculate pagination (ceiling division; guarded so a zero page size cannot raise)
    total_pages = (total_count + page_size - 1) // page_size if page_size > 0 else 0
    has_prev = page > 1
    has_next = page < total_pages

    # Generate breadcrumb
    breadcrumb = format_search_breadcrumb(parsed_query)

    # Render results
    return render_template(
        "pages/search_results.html",
        query=raw_query,
        parsed_query=parsed_query,
        breadcrumb=breadcrumb,
        results=formatted_results,
        total_count=total_count,
        search_time=search_time,
        page=page,
        limit=page_size,
        total_pages=total_pages,
        has_prev=has_prev,
        has_next=has_next,
        site_name=SITE_NAME,
        url_project=PROJECT_URL,
    )


@app.route("/health")
def health():
    """
    Health check endpoint for Docker.

    Runs the database health check of the shared search engine and reports
    the outcome.

    Returns:
        JSON with health status: HTTP 200 when the check passes, HTTP 503
        when it fails or when an exception is raised while checking
    """
    try:
        # Test database connection (the engine and its pool are reused, not cleaned up)
        database_ok = get_search_engine().db.health_check()

        if health_ok := database_ok:
            payload, status_code = {"status": "healthy", "database": "connected"}, 200
        else:
            payload, status_code = {"status": "unhealthy", "database": "disconnected"}, 503

        payload["timestamp"] = datetime.utcnow().isoformat()
        return jsonify(payload), status_code

    except Exception as exc:
        # Use safe error handler - don't expose exception details
        format_user_error(exc, "healthcheck")
        failure = {"status": "unhealthy", "error": "Service unavailable"}
        failure["timestamp"] = datetime.utcnow().isoformat()
        return jsonify(failure), 503


@app.errorhandler(404)
def not_found(error):
    """Render the shared error page for HTTP 404 responses."""
    context = {
        "error": "Page not found",
        "query": "",
        "site_name": SITE_NAME,
        "url_project": PROJECT_URL,
    }
    return render_template("pages/search_error.html", **context), 404


@app.errorhandler(500)
def internal_error(error):
    """Render the shared error page for HTTP 500 responses with a generic message."""
    # Pass the error to the safe error handler; its return value is not shown
    # to the user, who only ever sees the generic text below.
    format_user_error(error, "server")

    context = {
        "error": "An internal error occurred. Please try again.",
        "query": "",
        "site_name": SITE_NAME,
        "url_project": PROJECT_URL,
    }
    return render_template("pages/search_error.html", **context), 500


@app.errorhandler(429)
def rate_limit_exceeded(error):
    """Render the shared error page for HTTP 429 (rate limit) responses."""
    context = {
        "error": "Rate limit exceeded. Please wait a moment and try again.",
        "query": "",
        "site_name": SITE_NAME,
        "url_project": PROJECT_URL,
    }
    return render_template("pages/search_error.html", **context), 429


@app.errorhandler(400)
def csrf_error(error):
    """Render the shared error page for HTTP 400 responses.

    The handler is registered for status 400 in general, so it covers every
    bad request routed to it, not only CSRF validation failures.
    """
    # Pass the error to the safe error handler; the user only sees the
    # generic message below.
    format_user_error(error, "csrf")

    context = {
        "error": "Invalid request. Please refresh the page and try again.",
        "query": "",
        "site_name": SITE_NAME,
        "url_project": PROJECT_URL,
    }
    return render_template("pages/search_error.html", **context), 400


# ============================================================================
# JINJA2 FILTERS
# ============================================================================


@app.template_filter("highlight")
def highlight_filter(text: str, query: str) -> str:
    """
    Highlight search terms in text (simple version).

    Currently no highlighting is applied: the text is only HTML-escaped and
    ``query`` is accepted for template compatibility but not used.

    Args:
        text: Text to highlight; ``None`` is treated as an empty string
        query: Search query (currently unused)

    Returns:
        HTML-escaped text (safe to output without further escaping)
    """
    # Basic HTML escaping for now
    # Feature planned for v2.1: PostgreSQL ts_headline for advanced highlighting
    # See CHANGELOG.md roadmap for details
    #
    # escape(None) would produce the literal text "None" in the page, so a
    # missing value is rendered as an empty string instead.
    return escape("" if text is None else text)


@app.template_filter("number_format")
def number_format_filter(value: int) -> str:
    """
    Render a number with comma thousands separators (e.g., 1000 → "1,000").

    Args:
        value: Value to format; converted with ``int()`` first

    Returns:
        Formatted string with comma separators, or ``str(value)`` when the
        value cannot be converted to an integer
    """
    try:
        whole_number = int(value)
    except (ValueError, TypeError):
        # Not convertible: show the value as-is rather than failing the render
        return str(value)
    return format(whole_number, ",")


# ============================================================================
# GUNICORN HOOKS (pre-initialize search engine in each worker)
# ============================================================================


def on_starting(server):
    """Gunicorn ``on_starting`` hook: announce that the master process is starting."""
    startup_message = "Gunicorn master process starting..."
    print_info(startup_message)


def post_worker_init(worker):
    """Gunicorn ``post_worker_init`` hook: build the search engine up front.

    Forces the shared search engine to be created as soon as the worker is
    initialized, so the first request handled by this worker does not pay the
    initialization cost. A failure is reported and not re-raised.
    """
    worker_label = f"Worker {worker.pid}"
    print_info(f"{worker_label}: Pre-initializing search engine...")
    try:
        # Force initialization now instead of waiting for first request
        get_search_engine()
        print_success(f"{worker_label}: Search engine ready")
    except Exception as exc:
        print_error(f"{worker_label}: Failed to initialize search engine: {exc}")


# ============================================================================
# MAIN
# ============================================================================

if __name__ == "__main__":
    # Development server: only runs when this file is executed directly.
    for startup_line in (
        "Starting Flask search server...",
        "Access at: http://localhost:5000",
        "Health check: http://localhost:5000/health",
    ):
        print_info(startup_line)

    # Debug mode is opt-in through FLASK_DEBUG=true (case-insensitive).
    # NOTE(review): the server binds to all interfaces (0.0.0.0); enabling
    # debug mode in that configuration should be limited to trusted networks.
    debug_mode = os.environ.get("FLASK_DEBUG", "False").lower() == "true"

    app.run(host="0.0.0.0", port=5000, debug=debug_mode)