diff --git a/config/settings/base.py b/config/settings/base.py index 3b1deea2..d312ea97 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -912,6 +912,14 @@ → Need high-level overview of document structure → Understanding document organization before detailed search +Use `generate_annotation_hyperlink` when: + 🔴 MANDATORY: When returning annotations from similarity_search or search_exact_text + → You found relevant annotations and want to cite them in your response + → You need to reference specific passages that are stored as annotations + → Creating clickable links to relevant passages for the user + → Providing easy navigation to source material + → The tool creates markdown links like [annotation text](URL) that open directly to the annotation + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ ✅ RESPONSE REQUIREMENTS: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ @@ -920,9 +928,35 @@ • Include specific citations (page numbers, quotes) from tool results • 🔴 CRITICAL: If you used `load_document_text`, you MUST use `search_exact_text` on key passages to generate proper citations. Otherwise your answer will have NO SOURCES. +• 🔴 MANDATORY: When your search results include annotation IDs (from similarity_search or + search_exact_text), you MUST call `generate_annotation_hyperlink(annotation_id)` for each + relevant annotation and include the resulting markdown links in your response. This allows + users to click directly to the exact location in the document. 
• If information isn't in the document, explicitly state it was not found • Use multiple search strategies to ensure thoroughness -• Present findings clearly with proper attribution to sources""" +• Present findings clearly with proper attribution to sources + +Example with annotation hyperlinks: + ❌ BAD: "The contract states 'payment due within 30 days' (page 5)" + ✅ GOOD: "[payment due within 30 days](URL) appears on page 5" + + Where the URL is generated by calling generate_annotation_hyperlink(annotation_id) + +🚨 BEFORE SENDING YOUR FINAL RESPONSE - MANDATORY HYPERLINK REVIEW: +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +You MUST perform this review process before returning your response to the user: + +1. 📝 READ your drafted response text from start to finish +2. 🔍 IDENTIFY every place where you quoted or referenced specific passages from the document +3. 🔗 For EACH passage/quote that came from an annotation: + - Look up the annotation_id from your search results + - Call `generate_annotation_hyperlink(annotation_id)` to get the markdown link + - REPLACE the plain text quote with the markdown hyperlink +4. ✅ VERIFY that key passages now have clickable [links](URL) to their locations +5. ⚠️ If you quoted annotations without making them hyperlinks, go back and add them NOW + +This is NOT optional. If your response quotes passages without hyperlinks, you're not providing +full value to the user. They need to click directly to the source.""" DEFAULT_CORPUS_AGENT_INSTRUCTIONS = """You are a helpful corpus analysis assistant. Your role is to help users understand and analyze collections of documents by coordinating across @@ -932,11 +966,32 @@ 1. ALWAYS use tools to gather information before answering 2. You have access to multiple documents - use them effectively 3. ALWAYS cite sources from specific documents when making claims +4. 
🔴 MANDATORY: Every time you mention a specific document by name or reference one in your response, + you MUST call `generate_document_hyperlink(document_id)` and include the resulting markdown link + in your response. This is NOT optional - users need clickable links to navigate to documents. **Available Tools:** - **Document-Specific Tools**: Available via `ask_document(document_id, question)` - **Corpus-Level Tools**: `list_documents()` to see all available documents - **Cross-Document Search**: Semantic search across the entire corpus +- **Hyperlink Tools** (MUST USE WHEN REFERENCING DOCUMENTS): + - `generate_document_hyperlink(document_id)` - Creates clickable markdown links to documents + - `generate_corpus_hyperlink(corpus_id)` - Creates clickable markdown links to the corpus + +**MANDATORY HYPERLINK REQUIREMENTS:** +🔴 You MUST generate hyperlinks in these situations (NO EXCEPTIONS): + ✓ When mentioning a document by title or name + ✓ When citing information from a specific document + ✓ When listing documents as part of your answer + ✓ When recommending documents for the user to review + ✓ When answering questions about specific documents + +Example responses: + ❌ BAD: "Document ABC123 contains information about..." + ✅ GOOD: "[Document Title](URL) contains information about..." + + ❌ BAD: "See documents X, Y, and Z for more details" + ✅ GOOD: "See [Doc X](URL), [Doc Y](URL), and [Doc Z](URL) for more details" **Recommended Strategy:** 1. If the corpus has a description, use it as context @@ -946,10 +1001,26 @@ - Use cross-document vector search for themes across documents 3. Synthesize information from multiple sources 4. Always cite which document(s) your information comes from +5. 
🔴 CRITICAL: For EVERY document you reference, call `generate_document_hyperlink(document_id)` + and include the markdown link in your response - this is MANDATORY, not optional + +🚨 BEFORE SENDING YOUR FINAL RESPONSE - MANDATORY HYPERLINK REVIEW: +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +You MUST perform this review process before returning your response to the user: + +1. 📝 READ your drafted response text from start to finish +2. 🔍 IDENTIFY every place where you mentioned a document (by title, name, or reference) +3. 🔗 For EACH document reference found: + - Call `generate_document_hyperlink(document_id)` to get the markdown link + - REPLACE the plain text reference with the markdown hyperlink +4. ✅ VERIFY that every document mentioned now has a clickable [link](URL) +5. ⚠️ If you find ANY document reference without a hyperlink, you MUST fix it before responding + +This is NOT optional. If your response mentions documents without hyperlinks, go back and add them NOW. **When Corpus Has No Description:** Don't just say "the corpus description is empty" - that's not helpful! Instead: -1. List available documents +1. List available documents (with hyperlinks!) 2. Ask the user which documents they want to know about 3. 
OR proactively examine key documents to provide a useful summary diff --git a/config/websocket/consumers/standalone_document_conversation.py b/config/websocket/consumers/standalone_document_conversation.py index ae3b9a24..e98a17ac 100644 --- a/config/websocket/consumers/standalone_document_conversation.py +++ b/config/websocket/consumers/standalone_document_conversation.py @@ -169,33 +169,29 @@ async def pick_document_embedder(self) -> str: """ from opencontractserver.annotations.models import Embedding - # Extract document ID in async context before passing to sync function document_id = self.document.id - def get_embedder_paths(): - """ - Construct AND evaluate queryset in same DB connection to avoid - transaction isolation issues with database_sync_to_async. - """ - return list( - Embedding.objects.filter( - annotation__document_id=document_id, - annotation__structural=True, - ) - .values_list("embedder_path", flat=True) - .distinct() - ) + # Use async iteration to query in the same transaction context + # This avoids transaction isolation issues with database_sync_to_async + embeddings = [ + e + async for e in Embedding.objects.filter( + annotation__document_id=document_id, + annotation__structural=True, + ).only("embedder_path") + ] - paths = await database_sync_to_async(get_embedder_paths)() + # Get unique embedder paths + paths = list({e.embedder_path for e in embeddings}) if paths: logger.info( - f"[Session {self.session_id}] Using existing embedder: {paths[0]} for Document {getattr(self, 'document_id', 'unknown')}" # noqa: E501 + f"[Session {self.session_id}] Using existing embedder: {paths[0]} for Document {document_id}" ) return paths[0] else: logger.warning( - f"[Session {self.session_id}] No existing embedder found for Document {getattr(self, 'document_id', 'unknown')}, " # noqa: E501 + f"[Session {self.session_id}] No existing embedder found for Document {document_id}, " f"falling back to DEFAULT_EMBEDDER: {settings.DEFAULT_EMBEDDER}" ) return 
settings.DEFAULT_EMBEDDER diff --git a/opencontractserver/llms/tools/__init__.py b/opencontractserver/llms/tools/__init__.py index 6eac3417..50381b75 100644 --- a/opencontractserver/llms/tools/__init__.py +++ b/opencontractserver/llms/tools/__init__.py @@ -5,8 +5,14 @@ """ from opencontractserver.llms.tools.core_tools import ( + agenerate_annotation_hyperlink, + agenerate_corpus_hyperlink, + agenerate_document_hyperlink, aload_document_md_summary, aload_document_txt_extract, + generate_annotation_hyperlink, + generate_corpus_hyperlink, + generate_document_hyperlink, get_md_summary_token_length, get_note_content_token_length, get_notes_for_document_corpus, @@ -31,6 +37,13 @@ "load_document_txt_extract", "aload_document_txt_extract", "aload_document_md_summary", + # Hyperlink generation tools + "generate_document_hyperlink", + "generate_annotation_hyperlink", + "generate_corpus_hyperlink", + "agenerate_document_hyperlink", + "agenerate_annotation_hyperlink", + "agenerate_corpus_hyperlink", # Factory and metadata "CoreTool", "ToolMetadata", diff --git a/opencontractserver/llms/tools/core_tools.py b/opencontractserver/llms/tools/core_tools.py index dc75badf..05c74abc 100644 --- a/opencontractserver/llms/tools/core_tools.py +++ b/opencontractserver/llms/tools/core_tools.py @@ -9,7 +9,9 @@ if TYPE_CHECKING: from opencontractserver.llms.agents.core_agents import SourceNode -from opencontractserver.annotations.models import Note, NoteRevision +from django.conf import settings + +from opencontractserver.annotations.models import Annotation, Note, NoteRevision from opencontractserver.corpuses.models import Corpus, CorpusDescriptionRevision from opencontractserver.documents.models import Document @@ -1863,3 +1865,453 @@ async def aget_page_image( image_format=image_format, dpi=dpi, ) + + +# ============================================================================== +# Hyperlink Generation Tools +# 
# ==============================================================================


def _get_base_url() -> str:
    """Return the frontend base URL used as the prefix of generated links.

    Returns:
        ``settings.FRONTEND_URL`` when that setting is defined, otherwise
        ``http://localhost:3000``.
    """
    return getattr(settings, "FRONTEND_URL", "http://localhost:3000")


def _get_user_slug(user) -> str:
    """Return the URL identifier for a user: its slug, else its username.

    Args:
        user: Django user instance (any object exposing ``slug`` and/or
            ``username`` works).

    Returns:
        ``user.slug`` when it is set and non-empty, otherwise ``user.username``.

    Raises:
        ValueError: If the user has neither a slug nor a username. This also
            covers ``user`` being ``None``, which previously escaped as an
            ``AttributeError`` while the error message was being built.
    """
    identifier = getattr(user, "slug", None) or getattr(user, "username", None)
    if not identifier:
        raise ValueError(
            f"User {getattr(user, 'id', None)} has neither slug nor username"
        )
    return identifier


def _build_query_params(
    annotation_ids: Optional[list[int]] = None,
    analysis_ids: Optional[list[int]] = None,
    extract_ids: Optional[list[int]] = None,
    structural: bool = False,
    selected_only: bool = False,
    bounding_boxes: bool = False,
    labels: str = "ON_HOVER",
) -> str:
    """Build the query string for a document or corpus URL.

    Only non-default values are emitted so that URLs stay short, following the
    routing convention referenced in docs/frontend/routing_system.md.

    Args:
        annotation_ids: Annotation IDs to select (``ann=1,2``).
        analysis_ids: Analysis IDs to show (``analysis=...``).
        extract_ids: Extract IDs to show (``extract=...``).
        structural: Show structural annotations.
        selected_only: Show only selected annotations.
        bounding_boxes: Show annotation bounding boxes.
        labels: Label display behavior (ALWAYS|ON_HOVER|HIDE); ``ON_HOVER`` is
            the default and is therefore omitted.

    Returns:
        Query string including the leading ``?``, or ``""`` when every
        argument has its default value.
    """
    params = []

    # Selection parameters: comma-separated ID lists, skipped when empty/None.
    for key, ids in (
        ("ann", annotation_ids),
        ("analysis", analysis_ids),
        ("extract", extract_ids),
    ):
        if ids:
            params.append(f"{key}={','.join(map(str, ids))}")

    # Visualization flags are only written when switched on.
    for key, enabled in (
        ("structural", structural),
        ("selectedOnly", selected_only),
        ("boundingBoxes", bounding_boxes),
    ):
        if enabled:
            params.append(f"{key}=true")

    if labels != "ON_HOVER":
        params.append(f"labels={labels}")

    return "?" + "&".join(params) if params else ""


def _require_slug(obj, kind: str, obj_id) -> str:
    """Return ``obj.slug`` or raise if it is missing, ``None`` or blank.

    Args:
        obj: Model instance expected to carry a ``slug`` attribute.
        kind: Human-readable model name used in the error ("Document"/"Corpus").
        obj_id: Identifier reported in the error message.

    Raises:
        ValueError: If the slug is absent or contains only whitespace.
    """
    slug = getattr(obj, "slug", None)
    if slug is None or not str(slug).strip():
        raise ValueError(f"{kind} {obj_id} lacks slug field")
    return slug


def _join_url(base_url: Optional[str], path: str, query: str) -> str:
    """Concatenate base URL, path and query, avoiding a double slash.

    The configured frontend URL is only consulted when ``base_url`` is falsy.
    """
    base = base_url or _get_base_url()
    return f"{base.rstrip('/')}{path}{query}"


def _fetch_document(document_id: int) -> "Document":
    """Load a document together with its creator in a single query.

    Raises:
        ValueError: If no document has this primary key.
    """
    try:
        return Document.objects.select_related("creator").get(pk=document_id)
    except Document.DoesNotExist as exc:
        raise ValueError(f"Document with id={document_id} does not exist") from exc


def _fetch_corpus(corpus_id: int) -> "Corpus":
    """Load a corpus together with its creator in a single query.

    Raises:
        ValueError: If no corpus has this primary key.
    """
    try:
        return Corpus.objects.select_related("creator").get(pk=corpus_id)
    except Corpus.DoesNotExist as exc:
        raise ValueError(f"Corpus with id={corpus_id} does not exist") from exc


def _fetch_annotation(annotation_id: int) -> "Annotation":
    """Load an annotation with its document, document creator and corpus.

    Everything needed to build the link is joined in up front so that no
    follow-up queries are required.

    Raises:
        ValueError: If no annotation has this primary key.
    """
    try:
        return Annotation.objects.select_related("document__creator", "corpus").get(
            pk=annotation_id
        )
    except Annotation.DoesNotExist as exc:
        raise ValueError(f"Annotation with id={annotation_id} does not exist") from exc


def _document_url(
    document,
    *,
    corpus_id: Optional[int] = None,
    corpus=None,
    annotation_ids: Optional[list[int]] = None,
    analysis_ids: Optional[list[int]] = None,
    extract_ids: Optional[list[int]] = None,
    structural: bool = False,
    selected_only: bool = False,
    bounding_boxes: bool = False,
    labels: str = "ON_HOVER",
    base_url: Optional[str] = None,
) -> str:
    """Build the frontend URL for an already-loaded document.

    Args:
        document: Document instance (its ``creator`` is read for the user slug).
        corpus_id: Corpus to look up when ``corpus`` is not supplied; a falsy
            value means "standalone document".
        corpus: Pre-loaded corpus instance; takes precedence over ``corpus_id``.
        annotation_ids, analysis_ids, extract_ids, structural, selected_only,
        bounding_boxes, labels: Forwarded to :func:`_build_query_params`.
        base_url: Optional base URL override.

    Raises:
        ValueError: If a slug is missing, the creator has no identifier, or
            ``corpus_id`` does not match an existing corpus.
    """
    # Validate the document before touching the corpus so that document
    # problems are reported first.
    doc_slug = _require_slug(document, "Document", document.id)
    user_slug = _get_user_slug(document.creator)

    if corpus is None and corpus_id:
        corpus = _fetch_corpus(corpus_id)

    if corpus is not None:
        corpus_slug = _require_slug(corpus, "Corpus", corpus.id)
        # NOTE(review): the user segment is the *document* creator's slug even
        # inside a corpus path - confirm against the routing docs that the
        # corpus owner's slug is not expected here.
        path = f"/d/{user_slug}/{corpus_slug}/{doc_slug}"
    else:
        path = f"/d/{user_slug}/{doc_slug}"

    query = _build_query_params(
        annotation_ids=annotation_ids,
        analysis_ids=analysis_ids,
        extract_ids=extract_ids,
        structural=structural,
        selected_only=selected_only,
        bounding_boxes=bounding_boxes,
        labels=labels,
    )
    return _join_url(base_url, path, query)


def _annotation_url(
    annotation,
    *,
    corpus_id: Optional[int] = None,
    additional_annotation_ids: Optional[list[int]] = None,
    structural: bool = True,
    selected_only: bool = False,
    bounding_boxes: bool = False,
    labels: str = "ALWAYS",
    base_url: Optional[str] = None,
) -> str:
    """Build the frontend URL selecting an already-loaded annotation.

    The annotation's own corpus is used when ``corpus_id`` is ``None``.

    Raises:
        ValueError: If the annotation has no document or the document/corpus
            lacks the fields required to build a URL.
    """
    document = annotation.document
    # NOTE(review): guards against a null document relation - confirm whether
    # the Annotation model permits that.
    if document is None:
        raise ValueError(f"Annotation {annotation.id} is not attached to a document")

    # Reuse the corpus loaded with the annotation instead of re-querying it.
    corpus = annotation.corpus if corpus_id is None and annotation.corpus else None

    return _document_url(
        document,
        corpus_id=corpus_id,
        corpus=corpus,
        annotation_ids=[annotation.id, *(additional_annotation_ids or [])],
        structural=structural,
        selected_only=selected_only,
        bounding_boxes=bounding_boxes,
        labels=labels,
        base_url=base_url,
    )


def _corpus_url(
    corpus,
    *,
    analysis_ids: Optional[list[int]] = None,
    extract_ids: Optional[list[int]] = None,
    base_url: Optional[str] = None,
) -> str:
    """Build the frontend URL (``/c/{userSlug}/{corpusSlug}``) for a loaded corpus.

    Raises:
        ValueError: If the corpus lacks a slug or its creator has no identifier.
    """
    corpus_slug = _require_slug(corpus, "Corpus", corpus.id)
    user_slug = _get_user_slug(corpus.creator)
    query = _build_query_params(analysis_ids=analysis_ids, extract_ids=extract_ids)
    return _join_url(base_url, f"/c/{user_slug}/{corpus_slug}", query)


def _clean_label(text) -> str:
    """Collapse all whitespace runs (including newlines) to single spaces.

    Returns ``""`` for ``None`` or blank input so callers can chain fallbacks
    with ``or``.
    """
    return " ".join(str(text).split()) if text else ""


def _snippet(text: str, limit: int = 50) -> str:
    """Truncate ``text`` to ``limit`` characters, appending ``...`` when cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _markdown_link(label: str, url: str) -> str:
    """Format ``[label](url)`` with the label made safe for markdown.

    Newlines are collapsed and backslashes/square brackets are escaped; an
    unescaped ``]`` or a line break inside the label would otherwise terminate
    or break the link. Labels without those characters are emitted unchanged.
    """
    text = _clean_label(label)
    # Escape backslashes first so the escapes added for brackets stay intact.
    text = text.replace("\\", "\\\\").replace("[", "\\[").replace("]", "\\]")
    return f"[{text}]({url})"


def generate_document_link(
    document_id: int,
    corpus_id: Optional[int] = None,
    annotation_ids: Optional[list[int]] = None,
    analysis_ids: Optional[list[int]] = None,
    extract_ids: Optional[list[int]] = None,
    structural: bool = False,
    selected_only: bool = False,
    bounding_boxes: bool = False,
    labels: str = "ON_HOVER",
    base_url: Optional[str] = None,
) -> str:
    """Generate a URL to view a document in the frontend.

    Creates URLs following the centralized routing patterns from
    docs/frontend/routing_system.md:
    - Document in corpus: /d/{userSlug}/{corpusSlug}/{docSlug}
    - Standalone document: /d/{userSlug}/{docSlug}

    Args:
        document_id: ID of the document to link to
        corpus_id: Optional corpus context
        annotation_ids: Optional list of annotation IDs to highlight
        analysis_ids: Optional list of analysis IDs to show
        extract_ids: Optional list of extract IDs to show
        structural: Show structural annotations
        selected_only: Show only selected annotations
        bounding_boxes: Show bounding boxes
        labels: Label display behavior (ALWAYS|ON_HOVER|HIDE)
        base_url: Optional base URL override (defaults to settings.FRONTEND_URL)

    Returns:
        Full URL to the document view

    Raises:
        ValueError: If document doesn't exist or lacks required fields
    """
    return _document_url(
        _fetch_document(document_id),
        corpus_id=corpus_id,
        annotation_ids=annotation_ids,
        analysis_ids=analysis_ids,
        extract_ids=extract_ids,
        structural=structural,
        selected_only=selected_only,
        bounding_boxes=bounding_boxes,
        labels=labels,
        base_url=base_url,
    )


def generate_annotation_link(
    annotation_id: int,
    corpus_id: Optional[int] = None,
    additional_annotation_ids: Optional[list[int]] = None,
    structural: bool = True,
    selected_only: bool = False,
    bounding_boxes: bool = False,
    labels: str = "ALWAYS",
    base_url: Optional[str] = None,
) -> str:
    """Generate a URL to view an annotation in the frontend.

    Looks up the annotation's document and generates a document link
    with the annotation selected via query parameters. Uses defaults suited
    to annotation viewing (structural=True, labels=ALWAYS).

    Args:
        annotation_id: ID of the annotation to link to
        corpus_id: Optional corpus context (if None, uses annotation's corpus)
        additional_annotation_ids: Other annotation IDs to select alongside
        structural: Show structural annotations (default True for annotations)
        selected_only: Show only selected annotations
        bounding_boxes: Show bounding boxes
        labels: Label display behavior (default ALWAYS for annotations)
        base_url: Optional base URL override

    Returns:
        Full URL to view the annotation

    Raises:
        ValueError: If annotation doesn't exist or lacks required fields
    """
    return _annotation_url(
        _fetch_annotation(annotation_id),
        corpus_id=corpus_id,
        additional_annotation_ids=additional_annotation_ids,
        structural=structural,
        selected_only=selected_only,
        bounding_boxes=bounding_boxes,
        labels=labels,
        base_url=base_url,
    )


def generate_corpus_link(
    corpus_id: int,
    analysis_ids: Optional[list[int]] = None,
    extract_ids: Optional[list[int]] = None,
    base_url: Optional[str] = None,
) -> str:
    """Generate a URL to view a corpus in the frontend.

    Creates URLs following the pattern: /c/{userSlug}/{corpusSlug}

    Args:
        corpus_id: ID of the corpus to link to
        analysis_ids: Optional list of analysis IDs to show
        extract_ids: Optional list of extract IDs to show
        base_url: Optional base URL override

    Returns:
        Full URL to the corpus view

    Raises:
        ValueError: If corpus doesn't exist or lacks required fields
    """
    return _corpus_url(
        _fetch_corpus(corpus_id),
        analysis_ids=analysis_ids,
        extract_ids=extract_ids,
        base_url=base_url,
    )


def generate_document_hyperlink(
    document_id: int,
    corpus_id: Optional[int] = None,
    link_text: Optional[str] = None,
) -> str:
    """Generate a markdown hyperlink to view a document.

    This tool creates a markdown-formatted link that can be embedded in
    LLM responses. The link opens the specified document in the OpenContracts
    frontend interface.

    Args:
        document_id: The ID of the document to link to
        corpus_id: Optional corpus context (shows document within corpus)
        link_text: Custom text for the link (defaults to document title)

    Returns:
        Markdown formatted link like: [Document Title](URL)

    Example:
        >>> generate_document_hyperlink(42, corpus_id=1)
        '[Contract Agreement](http://localhost:3000/d/john/my-corpus/contract)'

    Raises:
        ValueError: If document doesn't exist or lacks required fields
    """
    # One fetch serves both the URL and the default label.
    document = _fetch_document(document_id)
    url = _document_url(document, corpus_id=corpus_id)

    label = (
        _clean_label(link_text)
        or _clean_label(document.title)
        or f"Document #{document_id}"
    )
    return _markdown_link(label, url)


def generate_annotation_hyperlink(
    annotation_id: int,
    link_text: Optional[str] = None,
    include_structural: bool = True,
) -> str:
    """Generate a markdown hyperlink to view a specific annotation.

    This tool creates a markdown-formatted link that opens the document
    containing the annotation with the annotation highlighted and selected.
    Structural annotations are visible by default and labels are always shown.

    Args:
        annotation_id: The ID of the annotation to link to
        link_text: Custom text for the link (defaults to annotation text snippet)
        include_structural: Whether to show structural annotations (default True)

    Returns:
        Markdown formatted link like: [Annotation Text...](URL)

    Example:
        >>> generate_annotation_hyperlink(123)
        '[This clause specifies payment terms...](http://localhost:3000/...?ann=123)'

    Raises:
        ValueError: If annotation doesn't exist or lacks required fields
    """
    # One fetch (with document, creator and corpus joined) serves URL and label.
    annotation = _fetch_annotation(annotation_id)
    url = _annotation_url(annotation, structural=include_structural, labels="ALWAYS")

    # Normalise whitespace *before* truncating so the 50-character budget is
    # spent on visible text, then fall back to a generic label if nothing is left.
    label = (
        _clean_label(link_text)
        or _snippet(_clean_label(annotation.raw_text))
        or f"Annotation #{annotation_id}"
    )
    return _markdown_link(label, url)


def generate_corpus_hyperlink(corpus_id: int, link_text: Optional[str] = None) -> str:
    """Generate a markdown hyperlink to view a corpus.

    This tool creates a markdown-formatted link that opens the specified
    corpus in the OpenContracts frontend interface.

    Args:
        corpus_id: The ID of the corpus to link to
        link_text: Custom text for the link (defaults to corpus title)

    Returns:
        Markdown formatted link like: [Corpus Title](URL)

    Example:
        >>> generate_corpus_hyperlink(1)
        '[Legal Contracts 2024](http://localhost:3000/c/john/legal-contracts-2024)'

    Raises:
        ValueError: If corpus doesn't exist or lacks required fields
    """
    # One fetch serves both the URL and the default label.
    corpus = _fetch_corpus(corpus_id)
    url = _corpus_url(corpus)

    label = (
        _clean_label(link_text) or _clean_label(corpus.title) or f"Corpus #{corpus_id}"
    )
    return _markdown_link(label, url)


async def agenerate_document_hyperlink(
    document_id: int,
    corpus_id: Optional[int] = None,
    link_text: Optional[str] = None,
) -> str:
    """Async wrapper around :func:`generate_document_hyperlink`."""
    return await _db_sync_to_async(generate_document_hyperlink)(
        document_id=document_id, corpus_id=corpus_id, link_text=link_text
    )


async def agenerate_annotation_hyperlink(
    annotation_id: int,
    link_text: Optional[str] = None,
    include_structural: bool = True,
) -> str:
    """Async wrapper around :func:`generate_annotation_hyperlink`."""
    return await _db_sync_to_async(generate_annotation_hyperlink)(
        annotation_id=annotation_id,
        link_text=link_text,
        include_structural=include_structural,
    )


async def agenerate_corpus_hyperlink(
    corpus_id: int, link_text: Optional[str] = None
) -> str:
    """Async wrapper around :func:`generate_corpus_hyperlink`."""
    return await _db_sync_to_async(generate_corpus_hyperlink)(
        corpus_id=corpus_id, link_text=link_text
    )