Skip to content

Commit ed2fdd0

Browse files
authored
feat: support web search in collection (#1451)
1 parent 550868e commit ed2fdd0

File tree

22 files changed

+2263
-145
lines changed

22 files changed

+2263
-145
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ evaluate:
216216
# OpenAPI and model generation
217217
.PHONY: merge-openapi generate-models generate-frontend-sdk
218218
merge-openapi:
219-
@cd aperag && redocly bundle ./api/openapi.yaml > ./api/openapi.merged.yaml
219+
@cd aperag && npx --yes @redocly/cli bundle ./api/openapi.yaml > ./api/openapi.merged.yaml
220220

221221
generate-models: merge-openapi
222222
@datamodel-codegen \

aperag/api/components/schemas/document.yaml

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,3 +251,89 @@ confirmDocumentsResponse:
251251
required:
252252
- confirmed_count
253253
- failed_count
254+
255+
fetchUrlRequest:
256+
type: object
257+
properties:
258+
urls:
259+
type: array
260+
items:
261+
type: string
262+
format: uri
263+
minItems: 1
264+
maxItems: 10
265+
description: List of URLs to fetch and import (max 10)
266+
example:
267+
- "https://example.com/article1"
268+
- "https://example.com/article2"
269+
required:
270+
- urls
271+
272+
fetchUrlResultItem:
273+
type: object
274+
properties:
275+
url:
276+
type: string
277+
description: The source URL
278+
fetch_status:
279+
type: string
280+
enum:
281+
- success
282+
- error
283+
description: Whether the URL was fetched successfully
284+
document_id:
285+
type: string
286+
description: ID of the created document (only present on success)
287+
filename:
288+
type: string
289+
description: Filename of the created document (only present on success)
290+
size:
291+
type: integer
292+
description: Size of the created document in bytes (only present on success)
293+
status:
294+
type: string
295+
description: Document status (only present on success)
296+
error:
297+
type: string
298+
description: Error message (only present on failure)
299+
required:
300+
- url
301+
- fetch_status
302+
303+
fetchUrlResponse:
304+
type: object
305+
properties:
306+
results:
307+
type: array
308+
items:
309+
$ref: '#/fetchUrlResultItem'
310+
description: Results for each URL
311+
total:
312+
type: integer
313+
description: Total number of URLs processed
314+
succeeded:
315+
type: integer
316+
description: Number of URLs successfully fetched
317+
failed:
318+
type: integer
319+
description: Number of URLs that failed
320+
required:
321+
- results
322+
- total
323+
- succeeded
324+
- failed
325+
326+
stagedDocumentsResponse:
327+
type: object
328+
properties:
329+
documents:
330+
type: array
331+
items:
332+
$ref: '#/uploadDocumentResponse'
333+
description: List of staged (UPLOADED) documents awaiting confirmation
334+
total:
335+
type: integer
336+
description: Total number of staged documents
337+
required:
338+
- documents
339+
- total

aperag/api/openapi.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ paths:
7373
$ref: './paths/collections.yaml#/upload_document'
7474
/collections/{collection_id}/documents/confirm:
7575
$ref: './paths/collections.yaml#/confirm_documents'
76+
/collections/{collection_id}/documents/fetch-url:
77+
$ref: './paths/collections.yaml#/fetch_url_document'
78+
/collections/{collection_id}/documents/staged:
79+
$ref: './paths/collections.yaml#/list_staged_documents'
7680
/collections/{collection_id}/searches:
7781
$ref: './paths/collections.yaml#/searches'
7882
/collections/{collection_id}/searches/{search_id}:

aperag/api/paths/collections.yaml

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,98 @@ confirm_documents:
664664
schema:
665665
$ref: '../components/schemas/common.yaml#/failResponse'
666666

667+
fetch_url_document:
668+
post:
669+
summary: Fetch documents from URLs
670+
description: |
671+
Fetch web page content from one or more URLs and create UPLOADED documents.
672+
Each URL is fetched using the web read service (JINA with Trafilatura fallback).
673+
Successfully fetched URLs produce UPLOADED documents in the staging area,
674+
identical to file uploads. Use the confirm endpoint to move them to PENDING and start indexing.
675+
security:
676+
- BearerAuth: []
677+
parameters:
678+
- name: collection_id
679+
in: path
680+
required: true
681+
schema:
682+
type: string
683+
requestBody:
684+
required: true
685+
content:
686+
application/json:
687+
schema:
688+
$ref: '../components/schemas/document.yaml#/fetchUrlRequest'
689+
examples:
690+
single_url:
691+
summary: Single URL
692+
value:
693+
urls:
694+
- "https://example.com/article"
695+
multiple_urls:
696+
summary: Multiple URLs
697+
value:
698+
urls:
699+
- "https://example.com/article1"
700+
- "https://example.com/article2"
701+
responses:
702+
'200':
703+
description: URL fetch completed (partial success is also 200)
704+
content:
705+
application/json:
706+
schema:
707+
$ref: '../components/schemas/document.yaml#/fetchUrlResponse'
708+
'400':
709+
description: Bad request - invalid URLs or too many URLs
710+
content:
711+
application/json:
712+
schema:
713+
$ref: '../components/schemas/common.yaml#/failResponse'
714+
'401':
715+
description: Unauthorized
716+
content:
717+
application/json:
718+
schema:
719+
$ref: '../components/schemas/common.yaml#/failResponse'
720+
'404':
721+
description: Collection not found
722+
content:
723+
application/json:
724+
schema:
725+
$ref: '../components/schemas/common.yaml#/failResponse'
726+
727+
list_staged_documents:
728+
get:
729+
summary: List staged documents
730+
description: Returns all UPLOADED (staged) documents for the collection that are awaiting confirmation.
731+
security:
732+
- BearerAuth: []
733+
parameters:
734+
- name: collection_id
735+
in: path
736+
required: true
737+
schema:
738+
type: string
739+
responses:
740+
'200':
741+
description: Staged documents retrieved successfully
742+
content:
743+
application/json:
744+
schema:
745+
$ref: '../components/schemas/document.yaml#/stagedDocumentsResponse'
746+
'401':
747+
description: Unauthorized
748+
content:
749+
application/json:
750+
schema:
751+
$ref: '../components/schemas/common.yaml#/failResponse'
752+
'404':
753+
description: Collection not found
754+
content:
755+
application/json:
756+
schema:
757+
$ref: '../components/schemas/common.yaml#/failResponse'
758+
667759
searches:
668760
get:
669761
summary: Get search history

aperag/schema/view_models.py

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,23 @@
1414

1515
# generated by datamodel-codegen:
1616
# filename: openapi.merged.yaml
17-
# timestamp: 2026-03-04T08:41:38+00:00
17+
# timestamp: 2026-03-09T07:45:40+00:00
1818

1919
from __future__ import annotations
2020

2121
from datetime import datetime
2222
from typing import Any, Literal, Optional, Union
2323

24-
from pydantic import BaseModel, ConfigDict, EmailStr, Field, RootModel, confloat, conint
24+
from pydantic import (
25+
AnyUrl,
26+
BaseModel,
27+
ConfigDict,
28+
EmailStr,
29+
Field,
30+
RootModel,
31+
confloat,
32+
conint,
33+
)
2534

2635

2736
class ModelSpec(BaseModel):
@@ -874,6 +883,51 @@ class ConfirmDocumentsResponse(BaseModel):
874883
)
875884

876885

886+
class FetchUrlRequest(BaseModel):
    # Request body listing the web pages to fetch and import into a collection.
    # Plain comments are used instead of a docstring so the generated model's
    # schema metadata stays exactly as the generator produced it.
    urls: list[AnyUrl] = Field(
        ...,
        description='List of URLs to fetch and import (max 10)',
        examples=[['https://example.com/article1', 'https://example.com/article2']],
        # Mirror the OpenAPI contract for this field (minItems: 1, maxItems: 10).
        # Without these bounds an empty list or more than 10 URLs would pass
        # model validation even though the description promises "max 10".
        # NOTE(review): this module is generated by datamodel-codegen; confirm
        # the generator emits these constraints so they survive regeneration.
        min_length=1,
        max_length=10,
    )
892+
893+
894+
class FetchUrlResultItem(BaseModel):
    # Outcome of fetching one URL. Only `url` and `fetch_status` are required;
    # every other field defaults to None.

    # --- always present ---
    url: str = Field(default=..., description='The source URL')
    fetch_status: Literal['success', 'error'] = Field(
        default=...,
        description='Whether the URL was fetched successfully',
    )

    # --- details of the created document (described as success-only) ---
    document_id: Optional[str] = Field(
        default=None,
        description='ID of the created document (only present on success)',
    )
    filename: Optional[str] = Field(
        default=None,
        description='Filename of the created document (only present on success)',
    )
    size: Optional[int] = Field(
        default=None,
        description='Size of the created document in bytes (only present on success)',
    )
    status: Optional[str] = Field(
        default=None,
        description='Document status (only present on success)',
    )

    # --- failure detail (described as failure-only) ---
    error: Optional[str] = Field(
        default=None,
        description='Error message (only present on failure)',
    )
915+
916+
917+
class FetchUrlResponse(BaseModel):
    # Aggregate response for a fetch-url request: one result entry per URL
    # plus summary counters. All four fields are required.
    results: list[FetchUrlResultItem] = Field(
        default=...,
        description='Results for each URL',
    )
    total: int = Field(
        default=...,
        description='Total number of URLs processed',
    )
    succeeded: int = Field(
        default=...,
        description='Number of URLs successfully fetched',
    )
    failed: int = Field(
        default=...,
        description='Number of URLs that failed',
    )
922+
923+
924+
class StagedDocumentsResponse(BaseModel):
    # Response listing the staged (UPLOADED) documents of a collection that
    # are still awaiting confirmation, together with their count.
    # Both fields are required.
    documents: list[UploadDocumentResponse] = Field(
        default=...,
        description='List of staged (UPLOADED) documents awaiting confirmation',
    )
    total: int = Field(
        default=...,
        description='Total number of staged documents',
    )
929+
930+
877931
class VectorSearchParams(BaseModel):
878932
topk: Optional[int] = Field(None, description='Top K results')
879933
similarity: Optional[confloat(ge=0.0, le=1.0)] = Field(

0 commit comments

Comments
 (0)