Skip to content

Commit 1f85969

Browse files
authored
Add support for Dense Retrievers in REST API Indexing Pipeline (#1430)
1 parent 9dd7c74 commit 1f85969

File tree

3 files changed

+76
-10
lines changed

3 files changed

+76
-10
lines changed

rest_api/controller/file_upload.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,24 @@
1717
router = APIRouter()
1818

1919
try:
20-
INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
20+
_, pipeline_config, definitions = Pipeline._read_yaml(
21+
path=Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME, overwrite_with_env_variables=True
22+
)
23+
# Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would
24+
# end up with different indices. The check below prevents creation of Indexing Pipelines with FAISSDocumentStore.
25+
is_faiss_present = False
26+
for node in pipeline_config["nodes"]:
27+
if definitions[node["name"]]["type"] == "FAISSDocumentStore":
28+
is_faiss_present = True
29+
break
30+
if is_faiss_present:
31+
logger.warning("Indexing Pipeline with FAISSDocumentStore is not supported with the REST APIs.")
32+
INDEXING_PIPELINE = None
33+
else:
34+
INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
2135
except KeyError:
2236
INDEXING_PIPELINE = None
23-
logger.info("Indexing Pipeline not found in the YAML configuration. File Upload API will not be available.")
37+
logger.warning("Indexing Pipeline not found in the YAML configuration. File Upload API will not be available.")
2438

2539

2640
os.makedirs(FILE_UPLOAD_PATH, exist_ok=True) # create directory for uploading files

rest_api/pipeline/pipelines.yaml

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
1-
version: '0.7'
1+
version: '0.9'
22

33
components: # define all the building-blocks for Pipeline
4-
- name: ElasticsearchDocumentStore
4+
- name: DocumentStore
55
type: ElasticsearchDocumentStore
66
params:
77
host: localhost
8-
- name: ESRetriever
8+
- name: Retriever
99
type: ElasticsearchRetriever
1010
params:
11-
document_store: ElasticsearchDocumentStore # params can reference other components defined in the YAML
11+
document_store: DocumentStore # params can reference other components defined in the YAML
1212
top_k: 5
1313
- name: Reader # custom-name for the component; helpful for visualization & debugging
1414
type: FARMReader # Haystack Class name for the component
@@ -30,11 +30,10 @@ pipelines:
3030
- name: query # a sample extractive-qa Pipeline
3131
type: Query
3232
nodes:
33-
- name: ESRetriever
33+
- name: Retriever
3434
inputs: [Query]
3535
- name: Reader
36-
inputs: [ESRetriever]
37-
36+
inputs: [Retriever]
3837
- name: indexing
3938
type: Indexing
4039
nodes:
@@ -46,5 +45,7 @@ pipelines:
4645
inputs: [FileTypeClassifier.output_2]
4746
- name: Preprocessor
4847
inputs: [PDFFileConverter, TextFileConverter]
49-
- name: ElasticsearchDocumentStore
48+
- name: Retriever
5049
inputs: [Preprocessor]
50+
- name: DocumentStore
51+
inputs: [Retriever]
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
version: '0.9'
2+
3+
components: # define all the building-blocks for Pipeline
4+
- name: DocumentStore
5+
type: ElasticsearchDocumentStore # consider using MilvusDocumentStore or WeaviateDocumentStore for scaling to large number of documents
6+
params:
7+
host: localhost
8+
- name: Retriever
9+
type: DensePassageRetriever
10+
params:
11+
document_store: DocumentStore # params can reference other components defined in the YAML
12+
top_k: 5
13+
- name: Reader # custom-name for the component; helpful for visualization & debugging
14+
type: FARMReader # Haystack Class name for the component
15+
params:
16+
model_name_or_path: deepset/roberta-base-squad2
17+
- name: TextFileConverter
18+
type: TextConverter
19+
- name: PDFFileConverter
20+
type: PDFToTextConverter
21+
- name: Preprocessor
22+
type: PreProcessor
23+
params:
24+
split_by: word
25+
split_length: 1000
26+
- name: FileTypeClassifier
27+
type: FileTypeClassifier
28+
29+
pipelines:
30+
- name: query # a sample extractive-qa Pipeline
31+
type: Query
32+
nodes:
33+
- name: Retriever
34+
inputs: [Query]
35+
- name: Reader
36+
inputs: [Retriever]
37+
- name: indexing
38+
type: Indexing
39+
nodes:
40+
- name: FileTypeClassifier
41+
inputs: [File]
42+
- name: TextFileConverter
43+
inputs: [FileTypeClassifier.output_1]
44+
- name: PDFFileConverter
45+
inputs: [FileTypeClassifier.output_2]
46+
- name: Preprocessor
47+
inputs: [PDFFileConverter, TextFileConverter]
48+
- name: Retriever
49+
inputs: [Preprocessor]
50+
- name: DocumentStore
51+
inputs: [Retriever]

0 commit comments

Comments (0)