Skip to content

Commit 1f85969

Browse files
authored
Add support for Dense Retrievers in REST API Indexing Pipeline (#1430)
1 parent 9dd7c74 commit 1f85969

File tree

3 files changed

+76
-10
lines changed

3 files changed

+76
-10
lines changed

rest_api/controller/file_upload.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,24 @@
1717
router = APIRouter()
1818

1919
try:
20-
INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
20+
_, pipeline_config, definitions = Pipeline._read_yaml(
21+
path=Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME, overwrite_with_env_variables=True
22+
)
23+
# Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would
24+
# end up with different indices. The check below prevents creation of Indexing Pipelines with FAISSDocumentStore.
25+
is_faiss_present = False
26+
for node in pipeline_config["nodes"]:
27+
if definitions[node["name"]]["type"] == "FAISSDocumentStore":
28+
is_faiss_present = True
29+
break
30+
if is_faiss_present:
31+
logger.warning("Indexing Pipeline with FAISSDocumentStore is not supported with the REST APIs.")
32+
INDEXING_PIPELINE = None
33+
else:
34+
INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
2135
except KeyError:
2236
INDEXING_PIPELINE = None
23-
logger.info("Indexing Pipeline not found in the YAML configuration. File Upload API will not be available.")
37+
logger.warning("Indexing Pipeline not found in the YAML configuration. File Upload API will not be available.")
2438

2539

2640
os.makedirs(FILE_UPLOAD_PATH, exist_ok=True) # create directory for uploading files

rest_api/pipeline/pipelines.yaml

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
1-
version: '0.7'
1+
version: '0.9'
22

33
components: # define all the building-blocks for Pipeline
4-
- name: ElasticsearchDocumentStore
4+
- name: DocumentStore
55
type: ElasticsearchDocumentStore
66
params:
77
host: localhost
8-
- name: ESRetriever
8+
- name: Retriever
99
type: ElasticsearchRetriever
1010
params:
11-
document_store: ElasticsearchDocumentStore # params can reference other components defined in the YAML
11+
document_store: DocumentStore # params can reference other components defined in the YAML
1212
top_k: 5
1313
- name: Reader # custom-name for the component; helpful for visualization & debugging
1414
type: FARMReader # Haystack Class name for the component
@@ -30,11 +30,10 @@ pipelines:
3030
- name: query # a sample extractive-qa Pipeline
3131
type: Query
3232
nodes:
33-
- name: ESRetriever
33+
- name: Retriever
3434
inputs: [Query]
3535
- name: Reader
36-
inputs: [ESRetriever]
37-
36+
inputs: [Retriever]
3837
- name: indexing
3938
type: Indexing
4039
nodes:
@@ -46,5 +45,7 @@ pipelines:
4645
inputs: [FileTypeClassifier.output_2]
4746
- name: Preprocessor
4847
inputs: [PDFFileConverter, TextFileConverter]
49-
- name: ElasticsearchDocumentStore
48+
- name: Retriever
5049
inputs: [Preprocessor]
50+
- name: DocumentStore
51+
inputs: [Retriever]
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
version: '0.9'
2+
3+
components: # define all the building-blocks for Pipeline
4+
- name: DocumentStore
5+
type: ElasticsearchDocumentStore # consider using MilvusDocumentStore or WeaviateDocumentStore for scaling to large number of documents
6+
params:
7+
host: localhost
8+
- name: Retriever
9+
type: DensePassageRetriever
10+
params:
11+
document_store: DocumentStore # params can reference other components defined in the YAML
12+
top_k: 5
13+
- name: Reader # custom-name for the component; helpful for visualization & debugging
14+
type: FARMReader # Haystack Class name for the component
15+
params:
16+
model_name_or_path: deepset/roberta-base-squad2
17+
- name: TextFileConverter
18+
type: TextConverter
19+
- name: PDFFileConverter
20+
type: PDFToTextConverter
21+
- name: Preprocessor
22+
type: PreProcessor
23+
params:
24+
split_by: word
25+
split_length: 1000
26+
- name: FileTypeClassifier
27+
type: FileTypeClassifier
28+
29+
pipelines:
30+
- name: query # a sample extractive-qa Pipeline
31+
type: Query
32+
nodes:
33+
- name: Retriever
34+
inputs: [Query]
35+
- name: Reader
36+
inputs: [Retriever]
37+
- name: indexing
38+
type: Indexing
39+
nodes:
40+
- name: FileTypeClassifier
41+
inputs: [File]
42+
- name: TextFileConverter
43+
inputs: [FileTypeClassifier.output_1]
44+
- name: PDFFileConverter
45+
inputs: [FileTypeClassifier.output_2]
46+
- name: Preprocessor
47+
inputs: [PDFFileConverter, TextFileConverter]
48+
- name: Retriever
49+
inputs: [Preprocessor]
50+
- name: DocumentStore
51+
inputs: [Retriever]

0 commit comments

Comments (0)