Commit 71b4c53

[Paddle-pipelines] Update pipelines examples & update markdown splitters (#6717)
* Update pipelines examples
* Update splitter
1 parent bc8df6e commit 71b4c53

File tree: 6 files changed, +74 -26 lines changed

pipelines/examples/chatbot/chat_markdown_example.py

Lines changed: 10 additions & 7 deletions

@@ -49,6 +49,7 @@
 parser.add_argument('--title_split', default=False, type=bool, help='the markdown file is split by titles')
 parser.add_argument("--api_key", default=None, type=str, help="The API Key.")
 parser.add_argument("--secret_key", default=None, type=str, help="The secret key.")
+parser.add_argument('--indexing', default=False, type=bool, help='Whether indexing is enabled.')
 args = parser.parse_args()
 # yapf: enable

@@ -97,13 +98,15 @@ def chat_markdown_tutorial():
     text_splitter = CharacterTextSplitter(
         separator="\n", chunk_size=args.chunk_size, chunk_overlap=0, filters=["\n"]
     )
-    indexing_pipeline = Pipeline()
-    indexing_pipeline.add_node(component=markdown_converter, name="MarkdownConverter", inputs=["File"])
-    indexing_pipeline.add_node(component=text_splitter, name="Splitter", inputs=["MarkdownConverter"])
-    indexing_pipeline.add_node(component=retriever, name="Retriever", inputs=["Splitter"])
-    indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["Retriever"])
-    files = glob.glob(args.file_paths + "/**/*.md", recursive=True)
-    indexing_pipeline.run(file_paths=files)
+
+    if args.indexing:
+        indexing_pipeline = Pipeline()
+        indexing_pipeline.add_node(component=markdown_converter, name="MarkdownConverter", inputs=["File"])
+        indexing_pipeline.add_node(component=text_splitter, name="Splitter", inputs=["MarkdownConverter"])
+        indexing_pipeline.add_node(component=retriever, name="Retriever", inputs=["Splitter"])
+        indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["Retriever"])
+        files = glob.glob(args.file_paths + "/**/*.md", recursive=True)
+        indexing_pipeline.run(file_paths=files)

     # Query Markdowns
     ernie_bot = ErnieBot(api_key=args.api_key, secret_key=args.secret_key)
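Both chat examples now gate index construction behind the new --indexing flag, so the document store is built once and later runs can skip straight to querying. One caveat worth keeping in mind when calling it: argparse's type=bool converts any non-empty string to True, so the flag is enabled by passing a value and disabled by omitting it. A minimal sketch of that behaviour (not part of the commit):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--indexing", default=False, type=bool, help="Whether indexing is enabled.")

assert parser.parse_args(["--indexing", "True"]).indexing is True   # build the index on this run
assert parser.parse_args([]).indexing is False                      # query-only run
# Note: "--indexing False" would also parse as True, because bool("False") is True.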

pipelines/examples/chatbot/chat_markdown_multi_recall_example.py

Lines changed: 12 additions & 10 deletions

@@ -64,6 +64,7 @@
 parser.add_argument("--es_chunk_size", default=500, type=int, help="Number of docs in one chunk sent to es")
 parser.add_argument("--es_thread_count", default=32, type=int, help="Size of the threadpool to use for the bulk requests")
 parser.add_argument("--es_queue_size", default=32, type=int, help="Size of the task queue between the main thread (producing chunks to send) and the processing threads.")
+parser.add_argument('--indexing', default=False, type=bool, help='Whether indexing is enabled.')
 args = parser.parse_args()
 # yapf: enable

@@ -120,15 +121,16 @@ def chat_markdown_tutorial():
     text_splitter = CharacterTextSplitter(
         separator="\n", chunk_size=args.data_chunk_size, chunk_overlap=0, filters=["\n"]
     )
-    indexing_pipeline = Pipeline()
-    indexing_pipeline.add_node(component=markdown_converter, name="MarkdownConverter", inputs=["File"])
-    indexing_pipeline.add_node(component=text_splitter, name="Splitter", inputs=["MarkdownConverter"])
-    indexing_pipeline.add_node(component=retriever, name="Retriever", inputs=["Splitter"])
-    indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["Retriever"])
-    files = glob.glob(args.file_paths + "/**/*.md", recursive=True)
-    if len(files) == 0:
-        raise Exception("file should not be empty")
-    indexing_pipeline.run(file_paths=files)
+    if args.indexing:
+        indexing_pipeline = Pipeline()
+        indexing_pipeline.add_node(component=markdown_converter, name="MarkdownConverter", inputs=["File"])
+        indexing_pipeline.add_node(component=text_splitter, name="Splitter", inputs=["MarkdownConverter"])
+        indexing_pipeline.add_node(component=retriever, name="Retriever", inputs=["Splitter"])
+        indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["Retriever"])
+        files = glob.glob(args.file_paths + "/**/*.md", recursive=True)
+        if len(files) == 0:
+            raise Exception("file should not be empty")
+        indexing_pipeline.run(file_paths=files)

     # Query Markdowns
     if args.chatbot in ["ernie_bot"]:

@@ -150,7 +152,7 @@ def chat_markdown_tutorial():
         component=TruncatedConversationHistory(max_length=256), name="TruncateHistory", inputs=["Template"]
     )
     query_pipeline.add_node(component=ernie_bot, name="ErnieBot", inputs=["TruncateHistory"])
-    query = "Aistudio最火的项目是哪个?"
+    query = "理财产品的认购期是多久?"
     start_time = time.time()
     prediction = query_pipeline.run(query=query, params={"DenseRetriever": {"top_k": 10}, "Ranker": {"top_k": 5}})
     end_time = time.time()

(The demo query changes from "Which is the most popular project on AI Studio?" to "How long is the subscription period for wealth-management products?".)
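The multi-recall example gets the same --indexing gate, plus an explicit guard against an empty corpus. For reference, this is the standard-library behaviour the new guard relies on (a self-contained sketch; "docs" is a placeholder path):

import glob

# With recursive=True, "**" matches the base directory and any depth of
# subdirectories, so every .md file under the given path is collected.
files = glob.glob("docs" + "/**/*.md", recursive=True)
if len(files) == 0:
    raise Exception("file should not be empty")  # same guard the commit adds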

pipelines/pipelines/nodes/preprocessor/text_splitter.py

Lines changed: 2 additions & 3 deletions

@@ -391,7 +391,6 @@ def split_text(
         # header_stack: List[Dict[str, Union[int, str]]] = []
         header_stack: List[HeaderType] = []
         initial_metadata: Dict[str, str] = {}
-
         for line in lines:
             stripped_line = line.strip()
             # Check each line against each of the header types (e.g., #, ##)

@@ -495,9 +494,9 @@ def _merge_splits(
         # We now want to combine these smaller pieces into medium size
         # chunks to send to the LLM.
         if chunk_size is None:
-            chunk_size = self.chunk_size
+            chunk_size = self._chunk_size
         if chunk_overlap is None:
-            chunk_overlap = self.chunk_overlap
+            chunk_overlap = self._chunk_overlap
         if separator is None:
             separator = self._separator
         separator_len = self._length_function(separator)
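The _merge_splits change swaps the public-looking self.chunk_size / self.chunk_overlap lookups for the private attributes the splitter actually stores, matching the neighbouring self._separator and self._length_function. A hedged reduction of the bug, assuming the class only sets underscore-prefixed fields in __init__:

class SplitterSketch:
    def __init__(self, chunk_size=500, chunk_overlap=0):
        self._chunk_size = chunk_size        # only the private names exist
        self._chunk_overlap = chunk_overlap

    def merge(self, chunk_size=None, chunk_overlap=None):
        if chunk_size is None:
            chunk_size = self._chunk_size    # before the fix: self.chunk_size -> AttributeError
        if chunk_overlap is None:
            chunk_overlap = self._chunk_overlap
        return chunk_size, chunk_overlap

print(SplitterSketch(chunk_size=300).merge())  # (300, 0)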

pipelines/pipelines/utils/preprocessing.py

Lines changed: 1 addition & 1 deletion

@@ -177,7 +177,7 @@ def convert_files_to_dicts_splitter(
             separator=separator,
             chunk_size=chunk_size,
             headers_to_split_on=headers_to_split_on,
-            return_each_line=False,
+            return_each_line=True,
             filters=filters,
         )
     if language == "chinese":
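Flipping return_each_line to True changes the granularity of the markdown header splitter's output. As commonly implemented for this kind of splitter (an assumption here; the exact semantics live in pipelines/pipelines/nodes/preprocessor/text_splitter.py), False merges all lines under a header into one chunk, while True emits each line as its own chunk that still carries the header metadata, giving the retriever finer-grained passages. Illustrative input/output only, with a simplified dict layout:

markdown = "# Products\nTerm deposit A\nTerm deposit B"

# return_each_line=False: one chunk per header section.
merged = [
    {"content": "Term deposit A\nTerm deposit B", "meta": {"Header 1": "Products"}},
]

# return_each_line=True: one chunk per line, each keeping its header metadata.
per_line = [
    {"content": "Term deposit A", "meta": {"Header 1": "Products"}},
    {"content": "Term deposit B", "meta": {"Header 1": "Products"}},
]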

pipelines/requirements.txt

Lines changed: 2 additions & 1 deletion

@@ -29,4 +29,5 @@ boilerpy3
 events
 sseclient-py==1.7.2
 typing_extensions==4.5
-spacy
+spacy
+tritonclient[all]
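The new tritonclient[all] requirement pulls in NVIDIA's Triton Inference Server client; the [all] extra installs both the HTTP and gRPC transports. A quick import smoke test, assuming pip install -r pipelines/requirements.txt has been run:

import tritonclient.grpc as grpcclient
import tritonclient.http as httpclient

# Both transports expose an InferenceServerClient once the extra is installed.
print(httpclient.InferenceServerClient, grpcclient.InferenceServerClient)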

pipelines/utils/offline_ann.py

Lines changed: 47 additions & 4 deletions

@@ -14,7 +14,11 @@
 
 import argparse
 
-from pipelines.document_stores import ElasticsearchDocumentStore, MilvusDocumentStore
+from pipelines.document_stores import (
+    BaiduElasticsearchDocumentStore,
+    ElasticsearchDocumentStore,
+    MilvusDocumentStore,
+)
 from pipelines.nodes import DensePassageRetriever
 from pipelines.utils import convert_files_to_dicts, fetch_archive_from_http, launch_es
 from pipelines.utils.preprocessing import convert_files_to_dicts_splitter

@@ -30,7 +34,9 @@
 parser = argparse.ArgumentParser()
 parser.add_argument("--index_name", default="baike_cities", type=str, help="The index name of the ANN search engine")
 parser.add_argument("--doc_dir", default="data/baike/", type=str, help="The doc path of the corpus")
-parser.add_argument("--search_engine", choices=["elastic", "milvus"], default="elastic", help="The type of ANN search engine.")
+parser.add_argument('--username', type=str, default="", help='Username of ANN search engine')
+parser.add_argument('--password', type=str, default="", help='Password of ANN search engine')
+parser.add_argument("--search_engine", choices=["elastic", "milvus", 'bes'], default="elastic", help="The type of ANN search engine.")
 parser.add_argument("--host", type=str, default="127.0.0.1", help="host ip of ANN search engine")
 parser.add_argument("--port", type=str, default="9200", help="port of ANN search engine")
 parser.add_argument("--embedding_dim", default=768, type=int, help="The embedding_dim of index")

@@ -51,6 +57,9 @@
 parser.add_argument('--filters', type=list, default=['\n'], help="Filter special symbols")
 parser.add_argument('--language', type=str, default='chinese', help="the language of files")
 parser.add_argument('--pooling_mode', choices=['max_tokens', 'mean_tokens', 'mean_sqrt_len_tokens', 'cls_token'], default='cls_token', help='the type of sentence embedding')
+parser.add_argument("--es_chunk_size", default=500, type=int, help="Number of docs in one chunk sent to es")
+parser.add_argument("--es_thread_count", default=32, type=int, help="Size of the threadpool to use for the bulk requests")
+parser.add_argument("--es_queue_size", default=32, type=int, help="Size of the task queue between the main thread (producing chunks to send) and the processing threads.")
 args = parser.parse_args()
 # yapf: enable

@@ -66,13 +75,30 @@ def offline_ann(index_name, doc_dir):
             index_param={"M": 16, "efConstruction": 50},
             index_type="HNSW",
         )
+    elif args.search_engine == "bes":
+
+        document_store = BaiduElasticsearchDocumentStore(
+            host=args.host,
+            port=args.port,
+            username=args.username,
+            password=args.password,
+            embedding_dim=args.embedding_dim,
+            similarity="dot_prod",
+            vector_type="bpack_vector",
+            search_fields=["content", "meta"],
+            index=args.index_name,
+            chunk_size=args.es_chunk_size,
+            thread_count=args.es_thread_count,
+            queue_size=args.es_queue_size,
+        )
+
     else:
         launch_es()
         document_store = ElasticsearchDocumentStore(
             host=args.host,
             port=args.port,
-            username="",
-            password="",
+            username=args.username,
+            password=args.password,
             embedding_dim=args.embedding_dim,
             index=index_name,
             search_fields=args.search_fields,  # 当使用了多路召回并且搜索字段设置了除content的其他字段,构建索引时其他字段也需要设置,例如:['content', 'name']。

(The inline Chinese comment says: when multi-way recall is used and search fields other than "content" are configured, those extra fields also have to be set when the index is built, e.g. ['content', 'name'].)

@@ -128,6 +154,23 @@ def delete_data(index_name):
             index_param={"M": 16, "efConstruction": 50},
             index_type="HNSW",
         )
+    elif args.search_engine == "bes":
+
+        document_store = BaiduElasticsearchDocumentStore(
+            host=args.host,
+            port=args.port,
+            username=args.username,
+            password=args.password,
+            embedding_dim=args.embedding_dim,
+            similarity="dot_prod",
+            vector_type="bpack_vector",
+            search_fields=["content", "meta"],
+            index=args.index_name,
+            chunk_size=args.es_chunk_size,
+            thread_count=args.es_thread_count,
+            queue_size=args.es_queue_size,
+        )
+
     else:
         document_store = ElasticsearchDocumentStore(
             host=args.host,
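With the new bes backend and credential flags, building or deleting an index on Baidu ElasticSearch is driven entirely from the command line. A hypothetical invocation (host, credentials, index name, and corpus directory are placeholders; the es_* values shown are just the defaults):

import subprocess

subprocess.run(
    [
        "python", "pipelines/utils/offline_ann.py",
        "--search_engine", "bes",
        "--host", "your-bes-host", "--port", "9200",
        "--username", "your-username", "--password", "your-password",
        "--index_name", "markdown_docs",
        "--doc_dir", "data/markdowns/",
        "--es_chunk_size", "500", "--es_thread_count", "32", "--es_queue_size", "32",
    ],
    check=True,
)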
