
Commit 1c498bc

Add Mac system support of pipelines (#2885)

* Add Mac system support of pipelines
* Add cpu requirements
* Add mac es faq and change to a small dataset of dureader
* adjust semantic search example

1 parent 123c4c7 commit 1c498bc

File tree

6 files changed: +49 -11 lines changed

applications/experimental/pipelines/examples/semantic-search/README.md

Lines changed: 12 additions & 5 deletions

@@ -59,7 +59,7 @@ python ./rest_api/setup.py install
 python ./ui/setup.py install
 ```
 ### 3.2 Data description
-The data for the semantic search database comes from the [DuReader-Robust dataset](https://github.com/baidu/DuReader/tree/master/DuReader-Robust) and contains 46,972 passages in total.
+The data for the semantic search database comes from the [DuReader-Robust dataset](https://github.com/baidu/DuReader/tree/master/DuReader-Robust), which contains 46,972 passages in total; the 1,417 passages of its dev set are selected to build the semantic search system.
 
 ### 3.3 Try the semantic search system in one step
 We provide a ready-made code example that builds a semantic search system on the [DuReader-Robust dataset](https://github.com/baidu/DuReader/tree/master/DuReader-Robust); you can quickly try out the system with the following command.

@@ -78,7 +78,7 @@ python examples/semantic-search/semantic_search_example.py --device cpu
 The Web-based semantic search system consists of three main components: 1. an ANN service based on ElasticSearch; 2. a model service built with RestAPI; 3. a WebUI built with Streamlit. Below we set up these three services in turn to form the complete visual semantic search system.
 
 #### 3.4.1 Start the ANN service
-1. Download and unpack [elasticsearch-8.1.2](https://www.elastic.co/cn/downloads/elasticsearch) following the official documentation.
+1. Download and unpack [elasticsearch-8.3.2](https://www.elastic.co/cn/downloads/elasticsearch) following the official documentation.
 2. Start the ES service
 ```bash
 ./bin/elasticsearch

@@ -93,7 +93,7 @@ curl http://localhost:9200/_aliases?pretty=true
 ```
 # Build the ANN index, using the DuReader-Robust dataset as an example
 python utils/offline_ann.py --index_name dureader_robust_query_encoder \
-    --doc_dir data/dureader_robust_processed
+    --doc_dir data/dureader_dev
 ```
 #### 3.4.3 Start the RestAPI model service
 ```bash

@@ -138,12 +138,19 @@ elasticsearch must be run as a non-root user; you can do the following:
 
 ```
 adduser est
-chown est:est -R ${HOME}/elasticsearch-8.1.2/
-cd ${HOME}/elasticsearch-8.1.2/
+chown est:est -R ${HOME}/elasticsearch-8.3.2/
+cd ${HOME}/elasticsearch-8.3.2/
 su est
 ./bin/elasticsearch
 ```
 
+#### Installing elasticsearch on Mac OS fails with `flood stage disk watermark [95%] exceeded on.... all indices on this node will be marked read-only`
+
+By default, elasticsearch switches all indices to read-only once disk usage reaches 95%. Free up some disk space and restart, or set the following in `config/elasticsearch.yml`:
+```
+cluster.routing.allocation.disk.threshold_enabled: false
+```
+
 ## Reference
 [1] Y. Sun et al., "[ERNIE 3.0: Large-scale Knowledge Enhanced Pre-training for Language Understanding and Generation](https://arxiv.org/pdf/2107.02137.pdf)," arXiv:2107.02137 [cs], Jul. 2021, Accessed: Jan. 17, 2022. [Online]. Available: http://arxiv.org/abs/2107.02137
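The flood-stage FAQ above triggers at 95% disk usage. As a quick self-check before restarting ES, the current usage fraction can be read with Python's standard library; a minimal sketch (the 0.95 threshold matches elasticsearch's default flood-stage watermark; the helper names are illustrative, not part of pipelines):

```python
import shutil

def disk_usage_fraction(path="/"):
    """Fraction of the disk holding `path` that is currently used."""
    usage = shutil.disk_usage(path)
    return usage.used / usage.total

def flood_stage_exceeded(path="/", watermark=0.95):
    """True if elasticsearch's default flood-stage watermark would trip."""
    return disk_usage_fraction(path) >= watermark
```

If this returns True, freeing disk space is the safer remedy; disabling `cluster.routing.allocation.disk.threshold_enabled` only suppresses the check.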

applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ def semantic_search_tutorial():
         )
     else:
         doc_dir = "data/dureader_robust_processed"
-        dureader_data = "https://paddlenlp.bj.bcebos.com/applications/dureader_robust_processed.zip"
+        dureader_data = "https://paddlenlp.bj.bcebos.com/applications/dureader_dev.zip"
 
     fetch_archive_from_http(url=dureader_data, output_dir=doc_dir)
     dicts = convert_files_to_dicts(dir_path=doc_dir, split_paragraphs=True)
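`fetch_archive_from_http` and `convert_files_to_dicts` are pipelines helpers. As a rough illustration of what the second step does (a sketch under assumptions, not the library's implementation), splitting downloaded text files into per-paragraph dicts might look like:

```python
from pathlib import Path

def files_to_dicts(dir_path, split_paragraphs=True):
    """Read each .txt file in dir_path and emit one dict per paragraph.

    Illustrative stand-in for pipelines' convert_files_to_dicts; assumes
    UTF-8 text files with blank-line-separated paragraphs.
    """
    dicts = []
    for path in sorted(Path(dir_path).glob("*.txt")):
        text = path.read_text(encoding="utf-8")
        if split_paragraphs:
            parts = [p.strip() for p in text.split("\n\n") if p.strip()]
        else:
            parts = [text]
        for para in parts:
            dicts.append({"content": para, "meta": {"name": path.name}})
    return dicts
```

Each dict carries the passage text plus source-file metadata, which is the shape the downstream document store expects.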

applications/experimental/pipelines/pipelines/nodes/reader/ernie_dureader.py

Lines changed: 1 addition & 1 deletion

@@ -547,7 +547,7 @@ def logits_to_preds(
         start_end_matrix[invalid_indices[0][:], invalid_indices[1][:],
                          invalid_indices[2][:]] = -999
         start_end_matrix = paddle.to_tensor(start_end_matrix,
-                                            place=paddle.CUDAPlace(0))
+                                            place=self.devices[0])
 
         # Sort the candidate answers by their score. Sorting happens on the flattened matrix.
         # flat_sorted_indices.shape: (batch_size, max_seq_len^2, 1)
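The one-line change above stops hardcoding `paddle.CUDAPlace(0)` and instead places the tensor on whichever device the reader was initialized with, which is what lets the reader run on CPU-only Macs. A minimal sketch of that device-selection idea, using plain strings and a hypothetical `resolve_devices` helper rather than paddle's API:

```python
def resolve_devices(use_gpu, num_gpus=1):
    """Return the list of devices a model should run on.

    Hypothetical helper mirroring the idea behind self.devices: decide the
    device list once at init time, then place every tensor on devices[0]
    instead of hardcoding gpu:0.
    """
    if use_gpu and num_gpus > 0:
        return [f"gpu:{i}" for i in range(num_gpus)]
    return ["cpu"]
```

With this shape, `place=devices[0]` works unchanged whether the process has GPUs or not, which the hardcoded `CUDAPlace(0)` did not.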
applications/experimental/pipelines/requirements-cpu.txt

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+paddlepaddle
+paddlenlp
+paddleocr
+requests
+pydantic
+mmh3
+more_itertools
+elasticsearch>=7.7,<=7.10
+sqlalchemy>=1.4.2,<2
+sqlalchemy_utils
+langdetect
+python-docx
+nltk
+pdfplumber
+faiss-cpu>=1.7.2
+opencv-python
+opencv-contrib-python-headless
+python-multipart
+st-annotated-text
+streamlit==1.9.0
+fastapi
+uvicorn
+markdown
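Several of the pins above carry version specifiers (e.g. `elasticsearch>=7.7,<=7.10`). A small sketch of splitting such a line into a package name and its constraints, assuming only the simple `name` / `name<op>version[,...]` forms used in this file (real tooling should use `packaging.requirements` instead):

```python
import re

def parse_requirement(line):
    """Split 'name>=1.0,<2' into ('name', ['>=1.0', '<2']).

    Simplified sketch: handles only the plain specifier forms seen in
    requirements-cpu.txt, not extras, markers, or URLs.
    """
    m = re.match(r"^([A-Za-z0-9_.\-]+)(.*)$", line.strip())
    name, rest = m.group(1), m.group(2)
    specs = [s for s in rest.split(",") if s] if rest else []
    return name, specs
```

This is the kind of split pip performs before checking each constraint against installed versions.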

applications/experimental/pipelines/setup.py

Lines changed: 9 additions & 2 deletions

@@ -15,11 +15,18 @@
 import setuptools
 import sys
 import pipelines
+import platform
 
 long_description = "PIPELINES: An End to End Natural Language Proceessing Development Kit Based on ERNIE"
 
-with open("requirements.txt") as fin:
-    REQUIRED_PACKAGES = fin.read()
+if platform.system().lower() == 'windows':
+    pass
+elif platform.system().lower() == "darwin":
+    with open("requirements-cpu.txt") as fin:
+        REQUIRED_PACKAGES = fin.read()
+elif platform.system().lower() == 'linux':
+    with open("requirements.txt") as fin:
+        REQUIRED_PACKAGES = fin.read()
 
 setuptools.setup(name="pipelines",
                  version=pipelines.__version__,
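The platform dispatch above can be factored into a small, testable helper; this is a sketch with a hypothetical `requirements_file_for` function, not the committed code. Note that in the diff, Windows leaves `REQUIRED_PACKAGES` undefined, which would raise a `NameError` if `setup()` later references it; returning an explicit `None` avoids that failure mode.

```python
def requirements_file_for(system):
    """Map platform.system() output to the requirements file the diff selects.

    Hypothetical helper: encodes the same branches as the setup.py change,
    with an explicit None for Windows and unknown platforms.
    """
    system = system.lower()
    if system == "windows":
        return None  # the diff installs no pinned requirements on Windows
    if system == "darwin":
        return "requirements-cpu.txt"  # macOS gets the CPU-only stack
    if system == "linux":
        return "requirements.txt"
    return None  # unknown platforms: no requirements file
```

Callers can then guard on `None` instead of relying on a variable that may never have been assigned.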

applications/experimental/pipelines/ui/webapp_semantic_search.py

Lines changed: 3 additions & 2 deletions

@@ -28,8 +28,9 @@
 
 # Adjust to a question that you would like users to see in the search bar when they load the UI:
 DEFAULT_QUESTION_AT_STARTUP = os.getenv("DEFAULT_QUESTION_AT_STARTUP",
-                                        "燃气热水器哪个牌子好?")
-DEFAULT_ANSWER_AT_STARTUP = os.getenv("DEFAULT_ANSWER_AT_STARTUP", "北京")
+                                        "衡量酒水的价格的因素有哪些?")
+DEFAULT_ANSWER_AT_STARTUP = os.getenv("DEFAULT_ANSWER_AT_STARTUP",
+                                      "酒水的血统,存储的时间等")
 
 # Sliders
 DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER",
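`DEFAULT_DOCS_FROM_RETRIEVER` above wraps `os.getenv` in a bare `int()`, which raises `ValueError` at import time if the environment variable holds a non-numeric value. A hedged sketch of a more forgiving variant (hypothetical `int_env` helper, not part of the UI code):

```python
import os

def int_env(name, default):
    """Read an integer from the environment, falling back on missing or bad values."""
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        return default  # malformed value: keep the built-in default
```

This keeps the UI bootable even when a deployment sets the variable to something non-numeric.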
