Update quick start.

qingqing01 · qingqing01 · commit eef13ffbb690 · 2016-08-31T07:05:05.000Z
ISSUE=4602353 git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1450 1ad973e4-5ce8-4261-8a94-b56d1f490c56
diff --git a/demo/quick_start/dataprovider_bow.py b/demo/quick_start/dataprovider_bow.py
@@ -75,7 +75,7 @@ def predict_initializer(settings, dictionary, **kwargs):
 
 # Declaring a data provider for prediction. The difference with process
 # is that label is not generated.
-@provider(init_hook=predict_initializer)
+@provider(init_hook=predict_initializer, should_shuffle=False)
 def process_predict(settings, file_name):
     with open(file_name, 'r') as f:
         for line in f:
diff --git a/demo/quick_start/dataprovider_emb.py b/demo/quick_start/dataprovider_emb.py
@@ -43,7 +43,7 @@ def predict_initializer(settings, dictionary, **kwargs):
     ]
 
 
-@provider(init_hook=predict_initializer)
+@provider(init_hook=predict_initializer, should_shuffle=False)
 def process_predict(settings, file_name):
     with open(file_name, 'r') as f:
         for line in f:
diff --git a/demo/quick_start/predict.sh b/demo/quick_start/predict.sh
@@ -14,10 +14,10 @@
 # limitations under the License.
 set -e
 
-#cfg=trainer_config.lr.py
+cfg=trainer_config.lr.py
 #cfg=trainer_config.emb.py
 #cfg=trainer_config.cnn.py
-cfg=trainer_config.lstm.py
+#cfg=trainer_config.lstm.py
 model="output/pass-00003"
 paddle train \
     --config=$cfg \
diff --git a/demo/quick_start/preprocess.py b/demo/quick_start/preprocess.py
@@ -29,7 +29,6 @@
 from subprocess import Popen, PIPE
 from optparse import OptionParser
 import json
-from bs4 import BeautifulSoup
 from multiprocessing import Queue
 from multiprocessing import Pool
 import multiprocessing
@@ -69,16 +68,6 @@ def parse(path):
         yield json.loads(l)
     g.close()
 
-'''
-def clean(review):
-    """
-    Clean input review: remove HTML, convert words to lower cases.
-    """
-    # Remove HTML
-    review_text = BeautifulSoup(review, "html.parser").get_text()
-    return review_text
-'''
-
 
 def tokenize(sentences):
     """
@@ -152,15 +141,14 @@ def save_batch(data_dir, num_tokenize, data_dir_dict):
 def parse_batch(data, num_tokenize):
     """
     parse data by batch
-    parse -> clean ->tokenize ->save
+    parse -> tokenize -> save
     """
     raw_txt = parse(data)
     neg, pos = [], []
     count = 0
     sys.stderr.write("extract raw data\n")
     for l in raw_txt:
         rating = l["overall"]
-        #text = clean(l["reviewText"].lower()) # remove HTML
         text = l["reviewText"].lower()  # # convert words to lower case
         if rating == 5.0 and text:
             pos.append(text)
@@ -223,7 +211,6 @@ def main():
     pool.close()
     pool.join()
 
-    sys.stderr.write("clean data done.\n")
     file(os.path.join(os.path.dirname(data), 'labels.list'),
          'w').write('neg\t0\npos\t1\n')
 
diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/preprocess.sh
@@ -18,11 +18,13 @@
 # 3. distinct train set and test set.
 # 4. build dict
 
+set -e
 
-mkdir data/tmp
+mkdir -p data/tmp
 python preprocess.py -i data/reviews_Electronics_5.json.gz
 # uniq and shuffle
 cd data/tmp
+echo 'uniq and shuffle...'
 cat pos_*|sort|uniq|shuf> pos.shuffed
 cat neg_*|sort|uniq|shuf> neg.shuffed
 
diff --git a/demo/quick_start/requirements.txt b/demo/quick_start/requirements.txt
diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh
@@ -16,7 +16,7 @@ set -e
 
 function get_best_pass() {
   cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
+  sed  -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
   sort | head -n 1
 }
 
diff --git a/doc/demo/quick_start/index_en.md b/doc/demo/quick_start/index_en.md
@@ -59,12 +59,11 @@ To build your text classification system, your code will need to perform five st
 ## Preprocess data into standardized format
 In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product.
 
-`demo/quick_start` provides scripts for downloading data and preprocessing data, as shown below:
+`demo/quick_start` provides scripts for downloading and preprocessing the data, as shown below. Preprocessing takes several minutes (about 3 minutes on our machine).
 
 ```bash
 cd demo/quick_start
 ./data/get_data.sh
-pip install -r requirements.txt
 ./preprocess.sh
 ```
 
@@ -432,6 +431,14 @@ There are several differences between training and inference network configurati
 - batch_size = 1.
 - You need to specify the location of `test_list` in the test data.
 
+The results in `result.txt` are formatted as follows; each line is one sample.
+
+```
+predicted_label_id;probability_of_label_0 probability_of_label_1  # the first sample
+predicted_label_id;probability_of_label_0 probability_of_label_1  # the second sample
+```
+
+
 ```python
 is_predict = get_config_arg('is_predict', bool, False)
 trn = 'data/train.list' if not is_predict else None
diff --git a/doc_cn/demo/quick_start/index.md b/doc_cn/demo/quick_start/index.md
@@ -38,7 +38,6 @@
 ```bash
 cd demo/quick_start
 ./data/get_data.sh
-pip install -r requirements.txt
 ./preprocess.sh
 ```
 
@@ -411,6 +410,13 @@ mv rank-00000 result.txt
 与训练网络配置不同的是：无需label相关的层，指定outputs输出概率层(softmax输出)，
 指定batch_size=1，数据传输无需label数据，预测数据指定test_list的位置。
 
+预测结果以文本的形式保存在`result.txt`中，一行为一个样本，格式如下：
+
+```
+预测ID;ID为0的概率 ID为1的概率
+预测ID;ID为0的概率 ID为1的概率
+```
+
 ```
 is_predict = get_config_arg('is_predict', bool, False)
 trn = 'data/train.list' if not is_predict else None

Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,7 @@ def predict_initializer(settings, dictionary, **kwargs):`
`43`	`43`	`]`
`44`	`44`
`45`	`45`
`46`		`-@provider(init_hook=predict_initializer)`
	`46`	`+@provider(init_hook=predict_initializer, should_shuffle=False)`
`47`	`47`	`def process_predict(settings, file_name):`
`48`	`48`	`with open(file_name, 'r') as f:`
`49`	`49`	`for line in f:`
Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,7 @@ set -e`
`16`	`16`
`17`	`17`	`function get_best_pass() {`
`18`	`18`	`cat $1 \| grep -Pzo 'Test .*\n.*pass-.*' \| \`
`19`		`- sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' \|\`
	`19`	`+ sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' \|\`
`20`	`20`	`sort \| head -n 1`
`21`	`21`	`}`
`22`	`22`