Skip to content

Commit eef13ff

Browse files
committed
Update quick start.
ISSUE=4602353 git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1450 1ad973e4-5ce8-4261-8a94-b56d1f490c56
1 parent d8f30da commit eef13ff

File tree

9 files changed

+25
-24
lines changed

9 files changed

+25
-24
lines changed

demo/quick_start/dataprovider_bow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def predict_initializer(settings, dictionary, **kwargs):
7575

7676
# Declaring a data provider for prediction. The difference with process
7777
# is that label is not generated.
78-
@provider(init_hook=predict_initializer)
78+
@provider(init_hook=predict_initializer, should_shuffle=False)
7979
def process_predict(settings, file_name):
8080
with open(file_name, 'r') as f:
8181
for line in f:

demo/quick_start/dataprovider_emb.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def predict_initializer(settings, dictionary, **kwargs):
4343
]
4444

4545

46-
@provider(init_hook=predict_initializer)
46+
@provider(init_hook=predict_initializer, should_shuffle=False)
4747
def process_predict(settings, file_name):
4848
with open(file_name, 'r') as f:
4949
for line in f:

demo/quick_start/predict.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@
1414
# limitations under the License.
1515
set -e
1616

17-
#cfg=trainer_config.lr.py
17+
cfg=trainer_config.lr.py
1818
#cfg=trainer_config.emb.py
1919
#cfg=trainer_config.cnn.py
20-
cfg=trainer_config.lstm.py
20+
#cfg=trainer_config.lstm.py
2121
model="output/pass-00003"
2222
paddle train \
2323
--config=$cfg \

demo/quick_start/preprocess.py

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
from subprocess import Popen, PIPE
3030
from optparse import OptionParser
3131
import json
32-
from bs4 import BeautifulSoup
3332
from multiprocessing import Queue
3433
from multiprocessing import Pool
3534
import multiprocessing
@@ -69,16 +68,6 @@ def parse(path):
6968
yield json.loads(l)
7069
g.close()
7170

72-
'''
73-
def clean(review):
74-
"""
75-
Clean input review: remove HTML, convert words to lower cases.
76-
"""
77-
# Remove HTML
78-
review_text = BeautifulSoup(review, "html.parser").get_text()
79-
return review_text
80-
'''
81-
8271

8372
def tokenize(sentences):
8473
"""
@@ -152,15 +141,14 @@ def save_batch(data_dir, num_tokenize, data_dir_dict):
152141
def parse_batch(data, num_tokenize):
153142
"""
154143
parse data by batch
155-
parse -> clean ->tokenize ->save
144+
parse -> tokenize -> save
156145
"""
157146
raw_txt = parse(data)
158147
neg, pos = [], []
159148
count = 0
160149
sys.stderr.write("extract raw data\n")
161150
for l in raw_txt:
162151
rating = l["overall"]
163-
#text = clean(l["reviewText"].lower()) # remove HTML
164152
text = l["reviewText"].lower() # # convert words to lower case
165153
if rating == 5.0 and text:
166154
pos.append(text)
@@ -223,7 +211,6 @@ def main():
223211
pool.close()
224212
pool.join()
225213

226-
sys.stderr.write("clean data done.\n")
227214
file(os.path.join(os.path.dirname(data), 'labels.list'),
228215
'w').write('neg\t0\npos\t1\n')
229216

demo/quick_start/preprocess.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@
1818
# 3. distinct train set and test set.
1919
# 4. build dict
2020

21+
set -e
2122

22-
mkdir data/tmp
23+
mkdir -p data/tmp
2324
python preprocess.py -i data/reviews_Electronics_5.json.gz
2425
# uniq and shuffle
2526
cd data/tmp
27+
echo 'uniq and shuffle...'
2628
cat pos_*|sort|uniq|shuf> pos.shuffed
2729
cat neg_*|sort|uniq|shuf> neg.shuffed
2830

demo/quick_start/requirements.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

demo/sentiment/test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ set -e
1616

1717
function get_best_pass() {
1818
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
19-
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
19+
sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
2020
sort | head -n 1
2121
}
2222

doc/demo/quick_start/index_en.md

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,11 @@ To build your text classification system, your code will need to perform five st
5959
## Preprocess data into standardized format
6060
In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product.
6161

62-
`demo/quick_start` provides scripts for downloading data and preprocessing data, as shown below:
62+
`demo/quick_start` provides scripts for downloading and preprocessing the data, as shown below. Preprocessing takes several minutes (about 3 minutes on our machine).
6363

6464
```bash
6565
cd demo/quick_start
6666
./data/get_data.sh
67-
pip install -r requirements.txt
6867
./preprocess.sh
6968
```
7069

@@ -432,6 +431,14 @@ There are several differences between training and inference network configurati
432431
- batch_size = 1.
433432
- You need to specify the location of `test_list` in the test data.
434433

434+
The results in `result.txt` are as follows; each line is one sample.
435+
436+
```
437+
predicted_label_id;probability_of_label_0 probability_of_label_1 # the first sample
438+
predicted_label_id;probability_of_label_0 probability_of_label_1 # the second sample
439+
```
440+
441+
435442
```python
436443
is_predict = get_config_arg('is_predict', bool, False)
437444
trn = 'data/train.list' if not is_predict else None

doc_cn/demo/quick_start/index.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
```bash
3939
cd demo/quick_start
4040
./data/get_data.sh
41-
pip install -r requirements.txt
4241
./preprocess.sh
4342
```
4443

@@ -411,6 +410,13 @@ mv rank-00000 result.txt
411410
与训练网络配置不同的是:无需label相关的层,指定outputs输出概率层(softmax输出),
412411
指定batch_size=1,数据传输无需label数据,预测数据指定test_list的位置。
413412

413+
预测结果以文本的形式保存在`result.txt`中,一行为一个样本,格式如下:
414+
415+
```
416+
预测ID;ID为0的概率 ID为1的概率
417+
预测ID;ID为0的概率 ID为1的概率
418+
```
419+
414420
```
415421
is_predict = get_config_arg('is_predict', bool, False)
416422
trn = 'data/train.list' if not is_predict else None

0 commit comments

Comments
 (0)