-import yi_json
-
-g = 100
-def read():
-    queue q;
-    # warmup q
-    for i = 0 : 1000
-        q.push(read())
-    yield q.shuffle_get()
-
-input = paddle.layer.data(...)
-intermediate = paddle.layers.fc(input)
-output = paddle.layer.softmax(intermediate)
-
-model = paddle.model.create(output)
-
-train(model, data_provider=read, cluster="clusterId")
-
-#--------------------------------------------------------------------------------
-
-# 1. package, docker build, docker push
-# 2. kubectl, clusterId Kubernetes job, 10 trainer containers, 5 parameter server containers
-
-#--------------------------------------------------------------------------------
-
-def train():
-    if os.environ["kube_api_server"] == nil:
-        docker_build()
-        docker_push()
-        kube_ctrl_start_job()
-    else:
-        rank = kube_mpi_rank()
-        if rank == 0:
-            master()
-        elif rank >= 15:
-            parameter_server()
-        else:
-            _train()
+# Design Doc: PaddlePaddle API
+
+## Ingredients
+
+As the first step of our design, we list the important concepts in deep
+learning and try to figure out their relationships, as shown below:
+
+```
+Model = {topology, parameters}
+
+Evaluator = {Model*, activations}
+- forward
+- test
+
+GradientMachine = {Model*, gradients}
+- backward
+
+Optimizer = {Model*, Evaluator*, GradientMachine*}
+- train
+- update
+- checkpoint
+```
+
+where the pair of curly braces `{` and `}` indicates *composition*, `*`
+indicates a *reference*, and `-` marks a "class method".
+
+
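+To make the composition (`{}`) and reference (`*`) relationships
+concrete, here is a minimal Python sketch. It is illustrative only; the
+class and method names mirror the diagram above, not any existing or
+proposed PaddlePaddle API:
+
+```python
+class Model:
+    def __init__(self, topology, parameters):
+        self.topology = topology      # composed: owned by the model
+        self.parameters = parameters  # composed: owned by the model
+
+class Evaluator:
+    def __init__(self, model):
+        self.model = model            # reference: shared, not copied
+        self.activations = {}         # per-evaluator layer outputs
+    def forward(self, inputs): ...
+    def test(self, inputs, labels): ...
+
+class GradientMachine:
+    def __init__(self, model):
+        self.model = model            # reference: shared, not copied
+        self.gradients = {}           # per-machine gradient buffers
+    def backward(self, loss): ...
+
+class Optimizer:
+    def __init__(self, model, evaluator, gradient_machine):
+        self.model = model                        # reference
+        self.evaluator = evaluator                # reference
+        self.gradient_machine = gradient_machine  # reference
+    def train(self, data): ...
+    def update(self): ...
+    def checkpoint(self, path): ...
+```
+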
+### Model
+
+We used to think that parameters are part of the topology (or layers).
+But that is not true, because multiple layers could share the same
+parameter matrix. An example is a network that compares two text
+segments in a semantic space:
+
+```
+          semantic
+text A -> projection ---\
+          layer A        \
+                          cosine
+                          similarity -> output
+                          layer
+          semantic       /
+text B -> projection ---/
+          layer B
+```
+
+In this network, the two semantic projection layers (A and B) share
+the same parameter matrix.
+
+For more information about our API that specifies topology and
+parameter sharing, please refer to [TODO: API].
+
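+Until that API is settled, the idea can be illustrated with plain
+Python data structures. This is only a sketch of the
+`Model = {topology, parameters}` composition, not the proposed API:
+both projection layers refer to the same parameter matrix by name, so
+there is a single copy to store and update.
+
+```python
+import numpy as np
+
+# parameters: one projection matrix, owned by the model
+parameters = {"proj.w": np.zeros((256, 128))}
+
+# topology: both projection layers reference the *same* parameter by name,
+# so updating "proj.w" affects layer A and layer B alike
+topology = [
+    {"name": "proj_A", "type": "projection", "input": "text_A", "param": "proj.w"},
+    {"name": "proj_B", "type": "projection", "input": "text_B", "param": "proj.w"},
+    {"name": "cos_sim", "type": "cosine_similarity", "input": ["proj_A", "proj_B"]},
+]
+
+model = {"topology": topology, "parameters": parameters}
+```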
+
+### Evaluator
+
+Suppose that we have a trained ranking model and want to use it in our
+search engine. The search engine's Web server is a concurrent program,
+so that it can serve many HTTP requests simultaneously. It doesn't
+make sense for each of these threads to have its own copy of the
+model, because that would duplicate topologies and parameters.
+However, each thread should be able to record layer outputs, i.e.,
+activations, computed from an input derived from the request. With an
+*Evaluator* that saves activations, we can write the over-simplified
+server program as:
+
+```python
+m = paddle.model.load("trained.model")
+
+def handler(req):
+    e = paddle.evaluator.create(m)  # per-request evaluator; shares m's topology and parameters
+    e.forward(req)
+    return e.activation(layer="output")  # returns activations of layer "output"
+
+http.handle("/", handler)
+```
+
+### GradientMachine
+
+Similar to evaluation, training needs to compute gradients so as to
+update model parameters. Because an [optimizer](#optimizer) might run
+multiple simultaneous threads to update the same model, gradients
+should be kept separate from the model. And because gradients are used
+only in training, not in serving, they should also be separate from
+the Evaluator. Hence the `GradientMachine`.
+
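+In the same pseudo-API style as the Evaluator example above, a training
+thread might use it as follows. The name `paddle.gradient_machine.create`
+is hypothetical; this document does not specify the actual
+GradientMachine API:
+
+```python
+m = paddle.model.load("initial.model")   # shared topology and parameters
+
+# Each training thread gets its own GradientMachine over the shared model.
+g = paddle.gradient_machine.create(m)    # references m; owns its gradient buffers
+g.backward(loss)                         # 'loss' is a placeholder for the cost
+                                         # computed by a forward pass elsewhere
+```
+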
+### Optimizer
+
+None of Model, Evaluator, or GradientMachine implements the training
+loop; hence the Optimizer. We can define a concurrent optimizer that
+runs multiple simultaneous threads to train a model: just let each
+thread have its own GradientMachine object, as sketched below.
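+
+Here is a minimal sketch of that idea using Python threads. The names
+`paddle.gradient_machine.create`, `optimizer.update`, and `data_shards`
+are placeholders; the sketch only shows the threading structure (one
+shared Model, one GradientMachine per thread), not a proposed
+implementation:
+
+```python
+import threading
+
+m = paddle.model.load("initial.model")           # one shared model
+
+def trainer_thread(data_shard):
+    g = paddle.gradient_machine.create(m)        # per-thread gradient buffers
+    for batch in data_shard:
+        g.backward(batch)                        # compute this thread's gradients
+        optimizer.update(m, g)                   # apply them to the shared parameters
+
+threads = [threading.Thread(target=trainer_thread, args=(shard,))
+           for shard in data_shards]
+for t in threads:
+    t.start()
+for t in threads:
+    t.join()
+```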