//  Source: wormhole/learn/difacto/config.proto (dmlc/wormhole, master branch)
//  @file   config.proto
//  @brief  configuration for factorization machine

package dmlc.difacto;

// Configuration for the factorization machine learner in package
// dmlc.difacto. Covers input/output locations, the objective and optimizer
// settings for the linear term :math:`w` and the embedding :math:`V`, and
// advanced data / learning / system-performance knobs.
message Config {

  // --  basic settings - input & output  --

  /// The training data. Can be either a directory or a wildcard filename.
  optional string train_data = 1;

  /// The validation or test data. Can be either a directory or a wildcard
  /// filename.
  optional string val_data = 2;

  /// The data format. Supports libsvm, crb, criteo, adfea, ...
  /// Defaults to "libsvm".
  optional string data_format = 4 [default = "libsvm"];

  /// The model output filename.
  optional string model_out = 5;

  /// The model input filename.
  optional string model_in = 7;

  /// The filename for the prediction output. If specified, then run
  /// prediction; otherwise run training.
  optional string predict_out = 9;


  // --  basic settings - objective and optimization  --

  // - w -

  /// The l1 regularizer for :math:`w`: :math:`\lambda_1 |w|_1`. 1 by default.
  optional float lambda_l1 = 12 [default = 1];

  /// The l2 regularizer for :math:`w`: :math:`\lambda_2 \|w\|_2^2`.
  /// 0 by default.
  optional float lambda_l2 = 13 [default = 0];

  /// The learning rate :math:`\eta` (or :math:`\alpha`) for :math:`w`.
  /// .01 by default.
  optional float lr_eta = 14 [default = .01];

  // - V -

  /// Settings for the embedding :math:`V`.
  message Embedding {
    /// -- model --

    /// The embedding dimension :math:`k`.
    optional int32 dim = 1;

    /// Features with occurrence < threshold have no embedding (:math:`k=0`).
    optional int32 threshold = 2;

    /// The l2 regularizer for :math:`V`: :math:`\lambda_2 \|V_i\|_2^2`.
    optional float lambda_l2 = 3;

    /// -- learning --

    /// The learning rate :math:`\eta` for :math:`V`. If not specified, then
    /// the learning rate of :math:`w` is shared.
    /// NOTE(review): this field also declares its own default of .01, so the
    /// sharing presumably relies on a presence check in the consuming code --
    /// confirm against the caller.
    optional float lr_eta = 4 [default = .01];

    /// V is initialized with uniformly random weights in
    ///   [-init_scale, +init_scale]. init_scale is .01 by default.
    optional float init_scale = 6 [default = .01];

    /// -- advanced --

    /// The learning rate :math:`\beta` for :math:`V`. 1 by default.
    optional float lr_beta = 5 [default = 1];

    /// Apply dropout on the gradient of :math:`V`. Disabled by default (0).
    optional float dropout = 7 [default = 0];

    /// Project the gradient of :math:`V` into :math:`[-c, c]`. Disabled by
    /// default (0).
    optional float grad_clipping = 8 [default = 0];

    /// Normalize the l2-norm of the gradient of :math:`V`. Disabled by
    /// default (0).
    optional float grad_normalization = 9 [default = 0];
  }

  /// The embedding :math:`V`.
  /// NOTE(review): the field is repeated, so several Embedding entries may be
  /// given; how multiple entries are combined is not visible here -- confirm
  /// against the consuming code.
  repeated Embedding embedding = 15;

  /// - learning -

  /// The minibatch size. The smaller it is, the faster the convergence, but
  /// the slower the system performance. 1000 by default.
  optional int32 minibatch = 22 [default = 1000];

  /// The maximal number of data passes. 10 by default.
  optional int32 max_data_pass= 23 [default = 10];

  /// Stop earlier based on validation. Disabled by default.
  optional bool early_stop = 24 [default = false];


  /// -- advanced --

  /// - data -

  /// Save the model every k data passes. The default is -1, which only saves
  /// the model of the last iteration.
  optional int32 save_iter = 90 [default = -1];

  /// Load the model from the k-th iteration. The default is -1, which loads
  /// the model of the last iteration.
  optional int32 load_iter = 91 [default = -1];

  /// Give a worker a piece of data only if the worker can access it. Often
  /// used when the data has already been dispatched to the workers' local
  /// filesystems. Disabled by default.
  optional bool local_data = 101 [default = false];

  /// Virtually partition a file into n parts for better load balance.
  /// The default is 10.
  optional int32 num_parts_per_file = 102 [default = 10];

  /// Randomly shuffle data for minibatch SGD. A minibatch is randomly picked
  /// from rand_shuffle * minibatch examples. The default is 10.
  optional int32 rand_shuffle = 103 [default = 10];

  /// Down sampling of negative examples in the training data. The default
  /// 1.0 means no down sampling.
  optional float neg_sampling = 104 [default = 1.0];

  /// If true, then output a probability prediction; otherwise output
  /// :math:`\langle  x, y \rangle`. True by default.
  /// NOTE(review): the inner product is presumably the raw model score --
  /// confirm the intended symbols.
  optional bool prob_predict = 105 [default = true];


  /// - learning -

  /// Print the progress every n seconds during training. 1 second by default.
  optional float print_sec = 111 [default = 1];

  /// The learning rate :math:`\beta`. 1 by default.
  optional float lr_beta = 112 [default = 1];

  /// The minimal objective decrease used by early stop. .00001 by default.
  optional float min_objv_decr = 119 [default = .00001];

  /// The maximal allowed objective.
  /// NOTE(review): no default is declared, and what happens once the
  /// objective exceeds this value is not visible here -- confirm.
  optional float max_objv = 118;

  /// Whether or not to use the constraint :math:`V_i = 0` if :math:`w_i = 0`.
  /// Enabled by default.
  optional bool l1_shrk = 114 [default = true];

  /// (disabled declaration, kept for reference)
  /// optional bool grad_normalization = 115 [default = false];

  /// - system performance -

  /// The number of threads used within a worker and a server. 2 by default.
  optional int32 num_threads = 121 [default = 2];

  /// The maximal number of minibatches being processed concurrently for sgd,
  /// and the maximal number of concurrent blocks for block CD. 2 by default.
  optional int32 max_concurrency = 122 [default = 2];

  /// Cache the key list on both the sender and the receiver to reduce the
  /// communication cost. It may increase the memory usage. Enabled by default.
  optional bool key_cache = 123 [default = true];

  /// Compress the messages to reduce the communication cost. It may increase
  /// the computation cost. Enabled by default.
  optional bool msg_compression = 124 [default = true];

  /// Convert floating-point values into fixed-point integers with n bytes.
  /// n can be 1, 2 or 3. 0 (the default) means no compression.
  optional int32 fixed_bytes = 125 [default = 0];
}