"""
    Chain(layers...)
-
Chain multiple layers / functions together, so that they are called in sequence
on a given input.
-
`Chain` also supports indexing and slicing, e.g. `m[2]` or `m[1:end-1]`.
`m[1:3](x)` will calculate the output of the first three layers.
-
# Examples
```jldoctest
julia> m = Chain(x -> x^2, x -> x+1);
-
julia> m(5) == 26
true
-
julia> m = Chain(Dense(10, 5), Dense(5, 2));
-
julia> x = rand(10);
-
julia> m(x) == m[2](m[1](x))
true
```
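The indexing and slicing behaviour mentioned in the docstring is not exercised by the doctest above; a minimal sketch of what it implies, not part of the diff and assuming the `Chain` indexing methods described there:

```julia
using Flux

m = Chain(Dense(10, 5, relu), Dense(5, 3), softmax)
x = rand(Float32, 10)

# slicing returns another Chain, so it can be called like any layer
m[1:2](x) == m[2](m[1](x))     # output of the first two layers
m[end](m[1:end-1](x)) == m(x)  # last layer applied to the truncated chain
```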
@@ -52,7 +45,6 @@
# only slightly changed to better handle interaction with Zygote @dsweber2
"""
    activations(c::Chain, input)
-
Calculate the forward results of each layer in Chain `c` with `input` as model input.
"""
function activations(c::Chain, input)
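For context, a rough sketch of how `activations` is typically called; this is illustrative only (not part of the diff) and assumes the behaviour described in the docstring, i.e. one output per layer:

```julia
using Flux

m = Chain(Dense(3, 4, relu), Dense(4, 2))
x = rand(Float32, 3)

acts = Flux.activations(m, x)  # collection of per-layer outputs
length(acts) == 2              # one entry per layer
acts[end] == m(x)              # the last entry is the model output
```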
@@ -69,81 +61,75 @@ extraChain(::Tuple{}, x) = ()


"""
-    Dense(in, out, σ = identity; bias = true, init = glorot_uniform)
+    Dense(in, out, σ=identity; bias=true, init=glorot_uniform)
    Dense(W::AbstractMatrix, [bias, σ])
-
Create a traditional `Dense` layer, whose forward pass is given by:
-
    y = σ.(W * x .+ bias)
-
The input `x` should be a vector of length `in`, or batch of vectors represented
as an `in × N` matrix, or any array with `size(x,1) == in`.
The output `y` will be a vector of length `out`, or a batch with
`size(y) == (out, size(x)[2:end]...)`
-
-Keyword `bias = false` will switch off trainable bias for the layer.
+Keyword `bias=false` will switch off trainable bias for the layer.
The initialisation of the weight matrix is `W = init(out, in)`, calling the function
given to keyword `init`, with default [`glorot_uniform`](@doc Flux.glorot_uniform).
The weight matrix and/or the bias vector (of length `out`) may also be provided explicitly.
-
# Examples
```jldoctest
julia> d = Dense(5, 2)
Dense(5, 2)
-
julia> d(rand(Float32, 5, 64)) |> size
(2, 64)
-
julia> d(rand(Float32, 5, 1, 1, 64)) |> size  # treated as three batch dimensions
(2, 1, 1, 64)
-
julia> d1 = Dense(ones(2, 5), false, tanh)  # using provided weight matrix
Dense(5, 2, tanh; bias=false)
-
julia> d1(ones(5))
-2-element Vector{Float64}:
+2-element Array{Float64,1}:
 0.9999092042625951
 0.9999092042625951
-
julia> Flux.params(d1)  # no trainable bias
Params([[1.0 1.0 … 1.0 1.0; 1.0 1.0 … 1.0 1.0]])
```
"""
-struct Dense{F,S<:AbstractArray,T}
-  weight::S
-  bias::T
+struct Dense{F, M<:AbstractMatrix, B}
+  weight::M
+  bias::B
  σ::F
+  function Dense(W::M, bias = true, σ::F = identity) where {M<:AbstractMatrix, F}
+    b = create_bias(W, bias, size(W,1))
+    new{F,M,typeof(b)}(W, b, σ)
+  end
end

-Dense(W, b) = Dense(W, b, identity)
-
-Dense(W::AbstractArray, b::Bool = true, σ = identity) =
-  Dense(W, create_bias(W, b, size(W,1)), σ)
+function Dense(in::Integer, out::Integer, σ = identity;
+               initW = nothing, initb = nothing,
+               init = glorot_uniform, bias=true)

-function Dense(in::Integer, out::Integer, σ = identity; initW = nothing,
-               init = glorot_uniform, initb = nothing, bias::Bool = true)
-  if initW !== nothing
-    Base.depwarn("initW is deprecated, please use the `init` keyword instead", :Dense)
-    init = initW
+  W = if initW !== nothing
+    Base.depwarn("keyword initW is deprecated, please use init (which similarly accepts a function like randn)", :Dense)
+    initW(out, in)
+  else
+    init(out, in)
  end

-  if initb !== nothing
-    Base.depwarn("initb is deprecated, please use the array based constructors instead", :Dense)
-    initb = initb
+  b = if bias === true && initb !== nothing
+    Base.depwarn("keyword initb is deprecated, please simply supply the bias vector, bias=initb(out)", :Dense)
+    initb(out)
  else
-    initb = zeros
+    bias
  end
-  Dense(init(out, in), bias ? initb(out) : Zeros(), σ)
+
+  return Dense(W, b, σ)
end
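A short sketch of what the reworked constructors accept, based on the docstring and the code above (illustrative only, not part of the diff):

```julia
using Flux

d = Dense(4, 3, relu)                     # weight is 3×4 from `init`, trainable bias by default
d_nobias = Dense(4, 3, relu; bias=false)  # no bias term at all

# explicit weight matrix, and optionally an explicit bias vector
d_explicit = Dense(randn(Float32, 3, 4), zeros(Float32, 3), tanh)

d(rand(Float32, 4)) |> size               # (3,)
```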

@functor Dense

function (a::Dense)(x::AbstractVecOrMat)
  W, b, σ = a.weight, a.bias, a.σ
-  σ.(W * x .+ b)
+  return σ.(W*x .+ b)
end

-(a::Dense)(x) =
+(a::Dense)(x::AbstractArray) =
  reshape(a(reshape(x, size(x,1), :)), :, size(x)[2:end]...)

function Base.show(io::IO, l::Dense)
@@ -156,14 +142,10 @@ end
"""
    Diagonal(α, β)
    Diagonal(size::Integer...)
-
Create an element-wise linear layer, which performs
-
    y = α .* x .+ β
-
The learnable arrays are initialised `α = ones(Float32, size)` and
`β = zeros(Float32, size)`.
-
Used by [`LayerNorm`](@ref).
"""
struct Diagonal{T}
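For reference, a minimal sketch of the behaviour the `Diagonal` docstring describes; the field names `α` and `β` are taken from the docstring and are an assumption here, since the struct body is not shown in the diff:

```julia
using Flux

d = Flux.Diagonal(3)      # α = ones(Float32, 3), β = zeros(Float32, 3)
x = Float32[1, 2, 3]

d(x) == d.α .* x .+ d.β   # element-wise scale and shift
```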
@@ -197,11 +179,9 @@

"""
    Maxout(over)
-
The [Maxout](https://arxiv.org/abs/1302.4389) layer has a number of
internal layers which all receive the same input. It returns the elementwise
maximum of the internal layers' outputs.
-
Maxout over linear dense layers satisfies the universal approximation theorem.
"""
struct Maxout{FS<:Tuple}
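A minimal sketch of the `Maxout(over)` behaviour described above (not part of the diff); the field name `over` is an assumption taken from the constructor signature, since the struct body is not shown here:

```julia
using Flux

# `over` is a tuple of layers that all see the same input
m = Maxout((Dense(3, 4), Dense(3, 4)))
x = rand(Float32, 3)

m(x) == max.(m.over[1](x), m.over[2](x))  # element-wise maximum of the branches
```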
@@ -210,20 +190,15 @@ end

"""
    Maxout(f, n_alts)
-
Construct a Maxout layer over `n_alts` instances of the layer given by `f`.
The function takes no arguments and should return some callable layer.
Conventionally, this is a linear dense layer.
-
# Examples
-
This constructs a `Maxout` layer over 4 internal dense linear layers, each
identical in structure (784 inputs, 128 outputs):
```jldoctest
julia> insize = 784;
-
julia> outsize = 128;
-
julia> Maxout(()->Dense(insize, outsize), 4);
```
"""
@@ -240,25 +215,19 @@ end

"""
    SkipConnection(layer, connection)
-
Create a skip connection which consists of a layer or `Chain` of consecutive
layers and a shortcut connection linking the block's input to the output
through a user-supplied 2-argument callable. The first argument to the callable
will be propagated through the given `layer` while the second is the unchanged,
"skipped" input.
-
The simplest "ResNet"-type connection is just `SkipConnection(layer, +)`.
Here is a more complicated example:
```jldoctest
julia> m = Conv((3,3), 4 => 7, pad=(1,1));
-
julia> x = ones(Float32, 5, 5, 4, 10);
-
julia> size(m(x)) == (5, 5, 7, 10)
true
-
julia> sm = SkipConnection(m, (mx, x) -> cat(mx, x, dims=3));
-
julia> size(sm(x)) == (5, 5, 11, 10)
true
```
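The "simplest ResNet-type connection" mentioned in the docstring, spelled out as a small sketch (illustrative, not part of the diff):

```julia
using Flux

layer = Dense(5, 5, relu)
res = SkipConnection(layer, +)  # output is layer(x) + x

x = rand(Float32, 5)
res(x) == layer(x) + x
```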
@@ -281,45 +250,32 @@ end
"""
    Bilinear(in1, in2, out, σ=identity; bias=true, init=glorot_uniform)
    Bilinear(W::AbstractArray, [bias, σ])
-
Creates a Bilinear layer, which operates on two inputs at the same time.
Its output, given vectors `x` & `y`, is another vector `z` with,
for all `i ∈ 1:out`:
-
    z[i] = σ(x' * W[i,:,:] * y + bias[i])
-
If `x` and `y` are matrices, then each column of the output `z = B(x, y)` is of this form,
with `B` a Bilinear layer.
-
If `y` is not given, it is taken to be equal to `x`, i.e. `B(x) == B(x, x)`
The two inputs may also be provided as a tuple, `B((x, y)) == B(x, y)`,
which is accepted as the input to a `Chain`.
-
The initialisation works as for [`Dense`](@ref) layer, with `W = init(out, in1, in2)`.
By default the bias vector is `zeros(Float32, out)`, option `bias=false` will switch off
trainable bias. Either of these may be provided explicitly.
-
# Examples
-
```jldoctest
julia> x, y = randn(Float32, 5, 32), randn(Float32, 5, 32);
-
julia> B = Flux.Bilinear(5, 5, 7);
-
julia> B(x) |> size  # interactions based on one input
(7, 32)
-
julia> B(x,y) == B((x,y))  # two inputs, may be given as a tuple
true
-
julia> sc = SkipConnection(
         Chain(Dense(5, 20, tanh), Dense(20, 9, tanh)),
         Flux.Bilinear(9, 5, 3, bias=false),
       );  # used as the recombinator, with skip as the second input
-
julia> sc(x) |> size
(3, 32)
-
julia> Flux.Bilinear(rand(4,8,16), false, tanh)  # first dim of weight is the output
Bilinear(8, 16, 4, tanh, bias=false)
```
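To make the `z[i]` formula concrete, here is a small check using the explicit-weight constructor from the docstring (illustrative, not part of the diff):

```julia
using Flux

W, b = randn(Float32, 2, 3, 4), zeros(Float32, 2)
B = Flux.Bilinear(W, b, tanh)            # first weight dimension is the output size

x, y = randn(Float32, 3), randn(Float32, 4)
z = B(x, y)

z[1] ≈ tanh(x' * W[1, :, :] * y + b[1])  # matches z[i] = σ(x' * W[i,:,:] * y + bias[i])
```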
@@ -373,26 +329,19 @@ end

"""
    Parallel(connection, layers...)
-
Create a `Parallel` layer that passes an input array to each path in
`layers`, reducing the output with `connection`.
-
Called with one input `x`, this is equivalent to `reduce(connection, [l(x) for l in layers])`.
If called with multiple inputs, they are `zip`ped with the layers, thus `Parallel(+, f, g)(x, y) = f(x) + g(y)`.
-
# Examples
-
```jldoctest
julia> model = Chain(Dense(3, 5),
                     Parallel(vcat, Dense(5, 4), Chain(Dense(5, 7), Dense(7, 4))),
                     Dense(8, 17));
-
julia> size(model(rand(3)))
(17,)
-
julia> model = Parallel(+, Dense(10, 2), Dense(5, 2))
Parallel(+, Dense(10, 2), Dense(5, 2))
-
julia> size(model(rand(10), rand(5)))
(2,)
```
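A small sketch of the `reduce(connection, ...)` semantics stated above (illustrative, not part of the diff):

```julia
using Flux

f, g = Dense(4, 2), Dense(4, 3)
p = Parallel(vcat, f, g)
x = rand(Float32, 4)

p(x) == vcat(f(x), g(x))  # one input is broadcast to every branch

# with several inputs, they are zipped with the branches
q = Parallel(+, Dense(4, 2), Dense(6, 2))
q(rand(Float32, 4), rand(Float32, 6)) |> size  # (2,)
```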