## 7.5. Batch Normalization

### 7.5.1. Core Concepts

Batch normalization: when training deep networks, standardize the inputs to each layer to accelerate convergence and stabilize training

Operation: standardize a minibatch by its mean and variance, then apply learnable scale ($\gamma$) and shift ($\beta$) parameters

Formula: $\mathrm{BN}(\mathbf{x}) = \boldsymbol{\gamma} \odot \frac{\mathbf{x} - \hat{\boldsymbol{\mu}}_\mathcal{B}}{\hat{\boldsymbol{\sigma}}_\mathcal{B}} + \boldsymbol{\beta}$, where $\hat{\boldsymbol{\mu}}_\mathcal{B}$ is the minibatch mean and $\hat{\boldsymbol{\sigma}}_\mathcal{B}$ is the minibatch standard deviation (with a small $\epsilon$ added to avoid division by zero)

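A minimal sketch of this formula in training mode, for inputs of shape `(batch, features)` (the function name and tensor sizes are illustrative, not from the original notes):

```py
import torch

def batch_norm_train(X, gamma, beta, eps=1e-5):
    mu = X.mean(dim=0)                        # minibatch mean, per feature
    var = X.var(dim=0, unbiased=False)        # minibatch variance, per feature
    X_hat = (X - mu) / torch.sqrt(var + eps)  # standardize
    return gamma * X_hat + beta               # learnable scale and shift

X = torch.randn(8, 4)
Y = batch_norm_train(X, gamma=torch.ones(4), beta=torch.zeros(4))
```
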
### 7.5.2. Key Properties

Training vs. prediction mode (sketched in code after this list):

- Training: use the mean and variance of the current minibatch

- Prediction: use the moving-average mean and variance accumulated during training

Regularization effect: the noise introduced by minibatch statistics can reduce overfitting

Batch size: needs to be sufficiently large (typically 50 to 100), otherwise batch normalization works poorly

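A sketch of the train/predict distinction, assuming a conventional exponential moving average controlled by a `momentum` factor (names and the momentum value are illustrative):

```py
import torch

def batch_norm(X, gamma, beta, moving_mean, moving_var,
               eps=1e-5, momentum=0.9, training=True):
    if not training:
        # Prediction: use the statistics accumulated during training
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        # Training: use the statistics of the current minibatch
        mean = X.mean(dim=0)
        var = X.var(dim=0, unbiased=False)
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Update the moving averages for later use at prediction time
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    return gamma * X_hat + beta, moving_mean, moving_var
```
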
### 7.5.3. Implementation Details

Fully connected layers: compute the mean and variance over the feature dimension, with shape $(1, \text{num\_features})$

Convolutional layers: compute the mean and variance per channel (over all spatial positions), with shape $(1, \text{num\_features}, 1, 1)$

PyTorch implementation:

- Custom layer: `class BatchNorm(nn.Module)`, holding `gamma`, `beta`, `moving_mean`, and `moving_var`

- Framework API: `nn.BatchNorm1d(num_features)` (fully connected layers), `nn.BatchNorm2d(num_features)` (convolutional layers)

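A short usage sketch of the framework API (the layer sizes here are illustrative):

```py
from torch import nn

# Fully connected: BatchNorm1d sits between the linear layer and the activation
fc_net = nn.Sequential(
    nn.Linear(784, 256), nn.BatchNorm1d(256), nn.ReLU(),
    nn.Linear(256, 10))

# Convolutional: BatchNorm2d normalizes each channel over batch and space
conv_net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5), nn.BatchNorm2d(6), nn.ReLU())
```
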
## 7.6. Residual Networks (ResNet)

### 7.6.1. Core Idea

Deeper networks should form nested function classes ($\mathcal{F} \subseteq \mathcal{F}'$), which guarantees that adding layers can improve performance

Key innovation: the residual block, which makes it easy for added layers to fit the identity mapping ($f(\mathbf{x}) = \mathbf{x}$)

The residual mapping ($f(\mathbf{x}) - \mathbf{x}$) is easier to optimize than fitting the mapping directly

### 7.6.2. Residual Blocks

Structure: two $3 \times 3$ convolutional layers, each followed by batch normalization, with ReLU activations

Skip connection: the input is added to the output of the second convolutional layer before the final ReLU

When the number of channels changes: a $1 \times 1$ convolution adjusts the input's shape before the addition

```py
import torch
from torch import nn
from torch.nn import functional as F

class Residual(nn.Module):
    def __init__(self, input_channels, num_channels, use_1x1conv=False, strides=1):
        super().__init__()
        self.conv1 = nn.Conv2d(input_channels, num_channels,
                               kernel_size=3, padding=1, stride=strides)
        self.conv2 = nn.Conv2d(num_channels, num_channels,
                               kernel_size=3, padding=1)
        # Optional 1x1 convolution to match the input's shape to the output
        self.conv3 = nn.Conv2d(input_channels, num_channels,
                               kernel_size=1, stride=strides) if use_1x1conv else None
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        Y += X  # skip connection: add the input before the final ReLU
        return F.relu(Y)
```

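A quick shape check of the block (the input sizes are illustrative): with matching channels the output shape is unchanged; with `use_1x1conv=True` and `strides=2` the channels double while height and width halve.

```py
import torch

blk = Residual(3, 3)
X = torch.rand(4, 3, 6, 6)
blk(X).shape   # torch.Size([4, 3, 6, 6]): identity-compatible shapes

blk = Residual(3, 6, use_1x1conv=True, strides=2)
blk(X).shape   # torch.Size([4, 6, 3, 3]): channels doubled, spatial size halved
```
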
### 7.6.3. ResNet Model Architecture

1. Initial layers: $7 \times 7$ convolution (64 channels, stride 2) → BatchNorm → ReLU → $3 \times 3$ max-pooling (stride 2)

2. Four residual modules:

    - Each module contains multiple residual blocks, with 64, 128, 256, and 512 channels respectively

    - In every module except the first, the first residual block uses a $1 \times 1$ convolution to double the channels and halve the spatial size

3. Output layers: global average pooling → fully connected layer (10 output classes)

```py
# Build one module of residual blocks
def resnet_block(input_channels, num_channels, num_residuals, first_block=False):
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            # First block of a later module: change channels, halve spatial size
            blk.append(Residual(input_channels, num_channels,
                                use_1x1conv=True, strides=2))
        else:
            blk.append(Residual(num_channels, num_channels))
    return blk

# Full network (ResNet-18)
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                   nn.BatchNorm2d(64), nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
b2 = nn.Sequential(*resnet_block(64, 64, 2, first_block=True))
b3 = nn.Sequential(*resnet_block(64, 128, 2))
b4 = nn.Sequential(*resnet_block(128, 256, 2))
b5 = nn.Sequential(*resnet_block(256, 512, 2))
net = nn.Sequential(b1, b2, b3, b4, b5,
                    nn.AdaptiveAvgPool2d((1, 1)),
                    nn.Flatten(), nn.Linear(512, 10))
```

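To sanity-check the architecture, one can trace an input through the stages and print the shape after each one (the single-channel $224 \times 224$ input size is illustrative):

```py
import torch

X = torch.rand(size=(1, 1, 224, 224))
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:\t', X.shape)
```
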
## 7.7. Densely Connected Networks (DenseNet)

### 7.7.1. Core Idea

Unlike ResNet's residual connections (addition), DenseNet uses dense connections, fusing features by concatenation along the channel dimension

Function mapping: $\mathbf{x} \to [\mathbf{x}, f_1(\mathbf{x}), f_2([\mathbf{x}, f_1(\mathbf{x})]), \ldots]$

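A minimal illustration of the difference (tensor sizes are illustrative): addition keeps the channel count fixed, while concatenation grows it.

```py
import torch

X = torch.randn(1, 3, 8, 8)
Y = torch.randn(1, 3, 8, 8)
print((X + Y).shape)                   # ResNet-style add: torch.Size([1, 3, 8, 8])
print(torch.cat((X, Y), dim=1).shape)  # DenseNet-style concat: torch.Size([1, 6, 8, 8])
```
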
### 7.7.2. Main Components

1. **Convolution block**

```py
import torch
from torch import nn

def conv_block(input_channels, num_channels):
    return nn.Sequential(
        nn.BatchNorm2d(input_channels), nn.ReLU(),
        nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1)
    )
```

2. **Dense block**

```py
class DenseBlock(nn.Module):
    def __init__(self, num_convs, input_channels, num_channels):
        super().__init__()
        layer = []
        for i in range(num_convs):
            # Each conv block's input grows by num_channels per preceding block
            layer.append(conv_block(num_channels * i + input_channels, num_channels))
        self.net = nn.Sequential(*layer)

    def forward(self, X):
        for blk in self.net:
            Y = blk(X)
            X = torch.cat((X, Y), dim=1)  # concatenate along the channel dimension
        return X
```

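A shape check (sizes illustrative): with 2 conv blocks of 10 output channels each on a 3-channel input, the output has $3 + 2 \times 10 = 23$ channels.

```py
import torch

blk = DenseBlock(2, 3, 10)
X = torch.rand(4, 3, 8, 8)
Y = blk(X)
Y.shape   # torch.Size([4, 23, 8, 8])
```
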
3. **Transition layer**

Controls model complexity: reduces the number of channels and halves the spatial dimensions

```py
def transition_block(input_channels, num_channels):
    return nn.Sequential(
        nn.BatchNorm2d(input_channels), nn.ReLU(),
        nn.Conv2d(input_channels, num_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2)
    )
```

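Continuing the shape check (input tensor illustrative), a transition layer reducing 23 channels to 10 also halves the height and width:

```py
import torch

Y = torch.rand(4, 23, 8, 8)
tblk = transition_block(23, 10)
tblk(Y).shape   # torch.Size([4, 10, 4, 4])
```
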
### 7.7.3. Network Architecture

1. Initial module: $7 \times 7$ convolution (64 channels) + $3 \times 3$ max-pooling

2. Four dense blocks (each with 4 convolution blocks, growth rate 32)

3. Dense blocks are joined by transition layers (which halve the channel count)

4. Final layers: global average pooling + fully connected layer (10 output classes); see the assembly sketch after this list

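A sketch assembling this network from the components above, in the same style as the ResNet code earlier (the block layout follows the list; variable names are illustrative):

```py
from torch import nn

b1 = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

num_channels, growth_rate = 64, 32        # current channels; channels added per conv block
num_convs_in_dense_blocks = [4, 4, 4, 4]
blks = []
for i, num_convs in enumerate(num_convs_in_dense_blocks):
    blks.append(DenseBlock(num_convs, num_channels, growth_rate))
    num_channels += num_convs * growth_rate
    # A transition layer between dense blocks halves the channel count
    if i != len(num_convs_in_dense_blocks) - 1:
        blks.append(transition_block(num_channels, num_channels // 2))
        num_channels = num_channels // 2

net = nn.Sequential(
    b1, *blks,
    nn.BatchNorm2d(num_channels), nn.ReLU(),
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
    nn.Linear(num_channels, 10))
```
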
### 7.7.4. Training Configuration

Learning rate 0.1, 10 epochs, batch size 256

Input images resized to $96 \times 96$
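
A training invocation sketch, assuming the d2l package's helper functions from the book (`d2l.load_data_fashion_mnist`, `d2l.train_ch6`, `d2l.try_gpu`) and its Fashion-MNIST examples:

```py
from d2l import torch as d2l

lr, num_epochs, batch_size = 0.1, 10, 256
# Resize Fashion-MNIST images to 96x96, per the configuration above
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
```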