PyTorch implementation of cosine learning rate decay & warmup step decay (code and plotting are both commented for easy debugging)

Cosine learning rate decay

Gradually decaying the learning rate is a good way to improve accuracy. Common schemes include step decay and cosine decay: the former lowers the learning rate in discrete steps as the epoch count grows, while the latter lets the learning rate fall along a cosine curve over the course of training.
For cosine decay, suppose there are T batches in total (the warmup phase not counted); at the t-th batch the learning rate η_t is

η_t = 0.5 * (1 + cos(t * π / T)) * η,

where η is the initial learning rate. (The code below applies the same formula per epoch instead of per batch.)

[Figure: learning rate (lambda1 * lr_rate) versus epoch for warmup + cosine decay, produced by the script below]
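For example, halfway through training (t = T/2) the cosine term is cos(π/2) = 0, so η_t = η/2; at t = T it would reach 0, which is why the code below clamps the factor from below.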

Notes:

  1. The lr in the figure is the product lambda1 * lr_rate.
  2. For practical use the starting learning rate is 0.00035, and to keep the learning rate from decaying to 0 at the end, it is clamped to 0.00035 whenever it would drop below that value.
The schedule is implemented with a single lambda (also used in the full script below):

lambda1 = lambda epoch: (0.9 * epoch / t + 0.1) if epoch < t else 0.1 if n_t * (1 + math.cos(math.pi * (epoch - t) / (T - t))) < 0.1 else n_t * (1 + math.cos(math.pi * (epoch - t) / (T - t)))
# -*- coding:utf-8 -*-
import math
import matplotlib.pyplot as plt
import torch.optim as optim
from torchvision.models import resnet18


t = 10        # number of warmup epochs
T = 120       # 120 epochs in total, so the cosine phase covers the remaining 110 epochs
lr_rate = 0.0035  # peak learning rate, reached right after warmup
n_t = 0.5     # amplitude of the cosine term
model = resnet18(num_classes=10)
# linear warmup from 0.1 * lr_rate up to lr_rate over t epochs, then cosine decay clamped at 0.1 * lr_rate
lambda1 = lambda epoch: (0.9 * epoch / t + 0.1) if epoch < t else 0.1 if n_t * (1 + math.cos(math.pi * (epoch - t) / (T - t))) < 0.1 else n_t * (1 + math.cos(math.pi * (epoch - t) / (T - t)))
optimizer = optim.SGD(model.parameters(), lr=lr_rate, momentum=0.9, nesterov=True)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

x = []
y = []
for epoch in range(T):
    x.append(epoch)
    y.append(optimizer.param_groups[0]['lr'])  # record the lr actually used in this epoch
    scheduler.step()  # update the lr for the next epoch

plt.figure(figsize=(10, 8))
plt.xlabel('epoch')
plt.ylabel('cosine rate')
plt.plot(x, y, color='r', linewidth=2.0, label='cosine rate')
plt.legend(loc='best')
plt.show()
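If the warmup phase is not needed, the cosine part of this schedule can also be obtained from PyTorch's built-in scheduler. Below is a minimal sketch of my own (not part of the original script); it reuses the optimizer defined above and assumes no warmup:

# CosineAnnealingLR follows lr_t = eta_min + 0.5 * (base_lr - eta_min) * (1 + cos(pi * t / T_max)),
# so eta_min plays a role similar to the 0.1 * lr_rate floor in lambda1
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T, eta_min=0.00035)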

Warmup step decay

With a linear warmup over the first warmup_iters epochs followed by step decay at the given milestones, the learning rate at epoch e is

lr_e = base_lr * warmup_factor_e * gamma^(number of milestones <= e),

where warmup_factor_e ramps linearly from warmup_factor up to 1 during the warmup phase and equals 1 afterwards.

[Figure: learning rate versus epoch for warmup + step decay, produced by the script below]

# -*- coding:utf-8 -*-
"""
The shape of the learning-rate curve can be inspected via build_optimizer.py.
Learning-rate control is mainly set up in lr_scheduler.py; this WarmupMultiStepLR takes an optimizer so that it can read the optimizer's base_lr.
"""
from bisect import bisect_right
import torch
import matplotlib.pyplot as plt
import numpy as np

"""
args:
milestones: the epochs at which the learning rate is decayed
gamma: decay factor (the lr is multiplied by 0.1 at each milestone)
last_epoch=-1: start from epoch 0; inside the training loop it then takes the values 0, 1, 2, 3, ...

"""


class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
    def __init__(
            self,
            optimizer,
            milestones,  # e.g. [40, 70]
            gamma=0.1,  # decay factor applied at each milestone
            warmup_factor=0.01,
            warmup_iters=10,
            warmup_method="linear",
            last_epoch=-1,
    ):
        if list(milestones) != sorted(milestones):  # milestones must be given in increasing order
            raise ValueError(
                "Milestones should be a list of increasing integers. Got {}".format(milestones)
            )

        if warmup_method not in ("constant", "linear"):
            raise ValueError(
                "Only 'constant' or 'linear' warmup_method accepted, but got {}".format(warmup_method)
            )

        self.milestones = milestones
        self.gamma = gamma
        self.warmup_factor = warmup_factor
        self.warmup_iters = warmup_iters
        self.warmup_method = warmup_method
        super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch)

    '''
    self.last_epoch keeps increasing: 0, 1, 2, 3, ...
    self.warmup_iters = 10 is fixed (the linear warmup lasts 10 epochs)
    '''

    def get_lr(self):
        warmup_factor = 1
        if self.last_epoch < self.warmup_iters:  # still inside the warmup phase, e.g. epoch < 10
            if self.warmup_method == "constant":
                warmup_factor = self.warmup_factor
            elif self.warmup_method == "linear":
                # ramp linearly from warmup_factor up to 1 over warmup_iters epochs
                alpha = self.last_epoch / self.warmup_iters
                warmup_factor = self.warmup_factor * (1 - alpha) + alpha

        # self.base_lrs holds the optimizer's initial learning rate for each param group
        # (e.g. separate lrs for weights and biases when the optimizer uses several groups)
        return [base_lr * warmup_factor * self.gamma ** bisect_right(self.milestones, self.last_epoch)
                for base_lr in self.base_lrs]


if __name__ == '__main__':

    from torch import nn
    import torch
    import torch.optim as optim
    from torchvision.models import resnet18
    base_lrs=0.0035
    model = resnet18(num_classes=10)
    optimizer = optim.SGD(model.parameters(), lr=base_lrs, momentum=0.9, nesterov=True)
    lr_scheduler = WarmupMultiStepLR(optimizer, [40, 70], warmup_iters=10)
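    # Expected learning rates with these settings (an illustrative check added here, not in the original):
    #   epoch 0  -> 0.0035 * 0.01              = 0.000035   (start of linear warmup)
    #   epoch 5  -> 0.0035 * (0.01*0.5 + 0.5)  = 0.0017675  (halfway through warmup)
    #   epoch 10 -> 0.0035 * 1.0               = 0.0035     (warmup finished)
    #   epoch 40 -> 0.0035 * 0.1               = 0.00035    (first milestone)
    #   epoch 70 -> 0.0035 * 0.01              = 0.000035   (second milestone)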

    y = []
    for epoch in range(120):
        optimizer.zero_grad()  # one optimizer pass per epoch; the learning rate is updated once per epoch as well
        optimizer.step()
        y.append(optimizer.param_groups[0]['lr'])
        print('epoch:', epoch, 'lr:', optimizer.param_groups[0]['lr'])
        lr_scheduler.step()
    plt.plot(y, c='r', label='warmup step_lr', linewidth=1)
    plt.legend(loc='best')
    plt.xticks(np.arange(0, 120, 5))
    plt.show()
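The same warmup + step schedule can also be written as a single LambdaLR factor, mirroring the cosine example earlier. This is a minimal sketch of my own (lambda2 and the inlined hyperparameters are illustrative, not part of the original class) that reuses the optimizer from the script above:

from bisect import bisect_right
import torch.optim as optim

warmup_iters, warmup_factor, milestones, gamma = 10, 0.01, [40, 70], 0.1
# linear warmup factor for the first warmup_iters epochs, then an ordinary step decay
lambda2 = lambda epoch: ((warmup_factor * (1 - epoch / warmup_iters) + epoch / warmup_iters)
                         if epoch < warmup_iters else 1.0) * gamma ** bisect_right(milestones, epoch)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda2)

For these settings the factor should trace the same curve as WarmupMultiStepLR; the class is just more convenient when the warmup behaviour has to be configured per experiment.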

Performance comparison of the two schedules

Figure (a) plots the learning rate decreasing as the epoch grows; cosine decay is somewhat smoother than step decay. Figure (b) plots accuracy against epoch: the two schedules end up with very similar final accuracy, but the training curve with cosine decay is smoother. Which one works better may differ from problem to problem and has to be checked experimentally.

[Figure: (a) learning rate vs. epoch for step decay and cosine decay; (b) accuracy vs. epoch for the two schedules]