I am trying to do some multi-task learning with nn.ModuleList(), but the weights of all list elements (i.e. tasks) get adjusted before they have ever been trained. The following code (based on this notebook) creates a neural network object called MTL.
import torch
import torch.nn as nn
import numpy as np
import os
from torch.autograd import Variable
import math
import sklearn.preprocessing as sk
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
import random
#creating training data
seed = 42
random.seed(seed)
torch.cuda.manual_seed_all(seed)
N = 10000
M = 100
c = 0.5
p = 0.9
k = np.random.randn(M)
u1 = np.random.randn(M)
u1 -= u1.dot(k) * k / np.linalg.norm(k)**2
u1 /= np.linalg.norm(u1)
k /= np.linalg.norm(k)
u2 = k
w1 = c*u1
w2 = c*(p*u1+np.sqrt((1-p**2))*u2)
X = np.random.normal(0, 1, (N, M))
eps1 = np.random.normal(0, 0.01)
eps2 = np.random.normal(0, 0.01)
Y1 = np.matmul(X, w1) + np.sin(np.matmul(X, w1))+eps1
Y2 = np.matmul(X, w2) + np.sin(np.matmul(X, w2))+eps2
split = list(np.random.permutation(N))
X_train = X[split[0:8000],:]
Y1_train = Y1[split[0:8000]]
Y2_train = Y2[split[0:8000]]
X_valid = X[8000:9000,:]
Y1_valid = Y1[8000:9000]
Y2_valid = Y2[8000:9000]
X_test = X[9000:10000,:]
Y1_test = Y1[9000:10000]
Y2_test = Y2[9000:10000]
X_train = torch.from_numpy(X_train)
X_train = X_train.float()
Y1_train = torch.tensor(Y1_train)
Y1_train = Y1_train.float()
Y2_train = torch.tensor(Y2_train)
Y2_train = Y2_train.float()
X_valid = torch.from_numpy(X_valid)
X_valid = X_valid.float()
Y1_valid = torch.tensor(Y1_valid)
Y1_valid = Y1_valid.float()
Y2_valid = torch.tensor(Y2_valid)
Y2_valid = Y2_valid.float()
X_test = torch.from_numpy(X_test)
X_test = X_test.float()
Y1_test = torch.tensor(Y1_test)
Y1_test = Y1_test.float()
Y2_test = torch.tensor(Y2_test)
Y2_test = Y2_test.float()
input_size, feature_size = X.shape
LR = 0.001
epoch = 50
mb_size = 100
#the network
class MTLnet(nn.Module):
    def __init__(self):
        super(MTLnet, self).__init__()
        self.sharedlayer = nn.Sequential(
            nn.Linear(feature_size, 64),
            nn.ReLU(),
            nn.Dropout()
        )
        output = ['tower1', 'tower2']
        self.scoring_list = nn.ModuleList()
        for task, lab in enumerate(output):
            tower = nn.Sequential(
                nn.Linear(64, 32),
                nn.ReLU(),
                nn.Dropout(),
                nn.Linear(32, 16),
                nn.ReLU(),
                nn.Dropout(),
                nn.Linear(16, 1)
            )
            self.scoring_list.append(tower)

    def forward(self, x, task_id):
        h_shared = self.sharedlayer(x)
        logits = self.scoring_list[task_id](h_shared)
        return logits
def random_mini_batches(XE, R1E, R2E, mini_batch_size=3, seed=42):
    # Creating the mini-batches
    np.random.seed(seed)
    m = XE.shape[0]
    mini_batches = []
    permutation = list(np.random.permutation(m))
    shuffled_XE = XE[permutation, :]
    shuffled_X1R = R1E[permutation]
    shuffled_X2R = R2E[permutation]
    num_complete_minibatches = math.floor(m / mini_batch_size)
    for k in range(0, int(num_complete_minibatches)):
        mini_batch_XE = shuffled_XE[k * mini_batch_size : (k+1) * mini_batch_size, :]
        mini_batch_X1R = shuffled_X1R[k * mini_batch_size : (k+1) * mini_batch_size]
        mini_batch_X2R = shuffled_X2R[k * mini_batch_size : (k+1) * mini_batch_size]
        mini_batch = (mini_batch_XE, mini_batch_X1R, mini_batch_X2R)
        mini_batches.append(mini_batch)
    Lower = int(num_complete_minibatches * mini_batch_size)
    Upper = int(m - (mini_batch_size * math.floor(m / mini_batch_size)))
    if m % mini_batch_size != 0:
        mini_batch_XE = shuffled_XE[Lower : Lower + Upper, :]
        mini_batch_X1R = shuffled_X1R[Lower : Lower + Upper]
        mini_batch_X2R = shuffled_X2R[Lower : Lower + Upper]
        mini_batch = (mini_batch_XE, mini_batch_X1R, mini_batch_X2R)
        mini_batches.append(mini_batch)
    return mini_batches
MTL = MTLnet()
optimizer = torch.optim.Adam(MTL.parameters(), lr=LR)
loss_func = nn.MSELoss()
The neural network has the following structure:
<bound method Module.parameters of MTLnet(
  (sharedlayer): Sequential(
    (0): Linear(in_features=100, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
  )
  (scoring_list): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=64, out_features=32, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.5)
      (3): Linear(in_features=32, out_features=16, bias=True)
      (4): ReLU()
      (5): Dropout(p=0.5)
      (6): Linear(in_features=16, out_features=1, bias=True)
    )
    (1): Sequential(
      (0): Linear(in_features=64, out_features=32, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.5)
      (3): Linear(in_features=32, out_features=16, bias=True)
      (4): ReLU()
      (5): Dropout(p=0.5)
      (6): Linear(in_features=16, out_features=1, bias=True)
    )
  )
)>
The initial values of the weights are (yours will differ):
print(MTL.state_dict()['scoring_list.0.6.weight'])
print(MTL.state_dict()['scoring_list.1.6.weight'])
Output:
tensor([[-0.0240, -0.1798, -0.2393, -0.2149, -0.1393, 0.1718, -0.1476, 0.0346,
0.2485, -0.0305, -0.1574, 0.1500, -0.2356, -0.0597, 0.0291, 0.0521]])
tensor([[ 0.2046, -0.1277, -0.2103, -0.1006, -0.1311, 0.1902, -0.0969, -0.0953,
0.1340, 0.1506, -0.1222, -0.0638, -0.0661, 0.1118, -0.1009, -0.1438]])
The following code trains the neural network and prints the weights after each epoch. The first epoch trains the shared layer and the first element of the nn.ModuleList() (i.e. task 1); the second epoch trains the shared layer and the second element of the nn.ModuleList() (i.e. task 2).
trainTask1 = True
epoch = 2
for it in range(epoch):
    minibatches = random_mini_batches(X_train, Y1_train, Y2_train, mb_size)
    for minibatch in minibatches:
        XE, YE1, YE2 = minibatch
        if trainTask1:
            Yhat = MTL(XE, 0)
            loss = loss_func(Yhat, YE1.view(-1, 1))
        else:
            Yhat = MTL(XE, 1)
            loss = loss_func(Yhat, YE2.view(-1, 1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Shows the weights of task 1; they keep getting adjusted during the second
        # epoch even though no training for task 1 happens there
        #print("Task 1 weights {}".format(MTL.state_dict()['scoring_list.0.6.weight']))

    # @prosti suggested freezing the layers of the task that is not being trained
    if trainTask1:
        trainTask1 = False
        for param in MTL.scoring_list[0].parameters():
            param.requires_grad = False
    else:
        trainTask1 = True
        for param in MTL.scoring_list[1].parameters():
            param.requires_grad = False

    print(it)
    print(MTL.state_dict()['scoring_list.0.6.weight'])
    print(MTL.state_dict()['scoring_list.1.6.weight'])
Output:
0
tensor([[-0.0283, -0.2025, -0.2181, -0.2183, -0.1438, 0.1605, -0.1715, 0.0863,
0.2765, -0.0153, -0.1519, 0.1704, -0.2453, -0.0539, 0.0220, 0.0739]])
tensor([[ 0.2046, -0.1277, -0.2103, -0.1006, -0.1311, 0.1902, -0.0969, -0.0953,
0.1340, 0.1506, -0.1222, -0.0638, -0.0661, 0.1118, -0.1009, -0.1438]])
1
tensor([[-0.0311, -0.2114, -0.2162, -0.2214, -0.1463, 0.1614, -0.1800, 0.1003,
0.2850, -0.0148, -0.1576, 0.1809, -0.2511, -0.0575, 0.0221, 0.0844]])
tensor([[ 0.2693, -0.0921, -0.2313, -0.1483, -0.0995, 0.2497, -0.1028, -0.1108,
0.1405, 0.1997, -0.1266, -0.0725, -0.0871, 0.1472, -0.0924, -0.0994]])
After the first epoch, the weights of task 1 have been adjusted but the weights of task 2 have not (as expected). After the second epoch, however, the weights of both tasks have changed. That shouldn't happen.
You can also see, when the weights are printed inside the mini-batch loop (just uncomment the print statement), that the weights of the first task keep getting adjusted even though all of its layers are frozen and no computation runs through them.
Is there some other cache, besides optimizer.zero_grad(), that needs to be cleared?

Best Answer
If you monitor MTL.scoring_list[0][6].weight.grad and MTL.scoring_list[1][6].weight.grad during training, you'll notice that in the first epoch MTL.scoring_list[1][6].weight.grad is None, while in the second epoch MTL.scoring_list[0][6].weight.grad is a zero tensor.
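As a rough sketch of how that monitoring might look (placing it right after loss.backward() inside the mini-batch loop is an assumption, not part of the original code):

# Sketch: print the gradient state of the last Linear layer of each tower.
# Intended to be called right after loss.backward() in the training loop.
for task_id in (0, 1):
    g = MTL.scoring_list[task_id][6].weight.grad
    if g is None:
        print("task {}: grad is None (no backward pass has reached it)".format(task_id))
    else:
        print("task {}: grad abs-sum = {:.6f}".format(task_id, g.abs().sum().item()))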
Looking at the source of .step() for the various optimizers, it appears they don't check .requires_grad; they only check whether .grad is None. So even when .grad is a zero tensor, optimizer.step still goes ahead with its update. Whether this ends up affecting the frozen weights depends on the exact computations the optimizer performs.
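To see why a zero gradient is not enough, here is a small self-contained sketch (separate from the code above) in which Adam still moves a parameter whose .grad is an all-zero tensor, because its momentum buffers are non-zero:

import torch
import torch.nn as nn

# Self-contained sketch: a parameter that has been updated once keeps moving under
# Adam even when its gradient is an all-zero tensor, because the update is driven
# by the stored momentum buffers (exp_avg / exp_avg_sq), not by .requires_grad.
layer = nn.Linear(4, 1)
opt = torch.optim.Adam(layer.parameters(), lr=0.1)

# One real update so the optimizer builds internal state for the parameters.
loss = layer(torch.randn(8, 4)).pow(2).mean()
loss.backward()
opt.step()

before = layer.weight.detach().clone()

# "Freeze" the layer the way the question does, leaving .grad as a zero tensor.
for p in layer.parameters():
    p.requires_grad = False
    p.grad.zero_()

opt.step()  # Adam still updates: it only skips parameters whose .grad is None
print(torch.allclose(before, layer.weight))  # prints False: the frozen weight moved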
As a quick fix, you can set param.grad = None right after param.requires_grad = False, so that the optimizer ignores those parameters entirely. This appears to solve the problem, but you may still want to consider what effect it has on the optimizer's computations in later epochs.