After the modification (accumulating the loss with loss_sum += loss), the run fails with the error below.
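For context, here is a minimal, self-contained sketch of the accumulation pattern described above; the tensors and loop are stand-ins, not the notebook's real model:

import jittor as jt

jt.flags.use_cuda = 0  # CPU is enough for this illustration

x = jt.random((4, 3))
w = jt.random((3, 2))

loss_sum = 0.0
for step in range(3):
    loss = (jt.matmul(x, w) ** 2).mean()  # stand-in for the real training loss
    # Accumulating the jt.Var itself keeps each step's loss (and any pending,
    # not-yet-executed upstream graph) alive, which can grow GPU memory over
    # many steps; accumulating a Python float instead releases it each step:
    # loss_sum += float(loss.numpy())
    loss_sum += loss

print(float(loss_sum.numpy()))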
- Error message
[w 0903 12:46:39.763762 40 grad.cc:77] grads[237] 'edge.weight' doesn't have gradient. It will be set to zero: Var(8649:1:3:1:i0:o2:s1:n1,float32,edge.weight,130bd58e00)[2,8,3,3,]
[w 0903 12:46:41.696854 40 cuda_device_allocator.cc:29] Unable to alloc cuda device memory, use unify memory instead. This may cause low performance.
[i 0903 12:46:41.700620 40 cuda_device_allocator.cc:31]
=== display_memory_info ===
total_cpu_ram: 62.81GB total_device_ram: 7.928GB
hold_vars: 2039 lived_vars: 8312 lived_ops: 15867
name: sfrl is_device: 1 used: 1.312GB(22.5%) unused: 4.506GB(77.5%) total: 5.818GB
name: sfrl is_device: 1 used: 83.34MB(93.6%) unused: 5.664MB(6.36%) total: 89MB
name: sfrl is_device: 0 used: 83.34MB(93.6%) unused: 5.664MB(6.36%) total: 89MB
name: sfrl is_device: 0 used: 5.035MB(21.9%) unused: 17.96MB(78.1%) total: 23MB
name: temp is_device: 0 used: 0 B(-nan%) unused: 0 B(-nan%) total: 0 B
name: temp is_device: 1 used: 0 B(0%) unused: 954.1MB(100%) total: 954.1MB
cpu&gpu: 6.946GB gpu: 6.837GB cpu: 112MB
free: cpu(19.44GB) gpu(149.1MB)
===========================
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_946101/1274271006.py in <module>
82 writer.add_scalar(tag="train/loss", step=steps, value=float(loss.numpy()))
83
---> 84 train()
/tmp/ipykernel_946101/1274271006.py in train()
36 loss_mask = loss_Softmax(output_mask, mask)
37
---> 38 loss_edge = loss_Focalloss(output_edge, edge) * exp_args.edgeRatio
39
40 # total loss
/tmp/ipykernel_946101/1610901517.py in loss_Focalloss(pred, label, gamma)
54
55 label = jt.squeeze(label,dim=1) # remove the extra dimension
---> 56 label = label.numpy()
57 label = paddle.to_tensor(label)
58 label = paddle.nn.functional.one_hot(label, num_classes=2)
RuntimeError: [f 0903 12:46:41.739513 40 executor.cc:665]
Execute fused operator(955/2224) failed.
[JIT Source]: /home/pi/.cache/jittor/jt1.3.5/g++9.4.0/py3.7.12/Linux-5.15.0-4x63/IntelRXeonRCPUx21/default/cu11.2.67_sm_35_52/jit/__opkey0_broadcast_to__Tx_float32__DIM_7__BCAST_19__opkey1_broadcast_to__Tx_float32__DIM_7___hash_a7cee490a3b450a6_op.cc
[OP TYPE]: fused_op:( broadcast_to, broadcast_to, binary.multiply, reindex_reduce.add,)
[Input]: float32[96,16,1,1,], float32[16,96,176,176,],
[Output]: float32[16,16,176,176,],
[Async Backtrace]: not found, please set env JT_SYNC=1, trace_py_var=3
[Reason]: [f 0903 12:46:41.738845 40 helper_cuda.h:128] CUDA error at /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/jittor/src/mem/allocator/cuda_device_allocator.cc:32 code=2( cudaErrorMemoryAllocation ) cudaMallocManaged(&ptr, size)
**********
Async error was detected. To locate the async backtrace and get better error report, please rerun your code with two enviroment variables set:
>>> export JT_SYNC=1
>>> export trace_py_var=3
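The message itself suggests rerunning with JT_SYNC=1 and trace_py_var=3. In a notebook, one way to apply them (assuming a freshly restarted kernel, since they must be set before Jittor initializes) is:

import os

# Set the debug variables before the first `import jittor`,
# so the async backtrace points at the real failing Python line.
os.environ["JT_SYNC"] = "1"
os.environ["trace_py_var"] = "3"

import jittor as jt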
- Log
The Jupyter kernel log is below:
info 12:27:17.496: Restart requested file:///home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:17.497: Restart kernel execution
info 12:27:17.497: Cancel pending cells
info 12:27:17.497: Restarting undefined
info 12:27:17.498: Starting raw kernel 'old (Python 3.7.12)' for interpreter /home/pi/anaconda3/envs/old/bin/python
info 12:27:17.516: Process Execution: > ~/anaconda3/envs/old/bin/python -c "import ipykernel; print(ipykernel.__version__); print("5dc3a68c-e34e-4080-9c3e-2a532b2ccb4d"); print(ipykernel.__file__)"
> ~/anaconda3/envs/old/bin/python -c "import ipykernel; print(ipykernel.__version__); print("5dc3a68c-e34e-4080-9c3e-2a532b2ccb4d"); print(ipykernel.__file__)"
info 12:27:17.523: Kernel launching with ports 9071,9072,9073,9074,9075. Start port is 9000
info 12:27:17.554: Process Execution: > ~/anaconda3/envs/old/bin/python -m ipykernel_launcher --ip=127.0.0.1 --stdin=9074 --control=9072 --hb=9071 --Session.signature_scheme="hmac-sha256" --Session.key=b"346462fe-ca1d-41bf-9004-7a9aefa54e40" --shell=9073 --transport="tcp" --iopub=9075 --f=/home/pi/.local/share/jupyter/runtime/kernel-v2-16214270xgqO0z7YaAd.json
> ~/anaconda3/envs/old/bin/python -m ipykernel_launcher --ip=127.0.0.1 --stdin=9074 --control=9072 --hb=9071 --Session.signature_scheme="hmac-sha256" --Session.key=b"346462fe-ca1d-41bf-9004-7a9aefa54e40" --shell=9073 --transport="tcp" --iopub=9075 --f=/home/pi/.local/share/jupyter/runtime/kernel-v2-16214270xgqO0z7YaAd.json
info 12:27:17.554: Process Execution: cwd: ~/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet
cwd: ~/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet
info 12:27:17.725: ipykernel version 6.15.1 for /home/pi/anaconda3/envs/old/bin/python
info 12:27:17.725: ipykernel location ~/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/__init__.py for /home/pi/anaconda3/envs/old/bin/python
warn 12:27:18.378: StdErr from Kernel Process /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/traitlets/traitlets.py:2395: FutureWarning: Supporting extra quotes around strings is deprecated in traitlets 5.0. You can use 'hmac-sha256' instead of '"hmac-sha256"' if you require traitlets >=5.
FutureWarning,
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/traitlets/traitlets.py:2349: FutureWarning: Supporting extra quotes around Bytes is deprecated in traitlets 5.0. Use '346462fe-ca1d-41bf-9004-7a9aefa54e40' instead of 'b"346462fe-ca1d-41bf-9004-7a9aefa54e40"'.
FutureWarning,
info 12:27:18.378: Kernel Output: NOTE: When using the `ipython kernel` entry point, Ctrl-C will not work.
To exit, you will have to explicitly quit this process, by either sending
"quit" from a client, or using Ctrl-\ in UNIX-like environments.
To read more about this, see https://github.com/ipython/ipython/issues/2049
To connect another client to this kernel, use:
--existing kernel-v2-16214270xgqO0z7YaAd.json
info 12:27:18.603: Got new session 74bca231-34a6-4244-a464-14f16661eaa0
info 12:27:18.603: Started new restart session
info 12:27:18.606: Executing silently Code (idle) = import sys\nprint(sys.executable)
info 12:27:19.443: Executing silently Code (completed) = import sys\nprint(sys.executable) with 1 output(s)
info 12:27:19.446: UpdateWorkingDirectoryAndPath in Kernel
info 12:27:19.446: Executing silently Code (idle) = import os\nimport sys\n%cd "//home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitn
info 12:27:19.470: Executing silently Code (completed) = import os\nimport sys\n%cd "//home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitn with 1 output(s)
info 12:27:19.470: Waiting for idle on (kernel): 74bca231-34a6-4244-a464-14f16661eaa0 -> idle
info 12:27:19.471: Finished waiting for idle on (kernel): 74bca231-34a6-4244-a464-14f16661eaa0 -> idle
info 12:27:22.479: Execute Cell 0 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 1 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 2 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 3 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 4 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 5 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 6 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 7 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 8 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 9 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 10 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 11 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 12 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 13 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 14 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
info 12:27:22.479: Execute Cell 15 /home/pi/Desktop/RL_learning/ComputerGraph/portraitnet_code/12myPortraitnet/Portraitnet3.ipynb
warn 12:27:22.641: StdErr from Kernel Process [i 0903 12:27:22.638382 36 log.cc:351] Load log_sync: 1
info 12:27:25.825: Cell 0 executed with state Success
info 12:27:25.960: Cell 1 executed with state Success
info 12:27:26.116: Cell 2 executed with state Success
info 12:27:26.221: Cell 3 executed with state Success
info 12:27:26.311: Cell 4 executed with state Success
info 12:27:26.590: Cell 5 executed with state Success
info 12:27:26.635: Cell 6 executed with state Success
info 12:27:26.782: Cell 7 executed with state Success
info 12:27:28.775: Cell 8 executed with state Success
info 12:27:29.2: Cell 9 executed with state Success
info 12:27:29.58: Cell 10 executed with state Success
info 12:27:30.428: Cell 11 executed with state Success
info 12:27:30.497: Cell 12 executed with state Success
info 12:27:41.916: Cancel all remaining cells true || Error || undefined
info 12:27:41.916: Cancel pending cells
info 12:27:41.917: Cell 13 executed with state Error
info 12:27:41.917: Cell 14 executed with state Idle
info 12:27:41.917: Cell 15 executed with state Idle
- My guesses at the cause
3.1 Is it simply that I don't have enough GPU memory? Machine: Ubuntu 20.04 with 8 GB of VRAM.
3.2 Why does it stop at label = label.numpy()? I don't quite understand.
3.3 Why does the final output layer edge have no gradient (the 'edge.weight' doesn't have gradient warning)?
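Two observations, both hedged guesses from the traceback rather than a confirmed diagnosis: Jittor executes lazily, so label.numpy() forces all pending GPU ops to run, which is why an asynchronous failure (here a CUDA allocation error) surfaces at that line even if it originates elsewhere; and the focal loss hops out of Jittor into Paddle via .numpy(), so if the prediction takes the same route, the loss can no longer backpropagate to edge.weight, which would match the warning. A sketch of an equivalent focal loss kept entirely inside Jittor, assuming pred is [N, C, H, W] logits and label is an integer class map (the function name and the gamma default are illustrative):

import jittor as jt
from jittor import nn

def focal_loss_jt(pred, label, gamma=2.0):
    # pred: [N, C, H, W] logits; label: [N, 1, H, W] or [N, H, W] int class ids
    if label.ndim == 4:
        label = jt.squeeze(label, dim=1)                      # [N, H, W]
    log_p = nn.log_softmax(pred, dim=1)                       # [N, C, H, W]
    # One-hot via an index grid, staying inside Jittor's autograd graph.
    one_hot = (jt.index(pred.shape, dim=1) == label.unsqueeze(1)).float32()
    log_pt = (log_p * one_hot).sum(dim=1)                     # [N, H, W]
    pt = log_pt.exp()
    return (-((1.0 - pt) ** gamma) * log_pt).mean()

Keeping both pred and the loss inside Jittor would let gradients reach edge.weight during the optimizer step.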
The network code is as follows.
import numpy as np
import jittor as jt
from jittor import nn

# 1x1 Convolution
class conv_1x1(nn.Module):
    def __init__(self, inp, oup):
        super(conv_1x1, self).__init__()
        self.conv = nn.Conv(in_channels=inp, out_channels=oup, kernel_size=1, stride=1, padding=0, bias=False)

    def execute(self, x):
        x = self.conv(x)
        return x

class conv_1x1_bn(nn.Module):
    def __init__(self, inp, oup):
        super(conv_1x1_bn, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv(in_channels=inp, out_channels=oup, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm(oup),
            nn.ReLU())

    def execute(self, x):
        x = self.conv(x)
        return x

class conv_bn(nn.Module):
    def __init__(self, inp, oup, kernel, stride):
        super(conv_bn, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv(in_channels=inp, out_channels=oup, kernel_size=kernel, stride=stride, padding=(kernel-1)//2, bias=False),
            nn.BatchNorm(num_features=oup),
            nn.ReLU())

    def execute(self, x):
        x = self.conv(x)
        return x
# depthwise separable convolution ('same' padding)
class conv_dw(nn.Module):
    def __init__(self, inp, oup, kernel, stride):
        super(conv_dw, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv(inp, inp, kernel, stride, (kernel-1)//2, groups=inp, bias=False),
            nn.BatchNorm(num_features=inp),
            nn.ReLU(),
            nn.Conv(inp, oup, 1, 1, 0, bias=False),
            nn.BatchNorm(num_features=oup),
            nn.ReLU())

    def execute(self, x):
        x = self.conv(x)
        return x
class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio, dilation=1):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]
        # use the residual connection only when stride is 1 and channels match
        self.use_res_connect = ((self.stride == 1) and (inp == oup))
        self.conv = nn.Sequential(
            # pw
            nn.Conv(inp, inp*expand_ratio, kernel_size=1, stride=1, padding=0, dilation=1, groups=1, bias=False),
            nn.BatchNorm(num_features=inp*expand_ratio),
            nn.ReLU(),
            # dw
            nn.Conv(inp*expand_ratio, inp*expand_ratio, kernel_size=3, stride=stride, padding=dilation,
                    dilation=dilation, groups=inp*expand_ratio, bias=False),
            nn.BatchNorm(num_features=inp*expand_ratio),
            nn.ReLU(),
            # pw-linear
            nn.Conv(inp*expand_ratio, oup, kernel_size=1, stride=1, padding=0, dilation=1, groups=1, bias=False),
            nn.BatchNorm(num_features=oup))

    def execute(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)
class ResidualBlock(nn.Module):
    def __init__(self, inp, oup, stride=1):
        super(ResidualBlock, self).__init__()
        self.block = nn.Sequential(
            conv_dw(inp, oup, kernel=3, stride=stride),  # one depthwise separable convolution
            nn.Conv(in_channels=oup, out_channels=oup, kernel_size=3, stride=1, padding=1, groups=oup, bias=False),
            nn.BatchNorm(num_features=oup),
            nn.ReLU(),
            nn.Conv(in_channels=oup, out_channels=oup, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm(num_features=oup))
        # project the shortcut when the channel counts differ
        if inp == oup:
            self.residual = None
        else:
            self.residual = nn.Sequential(
                nn.Conv(in_channels=inp, out_channels=oup, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm(num_features=oup))
        self.relu = nn.ReLU()

    def execute(self, x):
        residual = x
        out = self.block(x)
        if self.residual is not None:
            residual = self.residual(x)
        out += residual
        out = self.relu(out)
        return out
class PortraitNet(nn.Module):
    def __init__(self, n_class=2, useUpsample=False, useDeconvGroup=False, addEdge=True,
                 channelRatio=1.0, minChannel=16, video=False):
        super(PortraitNet, self).__init__()
        """
        setting of inverted residual blocks
        self.inverted_residual_setting =
        [
            # t, c, n, s
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]
        """
        self.addEdge = addEdge
        self.channelRatio = channelRatio
        self.minChannel = minChannel
        self.useDeconvGroup = useDeconvGroup

        if video == True:
            self.stage0 = conv_bn(4, self.depth(32), 3, 2)
        else:
            self.stage0 = conv_bn(3, self.depth(32), 3, 2)

        self.stage1 = InvertedResidual(self.depth(32), self.depth(16), 1, 1)  # 1/2
        self.stage2 = nn.Sequential(  # 1/4
            InvertedResidual(self.depth(16), self.depth(24), 2, 6),
            InvertedResidual(self.depth(24), self.depth(24), 1, 6))
        self.stage3 = nn.Sequential(  # 1/8
            InvertedResidual(self.depth(24), self.depth(32), 2, 6),
            InvertedResidual(self.depth(32), self.depth(32), 1, 6),
            InvertedResidual(self.depth(32), self.depth(32), 1, 6))
        self.stage4 = nn.Sequential(  # 1/16
            InvertedResidual(self.depth(32), self.depth(64), 2, 6),
            InvertedResidual(self.depth(64), self.depth(64), 1, 6),
            InvertedResidual(self.depth(64), self.depth(64), 1, 6),
            InvertedResidual(self.depth(64), self.depth(64), 1, 6))
        self.stage5 = nn.Sequential(  # 1/16
            InvertedResidual(self.depth(64), self.depth(96), 1, 6),
            InvertedResidual(self.depth(96), self.depth(96), 1, 6),
            InvertedResidual(self.depth(96), self.depth(96), 1, 6))
        self.stage6 = nn.Sequential(  # 1/32
            InvertedResidual(self.depth(96), self.depth(160), 2, 6),
            InvertedResidual(self.depth(160), self.depth(160), 1, 6),
            InvertedResidual(self.depth(160), self.depth(160), 1, 6))
        self.stage7 = nn.Sequential(  # 1/32
            InvertedResidual(self.depth(160), self.depth(320), 1, 6))

        if useUpsample == True:
            self.deconv1 = nn.Upsample(scale_factor=2, mode='bilinear')
            self.deconv2 = nn.Upsample(scale_factor=2, mode='bilinear')
            self.deconv3 = nn.Upsample(scale_factor=2, mode='bilinear')
            self.deconv4 = nn.Upsample(scale_factor=2, mode='bilinear')
            self.deconv5 = nn.Upsample(scale_factor=2, mode='bilinear')
        else:
            if useDeconvGroup == True:
                self.deconv1 = nn.ConvTranspose(self.depth(96), self.depth(96), groups=self.depth(96),
                                                kernel_size=4, stride=2, padding=1, bias=False)
                self.deconv2 = nn.ConvTranspose(self.depth(32), self.depth(32), groups=self.depth(32),
                                                kernel_size=4, stride=2, padding=1, bias=False)
                self.deconv3 = nn.ConvTranspose(self.depth(24), self.depth(24), groups=self.depth(24),
                                                kernel_size=4, stride=2, padding=1, bias=False)
                self.deconv4 = nn.ConvTranspose(self.depth(16), self.depth(16), groups=self.depth(16),
                                                kernel_size=4, stride=2, padding=1, bias=False)
                self.deconv5 = nn.ConvTranspose(self.depth(8), self.depth(8), groups=self.depth(8),
                                                kernel_size=4, stride=2, padding=1, bias=False)
            else:
                self.deconv1 = nn.ConvTranspose(self.depth(96), self.depth(96), groups=1,
                                                kernel_size=4, stride=2, padding=1, bias=False)
                self.deconv2 = nn.ConvTranspose(self.depth(32), self.depth(32), groups=1,
                                                kernel_size=4, stride=2, padding=1, bias=False)
                self.deconv3 = nn.ConvTranspose(self.depth(24), self.depth(24), groups=1,
                                                kernel_size=4, stride=2, padding=1, bias=False)
                self.deconv4 = nn.ConvTranspose(self.depth(16), self.depth(16), groups=1,
                                                kernel_size=4, stride=2, padding=1, bias=False)
                self.deconv5 = nn.ConvTranspose(self.depth(8), self.depth(8), groups=1,
                                                kernel_size=4, stride=2, padding=1, bias=False)

        self.transit1 = ResidualBlock(self.depth(320), self.depth(96))
        self.transit2 = ResidualBlock(self.depth(96), self.depth(32))
        self.transit3 = ResidualBlock(self.depth(32), self.depth(24))
        self.transit4 = ResidualBlock(self.depth(24), self.depth(16))
        self.transit5 = ResidualBlock(self.depth(16), self.depth(8))

        self.pred = nn.Conv(self.depth(8), n_class, 3, 1, 1, bias=False)
        if self.addEdge == True:
            self.edge = nn.Conv(self.depth(8), n_class, 3, 1, 1, bias=False)

    def depth(self, channels):
        # scale the channel count by channelRatio, but never below min(channels, minChannel)
        min_channel = min(channels, self.minChannel)
        return max(min_channel, int(channels*self.channelRatio))
    def execute(self, x):
        feature_1_2 = self.stage0(x)
        feature_1_2 = self.stage1(feature_1_2)
        feature_1_4 = self.stage2(feature_1_2)
        feature_1_8 = self.stage3(feature_1_4)
        feature_1_16 = self.stage4(feature_1_8)
        feature_1_16 = self.stage5(feature_1_16)
        feature_1_32 = self.stage6(feature_1_16)
        feature_1_32 = self.stage7(feature_1_32)

        up_1_16 = self.deconv1(self.transit1(feature_1_32))
        up_1_8 = self.deconv2(self.transit2(feature_1_16 + up_1_16))
        up_1_4 = self.deconv3(self.transit3(feature_1_8 + up_1_8))
        up_1_2 = self.deconv4(self.transit4(feature_1_4 + up_1_4))
        up_1_1 = self.deconv5(self.transit5(up_1_2))

        pred = self.pred(up_1_1)
        if self.addEdge == True:
            edge = self.edge(up_1_1)
            return pred, edge
        else:
            return pred
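A quick smoke test for the model above (hypothetical; the 352x352 input size is inferred from the 176x176 feature maps in the error message):

import jittor as jt

jt.flags.use_cuda = 0  # CPU is enough to verify shapes

net = PortraitNet(n_class=2, addEdge=True)
x = jt.random((1, 3, 352, 352))  # NCHW; the side length must be divisible by 32
pred, edge = net(x)
print(pred.shape, edge.shape)    # expect [1,2,352,352,] for both outputs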