- Error message
[w 0909 17:08:30.869660 36 cuda_device_allocator.cc:29] Unable to alloc cuda device memory, use unify memory instead. This may cause low performance.
[i 0909 17:08:30.871317 36 cuda_device_allocator.cc:31]
=== display_memory_info ===
total_cpu_ram: 62.81GB total_device_ram: 7.928GB
hold_vars: 2077 lived_vars: 18369 lived_ops: 21772
name: sfrl is_device: 1 used: 369.4MB(25.3%) unused: 1.065GB(74.7%) total: 1.426GB
name: sfrl is_device: 1 used: 7.562MB(18%) unused: 34.44MB(82%) total: 42MB
name: sfrl is_device: 0 used: 7.562MB(18%) unused: 34.44MB(82%) total: 42MB
name: sfrl is_device: 0 used: 5.365MB(23.3%) unused: 17.64MB(76.7%) total: 23MB
name: temp is_device: 0 used: 0 B(-nan%) unused: 0 B(-nan%) total: 0 B
name: temp is_device: 1 used: 0 B(0%) unused: 980.3MB(100%) total: 980.3MB
cpu&gpu: 2.488GB gpu: 2.424GB cpu: 65MB
free: cpu(15.04GB) gpu(4.662GB)
===========================
[w 0909 17:08:32.335088 36 cuda_device_allocator.cc:29] Unable to alloc cuda device memory, use unify memory instead. This may cause low performance.
[i 0909 17:08:32.336920 36 cuda_device_allocator.cc:31]
=== display_memory_info ===
total_cpu_ram: 62.81GB total_device_ram: 7.928GB
hold_vars: 2077 lived_vars: 18369 lived_ops: 21772
name: sfrl is_device: 1 used: 369.4MB(94.7%) unused: 20.6MB(5.28%) total: 390MB
name: sfrl is_device: 1 used: 7.562MB(18%) unused: 34.44MB(82%) total: 42MB
name: sfrl is_device: 0 used: 7.562MB(18%) unused: 34.44MB(82%) total: 42MB
name: sfrl is_device: 0 used: 5.365MB(23.3%) unused: 17.64MB(76.7%) total: 23MB
name: temp is_device: 0 used: 0 B(-nan%) unused: 0 B(-nan%) total: 0 B
name: temp is_device: 1 used: 0 B(0%) unused: 980.3MB(100%) total: 980.3MB
cpu&gpu: 1.443GB gpu: 1.379GB cpu: 65MB
free: cpu(15.04GB) gpu(5.705GB)
===========================
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_1966205/250786907.py in <module>
60 writer.add_scalar(tag="train/loss", step=steps, value=float(loss.numpy()))
61
---> 62 train()
63
/tmp/ipykernel_1966205/250786907.py in train()
48 print("第{}轮, loss : ".format(epoch))
49 # print("第{}轮, loss : {}".format(epoch, loss_sum/(batch_id+1)))
---> 50 miou = test(epoch)
51 if max_miou < miou:
52 max_miou = miou
/tmp/ipykernel_1966205/2450582164.py in test(epoch)
28 prob = jt.nn.softmax(output_mask, dim=1)[0,1,:,:]
29 # print("prob========",type(prob))
---> 30 pred = prob.numpy()
31 pred[pred>0.5] = 1
32 pred[pred<=0.5] = 0
RuntimeError: [f 0909 17:08:33.622941 36 executor.cc:665]
Execute fused operator(2274/4065) failed.
[JIT Source]: /home/pi/.cache/jittor/jt1.3.5/g++9.4.0/py3.7.12/Linux-5.15.0-4x63/IntelRXeonRCPUx21/default/cu11.2.67_sm_35_52/jit/__opkey0_array__T_int32__o_1__opkey1_array__T_int32__o_1__opkey2_array__T_int32__o_1__opke___hash_570b4a4adaf03635_op.cc
[OP TYPE]: fused_op:( array, array, array, broadcast_to, broadcast_to, broadcast_to, broadcast_to, array, broadcast_to, broadcast_to, broadcast_to, array, broadcast_to, broadcast_to, broadcast_to, index, broadcast_to, broadcast_to, binary.add, binary.equal, binary.subtract, binary.multiply, binary.subtract, binary.multiply, binary.multiply, binary.add, binary.add, binary.add, binary.subtract, binary.multiply, binary.multiply, binary.multiply, binary.multiply,)
[Input]: float32[123904,2,], float32[123904,2,], float32[123904,2,], float32[123904,2,], float32[123904,2,], float32[123904,2,], float32[1,],
[Output]: float32[123904,123904,2,],
[Async Backtrace]: ---
/home/pi/anaconda3/envs/old/lib/python3.7/runpy.py:193 <_run_module_as_main>
/home/pi/anaconda3/envs/old/lib/python3.7/runpy.py:85 <_run_code>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel_launcher.py:17 <<module>>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/traitlets/config/application.py:976 <launch_instance>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/kernelapp.py:712 <start>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/tornado/platform/asyncio.py:215 <start>
/home/pi/anaconda3/envs/old/lib/python3.7/asyncio/base_events.py:541 <run_forever>
/home/pi/anaconda3/envs/old/lib/python3.7/asyncio/base_events.py:1786 <_run_once>
/home/pi/anaconda3/envs/old/lib/python3.7/asyncio/events.py:88 <_run>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/kernelbase.py:510 <dispatch_queue>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/kernelbase.py:499 <process_one>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/kernelbase.py:406 <dispatch_shell>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/kernelbase.py:730 <execute_request>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/ipkernel.py:387 <do_execute>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/zmqshell.py:528 <run_cell>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/IPython/core/interactiveshell.py:2976 <run_cell>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3030 <_run_cell>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/IPython/core/async_helpers.py:78 <_pseudo_sync_runner>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3258 <run_cell_async>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3473 <run_ast_nodes>
/home/pi/anaconda3/envs/old/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3553 <run_code>
/tmp/ipykernel_1966205/250786907.py:62 <<module>>
/tmp/ipykernel_1966205/250786907.py:36 <train>
[Reason]: [f 0909 17:08:33.622486 36 helper_cuda.h:128] CUDA error at /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/jittor/src/mem/allocator/cuda_device_allocator.cc:32 code=2( cudaErrorMemoryAllocation ) cudaMallocManaged(&ptr, size)
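For scale, a back-of-the-envelope estimate (my own arithmetic, not taken from the log): the fused operator's output is float32[123904, 123904, 2], i.e. about 123904 × 123904 × 2 × 4 bytes ≈ 114 GiB, far beyond both the 7.9 GB of device RAM and the 62.8 GB of host RAM that unified memory falls back to, so cudaMallocManaged cannot succeed. 123904 also happens to be 352 × 352, which looks like a flattened feature map being broadcast against itself. A quick check:

# Rough size of the tensor the fused op tries to allocate (float32 = 4 bytes).
shape = (123904, 123904, 2)   # output shape reported in the error message
nbytes = 4
for d in shape:
    nbytes *= d
print(nbytes / 2**30, "GiB")  # ~114 GiB, far more than the 7.9 GB GPU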
- Error code
import os
import jittor as jt
from jittor import nn
# exp_args, PortraitNet, MyDataset, loss_Focalloss, loss_KL, writer and calcIOU
# are defined in earlier notebook cells.

def train():
    exp_args.istrain = True
    mymodel = PortraitNet()
    epochs = 2000
    # data loader
    train_loader = MyDataset(exp_args).set_attrs(batch_size=exp_args.batch_size,
                                                 shuffle=exp_args.istrain)
    # optimizer
    optimizer = nn.SGD(mymodel.parameters(),
                       exp_args.learning_rate,
                       exp_args.momentum,
                       exp_args.weight_decay)
    max_miou = 0
    steps = 0
    loss_softmax = nn.CrossEntropyLoss()
    for epoch in range(1, int(epochs)):
        loss_sum = 0
        mymodel.train()
        for batch_id, (input_ori, input, edge, mask) in enumerate(train_loader):
            input_ori = jt.array(input_ori)
            input = jt.array(input)
            edge = jt.array(edge)
            mask = jt.array(mask)
            output_mask, output_edge = mymodel(input)
            loss_mask = loss_softmax(output_mask, mask)
            loss_edge = loss_Focalloss(output_edge, edge) * exp_args.edgeRatio
            output_mask_ori, output_edge_ori = mymodel(input_ori)
            loss_mask_ori = loss_softmax(output_mask_ori, mask)
            loss_edge_ori = loss_Focalloss(output_edge_ori, edge) * exp_args.edgeRatio
            loss_stability_mask = loss_KL(output_mask, jt.array(output_mask_ori),
                                          exp_args.temperature) * exp_args.alpha
            # total loss
            loss = loss_mask + loss_edge + loss_mask_ori + loss_stability_mask
            # loss_sum += loss.numpy()
            # update parameters
            optimizer.step(loss)
            steps += 1
        print("Epoch {}, loss:".format(epoch))
        # print("Epoch {}, loss: {}".format(epoch, loss_sum / (batch_id + 1)))
        miou = test(epoch)
        if max_miou < miou:
            max_miou = miou
            # save the best model weights
            if not os.path.exists("./save_model"):
                os.mkdir("./save_model")
            if not os.path.exists(os.path.join("./save_model", str(epoch))):
                os.mkdir(os.path.join("./save_model", str(epoch)))
            mymodel.save(os.path.join("./save_model", str(epoch), str(epoch) + ".pkl"))
        if steps % 10 == 0:
            writer.add_scalar(tag="train/loss", step=steps, value=float(loss.numpy()))

train()
def test(epoch):
    exp_args.istrain = False
    val_loader = MyDataset(exp_args).set_attrs(batch_size=1, shuffle=False)
    iou = 0
    loss_softmax = jt.nn.CrossEntropyLoss()
    mymodel.eval()  # relies on mymodel being defined globally (in another cell)
    for batch_id, (input_ori, input, edge, mask) in enumerate(val_loader):
        input_ori = jt.array(input_ori)  # TODO: this should be the ROI input, presumably?
        input = jt.array(input)
        edge = jt.array(edge)
        mask = jt.array(mask)
        output_mask, output_edge = mymodel(input)
        loss_mask = loss_softmax(output_mask, mask)
        loss_edge = loss_Focalloss(output_edge, edge) * exp_args.edgeRatio
        output_mask_ori, output_edge_ori = mymodel(input_ori)
        loss_mask_ori = loss_softmax(output_mask_ori, mask)
        loss_stability_mask = loss_KL(output_mask, jt.array(output_mask_ori),
                                      exp_args.temperature) * exp_args.alpha
        loss = loss_mask + loss_edge + loss_mask_ori + loss_stability_mask
        prob = jt.nn.softmax(output_mask, dim=1)[0, 1, :, :]
        # print("prob========", type(prob))
        pred = prob.numpy()   # <-- the RuntimeError above is raised here
        pred[pred > 0.5] = 1
        pred[pred <= 0.5] = 0
        iou += calcIOU(pred, mask[0].numpy())
    # compute mIoU
    miou = iou / len(val_loader)
    with open("./log_eg1800/eg1800_log.txt", "a+") as myfile:  # append to the log file
        myfile.write(str(epoch) + " " + str(miou) + "\n")
    print("miou =", miou)
    return miou
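A side note on the evaluation loop above (my suggestion, not part of the original code): it builds the full graph, including three loss terms that are never used, with gradients enabled. Running validation under jt.no_grad() and dropping the unused losses should reduce GPU memory pressure considerably. A minimal sketch, assuming the same mymodel, MyDataset and calcIOU objects as above:

import jittor as jt

def evaluate(epoch, model, exp_args):
    # Evaluation-only loop: no losses, no gradient bookkeeping.
    exp_args.istrain = False
    val_loader = MyDataset(exp_args).set_attrs(batch_size=1, shuffle=False)
    model.eval()
    iou = 0.0
    with jt.no_grad():  # gradients are not needed at test time
        for batch_id, (input_ori, input, edge, mask) in enumerate(val_loader):
            output_mask, output_edge = model(jt.array(input))
            prob = jt.nn.softmax(output_mask, dim=1)[0, 1, :, :]
            pred = (prob.numpy() > 0.5).astype("float32")
            iou += calcIOU(pred, jt.array(mask)[0].numpy())
    return iou / len(val_loader)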
- Cause
I don't understand why calling var.numpy() triggers this error. It happens in many places and I can't figure it out.
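A note on why the error surfaces at .numpy() (my reading of the log, not a confirmed diagnosis): Jittor executes operators lazily, so an exception inside an op is only raised when something, such as var.numpy(), forces the pending graph to run. The traceback therefore points at prob.numpy(), but the failing allocation belongs to an earlier fused operator, here the one with output float32[123904, 123904, 2], which looks like an unintended broadcast between two flattened [123904, 2] tensors inside one of the custom loss functions (loss_KL would be my first suspect). To make the error surface at the line that actually creates the bad op, lazy execution can be disabled while debugging, assuming your Jittor version exposes this flag:

import jittor as jt

# Debugging aids (slow; use only while hunting the bad op):
jt.flags.lazy_execution = 0   # run ops eagerly so errors point at the creating line
# ...or force execution at a chosen point instead:
jt.sync_all()
# Also worth checking: the shapes fed into loss_KL, e.g.
# print(output_mask.shape, output_mask_ori.shape)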