尝试训练 GauGAN 时出现以下问题

不知道是 A 卡的原因还是版本的原因,训练到一半就出错了。
/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/rocm_jittor/src/type/fp16_compute.h:88:15: error: no viable overloaded ‘=’
aa[0] = {0};
~~~~~ ^ ~~~
/opt/dtk-21.10.1/hip/include/hip/hsa_detail/hip_vector_types.hpp:653:26: note: candidate function not viable: cannot convert initializer list argument to ‘const HIP_vector_type<int, 4>’
HIP_vector_type& operator=(const HIP_vector_type&) = default;
^
/opt/dtk-21.10.1/hip/include/hip/hsa_detail/hip_vector_types.hpp:655:26: note: candidate function not viable: cannot convert initializer list argument to ‘HIP_vector_type<int, 4>’
HIP_vector_type& operator=(HIP_vector_type&&) = default;
^
In file included from /public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/jit/cublas_matmul__T_float32__Trans_a_N__Trans_b_T__op_S__JIT_1__JIT_cuda_1__index_t_int32_hash_e5c685b2023aaaa5_op.cc:20:
/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/rocm_jittor/src/type/fp16_compute.h:93:15: error: no viable overloaded ‘=’
aa[0] = {0};
~~~~~ ^ ~~~
/opt/dtk-21.10.1/hip/include/hip/hsa_detail/hip_vector_types.hpp:653:26: note: candidate function not viable: cannot convert initializer list argument to ‘const HIP_vector_type<int, 2>’
HIP_vector_type& operator=(const HIP_vector_type&) = default;
^
/opt/dtk-21.10.1/hip/include/hip/hsa_detail/hip_vector_types.hpp:655:26: note: candidate function not viable: cannot convert initializer list argument to ‘HIP_vector_type<int, 2>’
HIP_vector_type& operator=(HIP_vector_type&&) = default;
^
2 errors generated when compiling for gfx900.
In file included from /public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/jit/cudnn_conv__Tx_float32__Ty_float32__Tw_float32__XFORMAT_abcd__WFORMAT_oihw__YFORMAT_abcd_____hash_4d5b3e2d24c769d3_op.cc:19:
/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/rocm_jittor/src/type/fp16_compute.h:88:15: error: no viable overloaded ‘=’
aa[0] = {0};
~~~~~ ^ ~~~
/opt/dtk-21.10.1/hip/include/hip/hsa_detail/hip_vector_types.hpp:653:26: note: candidate function not viable: cannot convert initializer list argument to ‘const HIP_vector_type<int, 4>’
HIP_vector_type& operator=(const HIP_vector_type&) = default;
^
/opt/dtk-21.10.1/hip/include/hip/hsa_detail/hip_vector_types.hpp:655:26: note: candidate function not viable: cannot convert initializer list argument to ‘HIP_vector_type<int, 4>’
HIP_vector_type& operator=(HIP_vector_type&&) = default;
^
In file included from /public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/jit/cudnn_conv__Tx_float32__Ty_float32__Tw_float32__XFORMAT_abcd__WFORMAT_oihw__YFORMAT_abcd_____hash_4d5b3e2d24c769d3_op.cc:19:
/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/rocm_jittor/src/type/fp16_compute.h:93:15: error: no viable overloaded ‘=’
aa[0] = {0};
~~~~~ ^ ~~~
/opt/dtk-21.10.1/hip/include/hip/hsa_detail/hip_vector_types.hpp:653:26: note: candidate function not viable: cannot convert initializer list argument to ‘const HIP_vector_type<int, 2>’
HIP_vector_type& operator=(const HIP_vector_type&) = default;
^
/opt/dtk-21.10.1/hip/include/hip/hsa_detail/hip_vector_types.hpp:655:26: note: candidate function not viable: cannot convert initializer list argument to ‘HIP_vector_type<int, 2>’
HIP_vector_type& operator=(HIP_vector_type&&) = default;
^
/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/jit/cudnn_conv__Tx_float32__Ty_float32__Tw_float32__XFORMAT_abcd__WFORMAT_oihw__YFORMAT_abcd_____hash_4d5b3e2d24c769d3_op.cc:145:30: warning: unused variable ‘algos’ [-Wunused-variable]
miopenConvFwdAlgorithm_t algos[] = {
^
1 warning and 2 errors generated when compiling for gfx900.
[w 1128 20:42:45.841089 72 parallel_compiler.cc:103] Compile thread timeout, ignored.
Traceback (most recent call last):
File “spade_train.py”, line 52, in
trainer.run_generator_one_step(data_i)
File “/public/home/acy5h6d4oe/JGAN/models/gaugan/pix2pix_trainer.py”, line 31, in run_generator_one_step
self.optimizer_G.backward(g_loss)
File “/public/home/acy5h6d4oe/.local/lib/python3.8/site-packages/jittor/optim.py”, line 144, in backward
jt.sync(params_has_grad)
RuntimeError: Wrong inputs arguments, Please refer to examples(help(jt.sync)).

Types of your inputs are:
self = module,
args = (list, ),

The function declarations are:
void sync(const vector<VarHolder*>& vh=vector<VarHolder*>(), bool device_sync=false, bool weak_sync=true)

Failed reason:[f 1128 20:42:50.024501 72 parallel_compiler.cc:331] Error happend during compilation:
[Error] source file location:/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/jit/__opkey0_broadcast_to__Tx_float32__DIM_7__BCAST_19__opkey1_reindex__Tx_float32__XDIM_4__YD___hash_7ec7c1a1c2a4a646_op.cc
Compile fused operator(9/126)failed:[Op(10713:0:1:1:i1:o1:s0,broadcast_to->10714),Op(10711:0:1:1:i1:o1:s0,reindex->10712),Op(10715:0:1:1:i2:o1:s0,binary.multiply->10716),Op(10717:0:1:1:i1:o1:s0,reduce.add->10718),]

Reason: [f 1128 20:42:21.637173 08:C7 log.cc:608] Check failed ret(256) == 0(0) Run cmd failed: “/opt/dtk-21.10.1/bin/hipcc” “/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/jit/cudnn_conv__Tx_float32__Ty_float32__Tw_float32__XFORMAT_abcd__WFORMAT_oihw__YFORMAT_abcd_____hash_4d5b3e2d24c769d3_op.cc” -Wall -Wno-unknown-pragmas -std=c++17 -fPIC -march=native -fdiagnostics-color=always -lstdc++ -ldl -shared -I"/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/rocm_jittor/src" -I/opt/conda/include/python3.8 -I/opt/conda/include/python3.8 -I"/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/rocm_jittor/extern/cuda/inc" -DHAS_CUDA -DIS_ROCM -I/public/home/acy5h6d4oe/.local/lib/python3.8/site-packages/jittor/extern/rocm -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -I"/opt/dtk-21.10.1/hip/include" -I"/opt/dtk-21.10.1/llvm/bin/…/lib/clang/13.0.0" -I/opt/dtk-21.10.1/hsa/include -L"/opt/dtk-21.10.1/lib" -Xlinker -rpath=“/opt/dtk-21.10.1/lib” -lamdhip64 -I"/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/rocm_jittor/extern/cuda/cudnn/inc" -I"/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/rocm_jittor/extern/cuda/cudnn/ops" -I"/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/rocm_jittor/extern/cuda/inc" -I"/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/rocm_jittor/extern/cuda/cudnn/inc" -I"/opt/dtk-21.10.1/miopen/include" -L"/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/cuda" -Xlinker -rpath=“/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/cuda” 
-l:libcuda_extern.so -L/opt/dtk-21.10.1/miopen/lib -Xlinker -rpath=/opt/dtk-21.10.1/miopen/lib -lMIOpen -L"/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/custom_ops" -Xlinker -rpath=“/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/custom_ops” -l:“gen_ops_cudnn_conv_cudnn_conv_backward_x_cudnn_con___hashbf2e36.cpython-38-x86_64-linux-gnu”.so -o “/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/jit/cudnn_conv__Tx_float32__Ty_float32__Tw_float32__XFORMAT_abcd__WFORMAT_oihw__YFORMAT_abcd_____hash_4d5b3e2d24c769d3_op.so”

您的网络中使用了半精度训练吗?看上去编译出错的原因和数据类型有关,请您提供下面的文件,以便确认原因:
/public/home/acy5h6d4oe/.cache/jittor/jt1.3.5/g++7.3.1/py3.8.3/Linux-3.10.0-9xcc/HYGONC86718532x39/default/jit/__opkey0_broadcast_to__Tx_float32__DIM_7__BCAST_19__opkey1_reindex__Tx_float32__XDIM_4__YD___hash_7ec7c1a1c2a4a646_op.cc

#include <hip/hip_runtime.h>
#define CUDART_VERSION 10000

#define JIT 1
#include <assert.h>
#include "fused_op.h"
#include "misc/cuda_atomic.h"
#include "misc/cuda_limits.h"
#include "helper_cuda.h"
using namespace jittor;
#define INLINE_FUNC inline static void
#pragma GCC diagnostic ignored "-Wunused-function"
inline static int get_thread_range_log(int& thread_num, int64 range) {
int nbits = NanoVector::get_nbits(std::min((int64)thread_num, range)) - 2;
thread_num >>= nbits;
return nbits;
}
void jittor::FusedOp::jit_run() {
Op* rop_0_0 = context->vrm.relay_groups[0].oprcs[0].op;
GET_VAR_MEMBER(rop_0_0, 104) = vars[2].var;
GET_VAR_MEMBER(rop_0_0, 112) = vars[0].var;
GET_VAR_MEMBER(rop_0_0, 120) = vars[5].var;
rop_0_0->do_run();
}

目前已确认最新版本的 Jittor 会出现这个问题,您可以暂时退回到之前版本的 Jittor,我们将在之后的 Jittor 版本中修复这个错误。

感谢您的反馈!

已在 1.3.6.4 版本中修复这个错误。