admin 管理员组文章数量: 1184232
nvidia t4 shoc 测试问题
在 NVIDIA T4 上跑 SHOC benchmark 测试时,遇到了如下问题。
注意:跑 NeuralNet 之前,需要先把 nn_data 解压到当前工作目录(pwd)下;但即使这样做了,程序依然报错。
用 strace 跟踪程序的执行:
strace -f -o gm.truss /home/shoc-master/bin/Serial/CUDA/NeuralNet -s 4 -d 1
25353 execve("/home/shoc-master/bin/Serial/CUDA/NeuralNet", ["/home/shoc-master/bin/Serial/CUD"..., "-s", "4", "-d", "1"], [/* 52 vars */]) = 0
25353 brk(NULL) = 0x20e3000
25353 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9cd6045000
25353 access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
25353 open("/usr/local/cuda-10.1/lib64/tls/x86_64/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/local/cuda-10.1/lib64/tls/x86_64", 0x7ffdae94cfc0) = -1 ENOENT (No such file or directory)
25353 open("/usr/local/cuda-10.1/lib64/tls/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/local/cuda-10.1/lib64/tls", 0x7ffdae94cfc0) = -1 ENOENT (No such file or directory)
25353 open("/usr/local/cuda-10.1/lib64/x86_64/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/local/cuda-10.1/lib64/x86_64", 0x7ffdae94cfc0) = -1 ENOENT (No such file or directory)
25353 open("/usr/local/cuda-10.1/lib64/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/local/cuda-10.1/lib64", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
25353 open("/usr/lib64/mpich-3.2/lib/tls/x86_64/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/lib64/mpich-3.2/lib/tls/x86_64", 0x7ffdae94cfc0) = -1 ENOENT (No such file or directory)
25353 open("/usr/lib64/mpich-3.2/lib/tls/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/lib64/mpich-3.2/lib/tls", 0x7ffdae94cfc0) = -1 ENOENT (No such file or directory)
25353 open("/usr/lib64/mpich-3.2/lib/x86_64/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/lib64/mpich-3.2/lib/x86_64", 0x7ffdae94cfc0) = -1 ENOENT (No such file or directory)
25353 open("/usr/lib64/mpich-3.2/lib/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/lib64/mpich-3.2/lib", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
发现一堆 ENOENT 错误!(事后来看,这些只是动态链接器沿着库搜索路径逐个目录查找 librt.so.1 时的正常失败,并不是真正的问题。)
当时怀疑是不是 CUDA 没有安装好,于是检查 CUDA 的安装日志:
cat /var/log/cuda_install.log
发现如下问题
[WARNING]: Missing recommended library: libGLU.so
[WARNING]: Missing recommended library: libXi.so
[WARNING]: Missing recommended library: libXmu.so
解决这些 warning,安装缺失的库:
yum install mesa-libGLU-devel mesa-libGL-devel
yum install libXmu*
yum install libXi*
再重新安装cuda
——————————————————————————————————————————
这条分割线告诉读者:上面这些都不是问题的根本原因,根本原因在于……SHOC 的源代码。
相信写代码的人都看出来了:activations[0] 没有被初始化,free 的时候程序就崩掉了。再回头看看之前 strace 的输出:
25353 writev(19, [{"*** Error in `", 14}, {"/home/shoc-master/bin/Serial/CUD"..., 43}, {"': ", 3}, {"free(): invalid pointer", 23}, {": 0x", 4}, {"00007f9cd123c818", 16}, {" ***\n", 5}], 7) = 108
25353 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9cd5e56000
25353 mmap(0x7f9cac000000, 67108864, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x7f9c9e000000
25353 munmap(0x7f9c9e000000, 67108864) = 0
25353 mmap(NULL, 134217728, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x7f9c9a000000
25353 munmap(0x7f9c9a000000, 33554432) = 0
25353 munmap(0x7f9ca0000000, 33554432) = 0
25353 mprotect(0x7f9c9c000000, 135168, PROT_READ|PROT_WRITE) = 0
25353 futex(0x7f9cd1240cc0, FUTEX_WAKE_PRIVATE, 2147483647) = 0
25353 futex(0x7f9cd1458190, FUTEX_WAKE_PRIVATE, 2147483647) = 0
25353 write(19, "======= Backtrace: =========\n", 29) = 29
25353 writev(19, [{"/lib64/libc.so.6", 16}, {"(", 1}, {"+0x", 3}, {"81489", 5}, {")", 1}, {"[0x", 3}, {"7f9cd0ef7489", 12}, {"]\n", 2}], 8) = 43
25353 writev(19, [{"/home/shoc-master/bin/Serial/CUD"..., 43}, {"[0x", 3}, {"40b40c", 6}, {"]\n", 2}], 4) = 54
25353 writev(19, [{"/home/shoc-master/bin/Serial/CUD"..., 43}, {"[0x", 3}, {"405cfb", 6}, {"]\n", 2}], 4) = 54
25353 writev(19, [{"/lib64/libc.so.6", 16}, {"(", 1}, {"__libc_start_main", 17}, {"+0x", 3}, {"f5", 2}, {")", 1}, {"[0x", 3}, {"7f9cd0e983d5", 12}, {"]\n", 2}], 9) = 57
25353 writev(19, [{"/home/shoc-master/bin/Serial/CUD"..., 43}, {"[0x", 3}, {"40651f", 6}, {"]\n", 2}], 4) = 54
25353 write(19, "======= Memory map: ========\n", 29) = 29
25353 open("/proc/self/maps", O_RDONLY) = 20
25353 read(20, "00400000-004a7000 r-xp 00000000 "..., 1024) = 1024
25353 write(19, "00400000-004a7000 r-xp 00000000 "..., 1024) = 1024
25353 read(20, "000 00:05 38726 "..., 1024) = 1024
25353 write(19, "000 00:05 38726 "..., 1024) = 1024
25353 read(20, "4 352848 /de"..., 1024) = 1024
25353 write(19, "4 352848 /de"..., 1024) = 1024
25353 read(20, "7f9ccdc54000-7f9ccde54000 ---p 0"..., 1024) = 1024
25353 write(19, "7f9ccdc54000-7f9ccde54000 ---p 0"..., 1024) = 1024
25353 read(20, "0000 00:00 0 \n7f9cd0e76000-7f9cd"..., 1024) = 1024
25353 write(19, "0000 00:00 0 \n7f9cd0e76000-7f9cd"..., 1024) = 1024
25353 read(20, "0 ---p 00101000 fd:00 33633924 "..., 1024) = 1024
25353 write(19, "0 ---p 00101000 fd:00 33633924 "..., 1024) = 1024
25353 read(20, "r/lib64/libcublas.so.10.2.1.243\n"..., 1024) = 1024
25353 write(19, "r/lib64/libcublas.so.10.2.1.243\n"..., 1024) = 1024
25353 read(20, "2.17.so\n7f9cd5c24000-7f9cd5e2300"..., 1024) = 1024
25353 write(19, "2.17.so\n7f9cd5c24000-7f9cd5e2300"..., 1024) = 1024
25353 read(20, "f9cd5fed000-7f9cd5ffd000 -w-s 00"..., 1024) = 1024
25353 write(19, "f9cd5fed000-7f9cd5ffd000 -w-s 00"..., 1024) = 1024
25353 read(20, "f9cd602f000 rw-s 00000000 00:05 "..., 1024) = 1024
25353 write(19, "f9cd602f000 rw-s 00000000 00:05 "..., 1024) = 1024
25353 read(20, "960000-7ffdae962000 r-xp 0000000"..., 1024) = 158
25353 write(19, "960000-7ffdae962000 r-xp 0000000"..., 158) = 158
25353 read(20, "", 1024) = 0
25353 close(20) = 0
25353 rt_sigprocmask(SIG_UNBLOCK, [ABRT], NULL, 8) = 0
25353 tgkill(25353, 25353, SIGABRT) = 0
25353 --- SIGABRT {si_signo=SIGABRT, si_code=SI_TKILL, si_pid=25353, si_uid=0} ---
25401 +++ killed by SIGABRT (core dumped) +++
25400 +++ killed by SIGABRT (core dumped) +++
25353 +++ killed by SIGABRT (core dumped) +++
其实 strace 已经明确告诉了我们问题所在(free(): invalid pointer),只是好久没有搞 C++ 了,一开始没有注意到。
本文标签: nvidia t4 shoc 测试问题
版权声明:本文标题:nvidia t4 shoc 测试问题 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:https://www.roclinux.cn/b/1686652431a20634.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论