Comments (9)
Hi,
I am not able to reproduce it. Would you mind adding the following lines at line 33 of superpoint_graph-release/learning/ecc/cuda_kernels.py
:
ksrc.encode('utf-8')
(kname+dtype+'.cu').encode('utf-8')
to see which call causes the crash?
from superpoint_graph.
@loicland I have added the lines you mentioned above, like this:
def get_kernel_func(kname, ksrc, dtype):
if kname+dtype not in modules:
ksrc = ksrc.replace('DTYPE', dtype)
prog = Program(ksrc.encode('utf-8'), (kname+dtype+'.cu').encode('utf-8'))
ksrc.encode('utf-8')
(kname + dtype + '.cu').encode('utf-8')
ptx = prog.compile()
log = prog._interface.nvrtcGetProgramLog(prog._program)
if len(log.strip()) > 0: print(log)
module = cupy.cuda.function.Module()
module.load(bytes(ptx.encode()))
modules[kname+dtype] = module
else:
module = modules[kname+dtype]
but after running it, problem occurred again, like the following:
Will save to results/sema3d/trainval_best
Total number of parameters: 213772
Module(
(ecc): GraphNetwork(
(0): RNNGraphConvModule(
(_cell): GRUCellEx(32, 32)(ingate layernorm)
(_fnet): Sequential(
(0): Linear(in_features=13, out_features=32, bias=True)
(1): ReLU(inplace)
(2): Linear(in_features=32, out_features=128, bias=True)
(3): ReLU(inplace)
(4): Linear(in_features=128, out_features=64, bias=True)
(5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True)
(6): ReLU(inplace)
(7): Linear(in_features=64, out_features=32, bias=False)
)
)
(1): Linear(in_features=352, out_features=8, bias=True)
)
(ptn): PointNet(
(stn): STNkD(
(convs): Sequential(
(0): Conv1d(11, 64, kernel_size=(1,), stride=(1,))
(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True)
(2): ReLU(inplace)
(3): Conv1d(64, 64, kernel_size=(1,), stride=(1,))
(4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True)
(5): ReLU(inplace)
(6): Conv1d(64, 128, kernel_size=(1,), stride=(1,))
(7): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True)
(8): ReLU(inplace)
)
(fcs): Sequential(
(0): Linear(in_features=128, out_features=128, bias=True)
(1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True)
(2): ReLU(inplace)
(3): Linear(in_features=128, out_features=64, bias=True)
(4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True)
(5): ReLU(inplace)
)
(proj): Linear(in_features=64, out_features=4, bias=True)
)
(convs): Sequential(
(0): Conv1d(11, 64, kernel_size=(1,), stride=(1,))
(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True)
(2): ReLU(inplace)
(3): Conv1d(64, 64, kernel_size=(1,), stride=(1,))
(4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True)
(5): ReLU(inplace)
(6): Conv1d(64, 128, kernel_size=(1,), stride=(1,))
(7): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True)
(8): ReLU(inplace)
(9): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
(10): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True)
(11): ReLU(inplace)
(12): Conv1d(128, 256, kernel_size=(1,), stride=(1,))
(13): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True)
(14): ReLU(inplace)
)
(fcs): Sequential(
(0): Linear(in_features=257, out_features=256, bias=True)
(1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True)
(2): ReLU(inplace)
(3): Linear(in_features=256, out_features=64, bias=True)
(4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True)
(5): ReLU(inplace)
(6): Linear(in_features=64, out_features=32, bias=True)
)
)
)
Epoch 0/500 (results/sema3d/trainval_best):
0%| | 0/7 [00:00<?, ?it/s]Traceback (most recent call last):
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/main.py", line 388, in <module>
main()
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/main.py", line 287, in main
acc, loss, oacc, avg_iou = train()
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/main.py", line 185, in train
outputs = model.ecc(embeddings)
File "/home/xuyan/tensorflow_xuyan/virtual_3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/graphnet.py", line 97, in forward
input = module(input)
File "/home/xuyan/tensorflow_xuyan/virtual_3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/modules.py", line 54, in forward
input = ecc.GraphConvFunction(nc, nc, idxn, idxe, degs, degs_gpu, self._edge_mem_limit)(hx, weights)
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/ecc/GraphConvModule.py", line 67, in forward
cuda_kernels.conv_aggregate_fw(output.narrow(0,startd,numd), products.view(-1,self._out_channels), self._degs_gpu.narrow(0,startd,numd))
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/ecc/cuda_kernels.py", line 122, in conv_aggregate_fw
function, stream = get_kernel_func('conv_aggregate_fw_kernel_v2', conv_aggregate_fw_kernel_v2(), get_dtype(src))
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/ecc/cuda_kernels.py", line 33, in get_kernel_func
prog = Program(ksrc.encode('utf-8'), (kname+dtype+'.cu').encode('utf-8'))
File "/home/xuyan/tensorflow_xuyan/virtual_3.5/lib/python3.5/site-packages/pynvrtc/compiler.py", line 52, in __init__
include_names)
File "/home/xuyan/tensorflow_xuyan/virtual_3.5/lib/python3.5/site-packages/pynvrtc/interface.py", line 200, in nvrtcCreateProgram
c_char_p(encode_str(src)), c_char_p(encode_str(name)),
File "/home/xuyan/tensorflow_xuyan/virtual_3.5/lib/python3.5/site-packages/pynvrtc/interface.py", line 54, in encode_str
return s.encode("utf-8")
AttributeError: 'bytes' object has no attribute 'encode'
Exception ignored in: <bound method Program.__del__ of <pynvrtc.compiler.Program object at 0x7f53a0e84978>>
Traceback (most recent call last):
File "/home/xuyan/tensorflow_xuyan/virtual_3.5/lib/python3.5/site-packages/pynvrtc/compiler.py", line 56, in __del__
self._interface.nvrtcDestroyProgram(self._program)
AttributeError: 'Program' object has no attribute '_program'
Notice that I ran this code in Python 3.5 and Pytorch 0.3.1
from superpoint_graph.
Hi,
I meant to put the lines
ksrc.encode('utf-8')
(kname+dtype+'.cu').encode('utf-8')
above
prog = Program(ksrc.encode('utf-8'), (kname+dtype+'.cu').encode('utf-8'))
to see which one causes the error.
from superpoint_graph.
sorry, the output is :
Epoch 0/500 (results/sema3d/trainval_best):
0%| | 0/7 [00:00<?, ?it/s]Traceback (most recent call last):
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/main.py", line 388, in <module>
main()
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/main.py", line 287, in main
acc, loss, oacc, avg_iou = train()
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/main.py", line 185, in train
outputs = model.ecc(embeddings)
File "/home/xuyan/tensorflow_xuyan/virtual_3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/graphnet.py", line 97, in forward
input = module(input)
File "/home/xuyan/tensorflow_xuyan/virtual_3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/modules.py", line 54, in forward
input = ecc.GraphConvFunction(nc, nc, idxn, idxe, degs, degs_gpu, self._edge_mem_limit)(hx, weights)
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/ecc/GraphConvModule.py", line 67, in forward
cuda_kernels.conv_aggregate_fw(output.narrow(0,startd,numd), products.view(-1,self._out_channels), self._degs_gpu.narrow(0,startd,numd))
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/ecc/cuda_kernels.py", line 122, in conv_aggregate_fw
function, stream = get_kernel_func('conv_aggregate_fw_kernel_v2', conv_aggregate_fw_kernel_v2(), get_dtype(src))
File "/home/xuyan/pytorch_xuyan/superpoint_graph-release/learning/ecc/cuda_kernels.py", line 35, in get_kernel_func
prog = Program(ksrc.encode('utf-8'), (kname+dtype+'.cu').encode('utf-8'))
File "/home/xuyan/tensorflow_xuyan/virtual_3.5/lib/python3.5/site-packages/pynvrtc/compiler.py", line 52, in __init__
include_names)
File "/home/xuyan/tensorflow_xuyan/virtual_3.5/lib/python3.5/site-packages/pynvrtc/interface.py", line 200, in nvrtcCreateProgram
c_char_p(encode_str(src)), c_char_p(encode_str(name)),
File "/home/xuyan/tensorflow_xuyan/virtual_3.5/lib/python3.5/site-packages/pynvrtc/interface.py", line 54, in encode_str
return s.encode("utf-8")
AttributeError: 'bytes' object has no attribute 'encode'
Exception ignored in: <bound method Program.__del__ of <pynvrtc.compiler.Program object at 0x7fb27a822588>>
Traceback (most recent call last):
File "/home/xuyan/tensorflow_xuyan/virtual_3.5/lib/python3.5/site-packages/pynvrtc/compiler.py", line 56, in __del__
self._interface.nvrtcDestroyProgram(self._program)
AttributeError: 'Program' object has no attribute '_program'
from superpoint_graph.
Hi
I have the same problem as @MEIXuYan .
My env: Ubuntu 16.04, Python 3.6, Pytorch 0.3.1, CUDA 9.0. When I run the learning script, the terminal shows the same error info as Xuyan's.
I tried to change the line 54 of interface.py
in
~/anaconda3/envs/spgenv/lib/python3.6/site-packages/pynvrtc
from return s.encode("utf-8")
to return s
, and the learning script is running with no error now. I am not sure about the principle beneath but it is working...
from superpoint_graph.
Hi,
I would discourage directly changing the Python package, as it could have adverse consequences for other projects.
Program expects bytes or integer, and not string. For some reason that I am not able to reproduce, it seems that in your case the names are already encoded. Hence, following your fix, I would suggest replacing line ~~35~~ 33 of /learning/ecc/cuda_kernels.py
by
prog = Program(ksrc, (kname+dtype+'.cu'))
Could you tell me if it fixes the issue?
from superpoint_graph.
Hi Loic,
I think you meant line 33, then it works. Thanks!
from superpoint_graph.
Good. I did not write the kernel, mys007 did. I will refer it to him in case he has more insight into what might be going on.
from superpoint_graph.
This was caused by recently merged NVIDIA/pynvrtc#2 and fixed here today - 44bcbe8
from superpoint_graph.
Related Issues (20)
- cuda acceleration for partition HOT 4
- TypeError in 'NearestNeighbors(1, algorithm='kd_tree').fit(xyz)' HOT 2
- Inconsistent class_maps for s3dis HOT 2
- CUDA error when training for Semantic3D HOT 1
- the version of metrics HOT 2
- When making ply_c, fatal error: numpy/ndarrayobject.h: No such file or directory HOT 3
- How to control the number of superpoints in a room? HOT 6
- Segmentation fault (core dumped) HOT 2
- Running on Stanford3dDataset_v1.2_Aligned_Version, the error occurs. HOT 6
- CMake error HOT 1
- Which version of Pytorch is needed for this code? HOT 1
- ModuleNotFoundError: No module named 'torchnet' HOT 3
- RuntimeError: scan failed to synchronize: an illegal memory access was encountered HOT 2
- L0-cut pursuit partition algorithm HOT 3
- cupy_backends.cuda.api.driver.CUDADriverError: CUDA_ERROR_ILLEGAL_ADDRESS: an illegal memory access was encountered HOT 3
- About the number of superpoints HOT 2
- Overfitting soon after around 30 epochs HOT 13
- How to visualize SSP HOT 1
- ValueError: need at least one array to concatenate HOT 1
- Pretrained weight link
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from superpoint_graph.