NLP/DuReader-Robust-BASELINE的训练程序,单卡时正常运行,多卡时则会报错,具体信息如下:
C++ Call Stacks (More useful to developers):
0 std::string paddle::platform::GetTraceBackString<std::string const&>(std::string const&, char const*, int)
1 paddle::platform::EnforceNotMet::EnforceNotMet(std::string const&, char const*, int)
2 paddle::framework::Tensor::check_memory_size() const
3 long const* paddle::framework::Tensor::data<long>() const
4 paddle::operators::LookupTableV2CUDAKernel::Compute(paddle::framework::ExecutionContext const&) const
5 std::_Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CUDAPlace, false, 0ul, paddle::operators::LookupTableV2CUDAKernel<float>, paddle::operators::LookupTableV2CUDAKernel<double>, paddle::operators::LookupTableV2CUDAKernel<paddle::platform::float16> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}>::_M_invoke(std::_Any_data const&, paddle::framework::ExecutionContext const&)
6 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&, paddle::framework::RuntimeContext*) const
7 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const
8 paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&)
9 paddle::framework::details::ComputationOpHandle::RunImpl()
10 paddle::framework::details::ThreadedSSAGraphExecutor::RunOpSync(paddle::framework::details::OpHandleBase*)
11 std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>, std::__future_base::_Result_base::_Deleter>, void> >::_M_invoke(std::_Any_data const&)
12 std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>&, bool&)
13 ThreadPool::ThreadPool(unsigned long)::{lambda()#1}::operator()() const
Python Call Stacks (More useful to users):
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/paddle/fluid/framework.py", line 2459, in append_op
attrs=kwargs.get("attrs", None))
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/paddle/fluid/layer_helper.py", line 43, in append_op
return self.main_program.current_block().append_op(*args, **kwargs)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/paddle/fluid/input.py", line 268, in embedding
'padding_idx': padding_idx
File "/home/zhan/Research-master/NLP/DuReader-Robust-BASELINE/src/model/ernie.py", line 97, in _build_model
name=self._pos_emb_name, initializer=self._param_initializer))
File "/home/zhan/Research-master/NLP/DuReader-Robust-BASELINE/src/model/ernie.py", line 81, in __init__
self._build_model(src_ids, position_ids, sentence_ids, input_mask)
File "", line 39, in create_model
use_fp16=args.use_fp16)
File "", line 6, in <module>
is_training=True)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3254, in run_ast_nodes
if (await self.run_code(code, result, async_=asy)):
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3063, in run_cell_async
interactivity=interactivity, compiler=compiler, result=result)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
coro.send(None)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2886, in _run_cell
return runner(coro)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2858, in run_cell
raw_cell, store_history, silent, shell_futures)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 300, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
yielded = next(result)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 545, in execute_request
user_expressions, allow_stdin,
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
yielded = next(result)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
yield gen.maybe_future(handler(stream, idents, msg))
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
yielded = next(result)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 365, in process_one
yield gen.maybe_future(dispatch(*args))
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/tornado/gen.py", line 748, in run
yielded = self.gen.send(value)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/tornado/gen.py", line 714, in __init__
self.run()
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/tornado/gen.py", line 225, in wrapper
runner = Runner(result, future, yielded)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 381, in dispatch_queue
yield self.process_one()
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/tornado/gen.py", line 748, in run
yielded = self.gen.send(value)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/tornado/gen.py", line 787, in inner
self.run()
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/tornado/ioloop.py", line 743, in _run_callback
ret = callback()
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/tornado/ioloop.py", line 690, in <lambda>
lambda f: self._run_callback(functools.partial(callback, future))
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/asyncio/events.py", line 88, in _run
self._context.run(self._callback, *self._args)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
handle._run()
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
self._run_once()
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 149, in start
self.asyncio_loop.run_forever()
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 583, in start
self.io_loop.start()
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/traitlets/config/application.py", line 664, in launch_instance
app.start()
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/zhan/anaconda3/envs/paddle/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
Error Message Summary:
PaddleCheckError: holder_ should not be null
Tensor holds no memory. Call Tensor::mutable_data first. at [/paddle/paddle/fluid/framework/tensor.cc:23]
[operator < lookup_table_v2 > error]
运行环境为:
paddlepaddle-gpu 1.6.1.post107
cuda 10.0
cudnn 7.6.4
nccl 2.6.4