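When a single process creates too many UCP contexts (with multiple interfaces), UCP fails to initialize. Currently this ends in a segmentation fault; it should fail gracefully instead. In the log below, ucc_context_create returns -6, and the crash then occurs in ucc_team_create_post().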
Steps to reproduce:
UCC_TL_NCCL_TUNE=0 ./test/gtest/gtest
Value of: ucc_context_create(lib_h, &ctx_params, ctx_config, &ctx_h)
Actual: -6
Expected: UCC_OK
Which is: 0
[1620760142.526132] [jazz23:193943:0] sock.c:139 UCX ERROR socket create failed: Too many open files
[1620760142.526143] [jazz23:193943:0] sock.c:139 UCX ERROR socket create failed: Too many open files
[1620760142.526148] [jazz23:193943:0] sock.c:139 UCX ERROR socket create failed: Too many open files
[1620760142.526962] [jazz23:193943:0] tl_ucp_context.c:89 TL_UCP ERROR failed to create ucp worker, Input/output error
[1620760142.529811] [jazz23:193943:0] ucc_context.c:293 UCC WARN failed to create tl context for ucp
[1620760142.529821] [jazz23:193943:0] cl_basic_context.c:23 CL_BASIC WARN TL UCP context is not available, CL BASIC can't proceed
[1620760142.529824] [jazz23:193943:0] ucc_context.c:362 UCC WARN failed to create cl context for basic, skipping
[1620760142.529827] [jazz23:193943:0] ucc_context.c:370 UCC ERROR no CL context created in ucc_context_create
../../../test/gtest/common/test_ucc.cc:22: Failure
Value of: ucc_context_create(lib_h, &ctx_params, ctx_config, &ctx_h)
Actual: -6
Expected: UCC_OK
Which is: 0
[jazz23:193943:0:193943] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x88)
/labhome/valentinp/workspace/ucc/build_rel/src/../../src/core/ucc_team.c: [ ucc_team_create_post_single() ]
...
37 {
38 ucc_status_t status;
39 if ((team->params.mask & UCC_TEAM_PARAM_FIELD_EP) &&
==> 40 (team->params.mask & UCC_TEAM_PARAM_FIELD_EP_RANGE) &&
41 (team->params.ep_range == UCC_COLLECTIVE_EP_RANGE_CONTIG)) {
42 team->rank =
43 team->params.ep; //TODO need to make sure we don't exceed rank size
==== backtrace (tid: 193943) ====
0 0x00000000000080c5 ucc_team_create_post_single() /labhome/valentinp/workspace/ucc/build_rel/src/../../src/core/ucc_team.c:40
1 0x00000000000080c5 ucc_team_create_post() /labhome/valentinp/workspace/ucc/build_rel/src/../../src/core/ucc_team.c:161
2 0x000000000047d7bb UccTeam::init_team() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/test_ucc.cc:160
3 0x000000000047d7bb UccTeam::init_team() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/test_ucc.cc:160
4 0x000000000047ef41 UccTeam::UccTeam() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/test_ucc.cc:221
5 0x000000000047f21d construct<UccTeam, std::vector<std::shared_ptr<UccProcess>, std::allocator<std::shared_ptr<UccProcess> > >&>() /usr/include/c++/4.8.2/ext/new_allocator.h:120
6 0x000000000047f21d __shared_ptr<std::allocator<UccTeam>, std::vector<std::shared_ptr<UccProcess>, std::allocator<std::shared_ptr<UccProcess> > >&>() /usr/include/c++/4.8.2/bits/shared_ptr_base.h:961
7 0x000000000047f21d shared_ptr<std::allocator<UccTeam>, std::vector<std::shared_ptr<UccProcess>, std::allocator<std::shared_ptr<UccProcess> > >&>() /usr/include/c++/4.8.2/bits/shared_ptr.h:316
8 0x000000000047f21d allocate_shared<UccTeam, std::allocator<UccTeam>, std::vector<std::shared_ptr<UccProcess>, std::allocator<std::shared_ptr<UccProcess> > >&>() /usr/include/c++/4.8.2/bits/shared_ptr.h:598
9 0x000000000047f21d make_shared<UccTeam, std::vector<std::shared_ptr<UccProcess>, std::allocator<std::shared_ptr<UccProcess> > >&>() /usr/include/c++/4.8.2/bits/shared_ptr.h:614
10 0x000000000047f21d UccJob::create_team() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/test_ucc.cc:303
11 0x00000000004b3651 test_team_team_create_multiple_preconnect_Test::test_body() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/core/test_team.cc:53
12 0x00000000004b3651 std::vector<std::shared_ptr<UccTeam>, std::allocator<std::shared_ptr<UccTeam> > >::push_back() /usr/include/c++/4.8.2/bits/stl_vector.h:920
13 0x00000000004b3651 test_team_team_create_multiple_preconnect_Test::test_body() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/core/test_team.cc:53
14 0x000000000047aa96 ucc::test_base::run() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/test.cc:89
15 0x0000000000476523 HandleSehExceptionsInMethodIfSupported<testing::Test, void>() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/gtest-all.cc:3562
16 0x000000000046990d testing::Test::Run() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/gtest-all.cc:3635
17 0x00000000004699dc testing::TestInfo::Run() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/gtest-all.cc:3812
18 0x0000000000469b3f testing::TestCase::Run() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/gtest-all.cc:3930
19 0x000000000046df47 testing::internal::UnitTestImpl::RunAllTests() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/gtest-all.cc:5808
20 0x000000000046e24b testing::internal::UnitTestImpl::RunAllTests() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/gtest-all.cc:5725
21 0x0000000000453f89 RUN_ALL_TESTS() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/gtest.h:20059
22 0x0000000000453f89 main() /labhome/valentinp/workspace/ucc/build_rel/test/gtest/../../../test/gtest/common/main.cc:43
23 0x00000000000223d5 __libc_start_main() ???:0
24 0x0000000000455e60 _start() ???:0
=================================