I tried to use libvma in my Ceph cluster recently. However, I met same crash in version 6.9.1(installed via RPM) and 7.0.5(installed from src with debug enabled). The core trace is pasted below. Cloud some one help to look at this issue? Thanks very much!
(gdb) bt
#0 0x00007f4c566cf65b in raise () from /lib64/libpthread.so.0
#1 0x00000000009bd1c4 in ?? ()
#2 <signal handler called>
#3 0x00007f4c52c88d71 in ?? () from /usr/lib64/libmlx4-rdmav2.so
#4 0x00007f4c5536612f in ibv_create_cq () from /usr/lib64/libibverbs.so.1
#5 0x00007f4c5762ed08 in cq_mgr::cq_mgr (this=0x3819340, p_ring=0x15358000, p_ib_ctx_handler=0x373a200, cq_size=3000, p_comp_event_channel=0x94639f0, is_rx=false) at dev/cq_mgr.cpp:69
#6 0x00007f4c57633090 in qp_mgr::configure (this=0x37fb000, p_rx_comp_event_channel=0x94639e0) at dev/qp_mgr.cpp:104
#7 0x00007f4c5763cbae in qp_mgr_eth::qp_mgr_eth (this=0x37fb000, p_ring=0x15358000, p_context=0x373a200, port_num=1 '\001', p_rx_comp_event_channel=0x94639e0, tx_num_wr=3000,
vlan=101) at ../../src/vma/dev/qp_mgr.h:162
#8 0x00007f4c576365af in ring_eth::create_qp_mgr (this=0x15358000, key=..., p_rx_comp_event_channel=0x94639e0) at dev/ring.cpp:70
#9 0x00007f4c57637790 in ring::create_resources (this=0x15358000, p_ring_info=0x7f4c1d2ded60, active=0) at dev/ring.cpp:209
#10 0x00007f4c57652b5c in ring_eth::ring_eth (this=0x15358000, local_if=168758798, p_ring_info=0x7f4c1d2ded60, count=1, active=0, vlan=101) at ../../src/vma/dev/ring.h:382
#11 0x00007f4c57651abd in net_device_val_eth::create_ring (this=0x37a85a0) at dev/net_device_val.cpp:500
#12 0x00007f4c57650168 in net_device_val::reserve_ring (this=0x37a85a0, key=0) at dev/net_device_val.cpp:267
#13 0x00007f4c576ac9b5 in dst_entry::resolve_ring (this=0x36fad00) at proto/dst_entry.cpp:242
#14 0x00007f4c576ad896 in dst_entry::prepare_to_send (this=0x36fad00, skip_rules=false) at proto/dst_entry.cpp:482
#15 0x00007f4c576d70b9 in sockinfo_tcp::connect (this=0x6283f00, __to=0x6310310, __tolen=16) at sock/sockinfo_tcp.cpp:1631
#16 0x00007f4c576efb0c in connect (__fd=66, __to=0x6310310, __tolen=16) at sock/sock-redirect.cpp:569
#17 0x0000000000b4f037 in Pipe::connect() ()
#18 0x0000000000b53743 in Pipe::writer() ()
#19 0x0000000000b5f16d in Pipe::Writer::entry() ()
#20 0x00007f4c566c7a51 in start_thread () from /lib64/libpthread.so.0
#21 0x00007f4c556579ad in clone () from /lib64/libc.so.6
(gdb) f 5
#5 0x00007f4c5762ed08 in cq_mgr::cq_mgr (this=0x3819340, p_ring=0x15358000, p_ib_ctx_handler=0x373a200, cq_size=3000, p_comp_event_channel=0x94639f0, is_rx=false) at dev/cq_mgr.cpp:69
69 m_p_ibv_cq = ibv_create_cq(m_p_ib_ctx_handler->get_ibv_context(), cq_size, (void*)this, m_comp_event_channel, 0);
(gdb) p *this
$3 = {m_p_ring = 0x15358000, m_p_ib_ctx_handler = 0x373a200, m_b_is_rx = false, m_b_is_rx_csum_on = false, m_comp_event_channel = 0x94639f0, m_p_ibv_cq = 0x0,
m_b_notification_armed = false, m_b_was_drained = false, m_n_wce_counter = 0, m_qp_rec = {qp = 0x0, debth = 0}, m_p_next_rx_desc_poll = 0x0, m_rx_queue = {m_list = {head = {
next = 0x3819388, prev = 0x3819388}, obj_ptr = 0x0}, m_size = 0}, m_rx_pool = {m_list = {head = {next = 0x38193a8, prev = 0x38193a8}, obj_ptr = 0x0}, m_size = 0},
m_n_out_of_free_bufs_warning = 0, m_p_cq_stat = 0x0, m_cq_stat_static = {n_rx_pkt_drop = 0, n_rx_sw_queue_len = 0, n_rx_drained_at_once_max = 0, n_buffer_pool_len = 0,
buffer_miss_rate = 0}, m_cq_id = 5, m_n_cq_poll_sn = 0, m_transport_type = VMA_TRANSPORT_ETH, m_sz_transport_header = 0, m_buffer_miss_count = 0, m_buffer_total_count = 0,
m_buffer_prev_id = 0, static m_n_cq_id_counter = {counter = 6}, static m_n_global_sn = 0}
(gdb) p m_comp_event_channel
$4 = (ibv_comp_channel *) 0x94639f0
(gdb) p *m_comp_event_channel
$5 = {context = 0x3828218, fd = 69, refcnt = 0}
(gdb) p *p_ib_ctx_handler
$6 = {<event_handler_ibverbs> = {_vptr.event_handler_ibverbs = 0x7f4c5797ee90}, m_p_ibv_context = 0x3828218, m_ibv_port_attr = {state = IBV_PORT_ACTIVE, max_mtu = IBV_MTU_4096,
active_mtu = IBV_MTU_1024, gid_tbl_len = 128, port_cap_flags = 201392128, max_msg_sz = 1073741824, bad_pkey_cntr = 0, qkey_viol_cntr = 0, pkey_tbl_len = 1, lid = 0, sm_lid = 0,
lmc = 0 '\000', max_vl_num = 2 '\002', sm_sl = 0 '\000', subnet_timeout = 0 '\000', init_type_reply = 0 '\000', active_width = 2 '\002', active_speed = 4 '\004',
phys_state = 5 '\005', link_layer = 2 '\002', reserved = 0 '\000'}, m_p_ibv_device = 0x37d8300, m_ibv_device_attr = {fw_ver = "2.34.5000", '\000' <repeats 54 times>,
node_guid = 6942324114412346852, sys_image_guid = 6942324114412346852, max_mr_size = 18446744073709551615, page_size_cap = 4294966784, vendor_id = 713, vendor_part_id = 4103,
hw_ver = 0, max_qp = 392632, max_qp_wr = 16351, reserved = 6069350, max_sge = 32, max_sge_rd = 0, max_cq = 65408, max_cqe = 4194303, max_mr = 524032, max_pd = 32764,
max_qp_rd_atom = 16, max_ee_rd_atom = 0, max_res_rd_atom = 6282112, max_qp_init_rd_atom = 128, max_ee_init_rd_atom = 0, exp_atomic_cap = IBV_EXP_ATOMIC_HCA, max_ee = 0,
max_rdd = 0, max_mw = 0, max_raw_ipv6_qp = 0, max_raw_ethy_qp = 0, max_mcast_grp = 131072, max_mcast_qp_attach = 244, max_total_mcast_qp_attach = 31981568, max_ah = 2147483647,
max_fmr = 0, max_map_per_fmr = 8191, max_srq = 65472, max_srq_wr = 16383, max_srq_sge = 31, max_pkeys = 128, local_ca_ack_delay = 15 '\017', phys_port_cnt = 2 '\002',
comp_mask = 9, calc_cap = {data_types = 7, data_sizes = 1, int_ops = 29, uint_ops = 29, fp_ops = 29}, timestamp_mask = 0, hca_core_clock = 0,
exp_device_cap_flags = 8152104758268435558, max_dc_req_rd_atom = 0, max_dc_res_rd_atom = 0, inline_recv_sz = 0, max_rss_tbl_sz = 0, ext_atom = {log_atomic_arg_sizes = 0,
max_fa_bit_boundary = 0, log_max_atomic_inline = 0}, umr_caps = {max_klm_list_size = 0, max_send_wqe_inline_klms = 0, max_umr_recursion_depth = 0, max_umr_stride_dimension = 0},
odp_caps = {general_odp_caps = 0, per_transport_caps = {rc_odp_caps = 0, uc_odp_caps = 0, ud_odp_caps = 0, dc_odp_caps = 0, xrc_odp_caps = 0, raw_eth_odp_caps = 0}}, max_dct = 0,
max_ctx_res_domain = 0}, m_p_ibv_pd = 0x36b6100, m_channel = 0, m_removed = false, m_conf_attr_rx_num_wre = 16000, m_conf_attr_tx_num_post_send_notify = 64,
m_conf_attr_tx_max_inline = 220, m_conf_attr_tx_num_wre = 3000}
(gdb) p *p_ring
$7 = {<mem_buf_desc_owner> = {_vptr.mem_buf_desc_owner = 0x7f4c5797ea70}, m_p_ring_stat = 0x0, m_local_if = 168758798, m_transport_type = VMA_TRANSPORT_ETH, m_n_num_resources = 1,
m_ring_resources_map = std::tr1::unordered_map with 0 elements,
m_ring_active_resource = {<std::tr1::__detail::_Hashtable_iterator_base<std::pair<ring_resource_definition const, ring_resources_info_t>, false>> = {_M_cur_node = 0x1000,
_M_cur_bucket = 0x1523d758}, <No data fields>}, m_rx_channel_fd_to_ring_resources_map = std::tr1::unordered_map with 0 elements,
m_l2_mc_ip_attach_map = std::tr1::unordered_map with 0 elements, m_tcp_dst_port_attach_map = std::tr1::unordered_map with 0 elements, m_p_tx_comp_event_channel = 0x94639f0,
m_flow_tcp_map = {_vptr.hash_map = 0x7f4c5797eb10, m_hash_table = {0x0 <repeats 4096 times>}, m_last = 0x0}, m_flow_udp_mc_map = {_vptr.hash_map = 0x7f4c5797eaf0, m_hash_table = {
0x0 <repeats 4096 times>}, m_last = 0x0}, m_flow_udp_uc_map = {_vptr.hash_map = 0x7f4c5797ead0, m_hash_table = {0x0 <repeats 4096 times>}, m_last = 0x0},
m_lock_ring_rx = {<lock_mutex> = {<lock_base> = {_vptr.lock_base = 0x7f4c5797eb70, m_lock_name = 0x7f4c577198ab "ring:lock_rx"}, m_lock = {__data = {__lock = 0, __count = 0,
__owner = 0, __nusers = 0, __kind = 1, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 16 times>, "\001", '\000' <repeats 22 times>,
__align = 0}}, <No data fields>}, m_lock_ring_tx = {<lock_mutex> = {<lock_base> = {_vptr.lock_base = 0x7f4c5797eb70, m_lock_name = 0x7f4c577198b8 "ring:lock_tx"}, m_lock = {
__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 1, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = '\000' <repeats 16 times>, "\001", '\000' <repeats 22 times>, __align = 0}}, <No data fields>}, m_lock_ring_tx_buf_wait = {<lock_base> = {
_vptr.lock_base = 0x7f4c5797ebb0, m_lock_name = 0x7f4c577198c5 "ring:lock_tx_buf_wait"}, m_lock = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0,
__spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0}}, m_p_n_rx_channel_fds = 0x11410f60, m_tx_pool = {m_list = {head = {
next = 0x153701d8, prev = 0x153701d8}, obj_ptr = 0x0}, m_size = 0}, m_tx_num_bufs = 0, m_tx_num_wr = 3000, m_tx_num_wr_free = 3000,
m_b_qp_tx_first_flushed_completion_handled = false, m_missing_buf_ref_count = 0, m_cq_moderation_info = {period = 0, count = 0, packets = 0, bytes = 0, prev_packets = 0,
prev_bytes = 0, missed_rounds = 0}, m_tx_lkey = 0, m_partition = 101, m_gro_mgr = {_vptr.gro_mgr = 0x7f4c5797e970, m_n_flow_max = 32, m_n_buf_max = 32, m_n_flow_count = 0,
m_p_rfs_arr = 0x36f2500}, m_ring_stat_static = {n_rx_pkt_count = 0, n_rx_byte_count = 0, n_rx_interrupt_requests = 0, n_rx_interrupt_received = 0, n_rx_cq_moderation_count = 0,
n_rx_cq_moderation_period = 0, n_tx_retransmits = 0}}