I have written code that performs vector operations using your vector model.
It is a simple program that loads a 64 x 64 table and performs 64-element vector additions (8 additions for each of 256 iterations).
With this code, I see almost no performance difference between the 1-lane and 8-lane configurations.
I suspect that the vector operation (vfadd_vv_f32m1) is not being distributed across all 8 lanes.
Could you please provide software guidance on how to utilize all 8 vector lanes?
// table size 64 x 64
n = 64; // element = 64
vl = vsetvl_e32m1(n); // VLEN = 64 * 32-bit = 2048-bit
zero = (float*)malloc(sizeof(float)*vl); // 64 * float malloc
for(i = 0; i < 256; ++i) {
vfloat32m1_t vop0 = vle32_v_f32m1(zero); // vfloat32m1_t init 0
for (j = 0 ; j < 8; ++j) { // vector add 8 times
int index_ij = *get2df<int>(index_iter, i, j, 8); // table index
vfloat32m1_t vop0_lo = vle32_v_f32m1(get2df<float>(embed, index_ij, 0, 64)); // vector load form table
vop0 = vfadd_vv_f32m1(vop0, vop0_lo); // vector add
}
vse32_v_f32m1(get2df<float>(output, i, 0, dimension), vop0); // store data
}