flame / how-to-optimize-gemm Goto Github PK

C 94.87% MATLAB 1.46% Makefile 1.01% Python 2.67%

blis code-optimization gemm gotoblas matrix-multiplication

how-to-optimize-gemm's Issues

gcc bad codegen with union

GCC often fails to keep v2df_t variables in registers and spills them back to the stack.
I got a 40% performance improvement by using __m128d directly where possible (i7-920).
Clang doesn't have that problem.

before: https://godbolt.org/z/cNxgkG
after: https://godbolt.org/z/55MvWD
You may need to replace _mm_loaddup_pd with something else for the code to compile on godbolt.

/* Create macros so that the matrices are stored in column-major order */
/* Each macro indexes the pointer (a, b or c) and leading dimension (lda, ldb
   or ldc) that are in scope where it is expanded: element (i,j) lives at
   offset j*ld + i, i.e. consecutive rows of one column are adjacent. */

#define A(i,j) a[ (j)*lda + (i) ]
#define B(i,j) b[ (j)*ldb + (i) ]
#define C(i,j) c[ (j)*ldc + (i) ]

/* Block sizes */
/* mc: step of the row-block loop in MY_MMult (rows of A/C per InnerKernel call).
   kc: step of the inner-dimension loop in MY_MMult; also sizes packedB.
   nb: packedB in InnerKernel holds kc*nb doubles, so it bounds the number of
       columns n that can be packed. */
#define mc 256
#define kc 128
#define nb 1000

/* Smaller of i and j.  The selected argument is evaluated twice, so do not
   pass expressions with side effects. */
#define min( i, j ) ( (i)<(j) ? (i): (j) )

/* Routine for computing C = A * B + C */
/* Forward declarations of the helpers defined further down, so MY_MMult can
   call InnerKernel before its definition. */

void AddDot4x4( int, double *, int, double *, int, double *, int );
void PackMatrixA( int, double *, int, double * );
void PackMatrixB( int, double *, int, double * );
void InnerKernel( int, int, int, double *, int, double *, int, double *, int, int );

void MY_MMult( int m, int n, int k, double *a, int lda,
                                    double *b, int ldb,
                                    double *c, int ldc )
{
  /* Blocked driver for C = A * B + C (column-major storage).
     The inner dimension is cut into panels of at most kc and the rows of
     A/C into blocks of at most mc; every (row block, panel) pair is handed
     to InnerKernel, which updates the matching mc x n slab of C. */

  for ( int panel_start = 0; panel_start < k; panel_start += kc ) {
    int panel_len = min( k - panel_start, kc );

    for ( int row_start = 0; row_start < m; row_start += mc ) {
      int row_len = min( m - row_start, mc );

      /* Non-zero only for the first row block of a panel; InnerKernel uses
         it to decide whether B has to be packed again. */
      int is_first_row_block = ( row_start == 0 );

      InnerKernel( row_len, n, panel_len,
                   &A( row_start, panel_start ), lda,
                   &B( panel_start, 0 ), ldb,
                   &C( row_start, 0 ), ldc,
                   is_first_row_block );
    }
  }
}

void InnerKernel( int m, int n, int k, double *a, int lda,
                                       double *b, int ldb,
                                       double *c, int ldc, int first_time )
{
  /* Updates an m x n block of C with the product of an m x k block of A and
     a k x n block of B, working on 4x4 tiles of C.
     A is packed into a local buffer while the first column strip (j == 0)
     is processed; B is packed into a static buffer only when first_time is
     non-zero and is reused from that buffer otherwise.
     Both loops advance by 4 and the pack routines always copy 4 rows or
     columns, so m and n are expected to be multiples of 4. */

  double packedA[ m * k ];
  static double packedB[ kc*nb ];   /* Note: using a static buffer is not thread safe... */

  for ( int col = 0; col < n; col += 4 ) {      /* column strips of C, 4 wide */
    double *b_panel = &packedB[ col*k ];

    if ( first_time ) {
      PackMatrixB( k, &B( 0, col ), ldb, b_panel );
    }

    for ( int row = 0; row < m; row += 4 ) {    /* row strips of C, 4 tall */
      double *a_panel = &packedA[ row*k ];

      if ( col == 0 ) {
        PackMatrixA( k, &A( row, 0 ), lda, a_panel );
      }

      /* One call computes the sixteen inner products of this 4x4 tile. */
      AddDot4x4( k, a_panel, 4, b_panel, k, &C( row, col ), ldc );
    }
  }
}

void PackMatrixA( int k, double *a, int lda, double *a_to )
{
  /* Copies a 4 x k strip of column-major A into a_to so that the four
     entries of each column sit next to each other: a_to receives
     A(0,0..3 rows), then the next column, and so on (4*k doubles total). */

  double *dst = a_to;

  for ( int col = 0; col < k; col++ ) {
    const double *src = &A( 0, col );   /* top of this column in the strip */

    for ( int row = 0; row < 4; row++ ) {
      dst[ row ] = src[ row ];
    }

    dst += 4;
  }
}

void PackMatrixB( int k, double *b, int ldb, double *b_to )
{
  /* Copies a k x 4 strip of column-major B into b_to row by row, so the
     four entries of each row become contiguous (4*k doubles total). */

  const double *col0 = &B( 0, 0 );
  const double *col1 = &B( 0, 1 );
  const double *col2 = &B( 0, 2 );
  const double *col3 = &B( 0, 3 );

  for ( int row = 0; row < k; row++ ) {
    b_to[ 0 ] = col0[ row ];
    b_to[ 1 ] = col1[ row ];
    b_to[ 2 ] = col2[ row ];
    b_to[ 3 ] = col3[ row ];
    b_to += 4;
  }
}

#include <mmintrin.h>   // MMX
#include <xmmintrin.h>  // SSE
#include <pmmintrin.h>  // SSE3 (_mm_loaddup_pd)
#include <emmintrin.h>  // SSE2 (__m128d, _mm_load_pd, _mm_setzero_pd)

/* View of one 128-bit SSE register either as a vector (v) or as its two
   double lanes (d[0], d[1]); used to read individual lanes back out. */
typedef union
{
  __m128d v;
  double d[2];
} v2df_t;

void AddDot4x4( int k, double *a, int lda,  double *b, int ldb, double *c, int ldc )
{
  /* So, this routine computes a 4x4 block of matrix C
           C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).
           C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).
           C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).
           C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).
     Notice that this routine is called with c = C( i, j ) in the
     previous routine, so these are actually the elements
           C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 )
           C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 )
           C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 )
           C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 )

     in the original matrix C
     And now we use vector registers and instructions.

     k   : length of the inner products.
     a   : packed strip of A, 4 consecutive doubles (rows 0..3) per step p.
     b   : packed strip of B, 4 consecutive doubles (columns 0..3) per step p.
     c   : top-left element of the 4x4 block of C, with leading dimension ldc.
     lda, ldb are accepted for interface compatibility and are not used.

     The results are accumulated in registers and added to C once at the
     end, so C is read and written exactly once per element. */

  int p;
  __m128d
    c_00_c_10_vreg,    c_01_c_11_vreg,    c_02_c_12_vreg,    c_03_c_13_vreg,
    c_20_c_30_vreg,    c_21_c_31_vreg,    c_22_c_32_vreg,    c_23_c_33_vreg,
    a_0p_a_1p_vreg,
    a_2p_a_3p_vreg,
    b_p0_vreg, b_p1_vreg, b_p2_vreg, b_p3_vreg;

  c_00_c_10_vreg = _mm_setzero_pd();
  c_01_c_11_vreg = _mm_setzero_pd();
  c_02_c_12_vreg = _mm_setzero_pd();
  c_03_c_13_vreg = _mm_setzero_pd();
  c_20_c_30_vreg = _mm_setzero_pd();
  c_21_c_31_vreg = _mm_setzero_pd();
  c_22_c_32_vreg = _mm_setzero_pd();
  c_23_c_33_vreg = _mm_setzero_pd();

  for ( p=0; p<k; p++ ){
    /* Unaligned loads: a is only guaranteed to have the alignment of a
       double, and _mm_load_pd faults on addresses that are not 16-byte
       aligned.  On aligned data the result is identical. */
    a_0p_a_1p_vreg = _mm_loadu_pd( a );
    a_2p_a_3p_vreg = _mm_loadu_pd( a+2 );
    a += 4;

    b_p0_vreg = _mm_loaddup_pd( b );       /* load and duplicate */
    b_p1_vreg = _mm_loaddup_pd( b+1 );     /* load and duplicate */
    b_p2_vreg = _mm_loaddup_pd( b+2 );     /* load and duplicate */
    b_p3_vreg = _mm_loaddup_pd( b+3 );     /* load and duplicate */

    b += 4;

    /* First row and second rows */
    c_00_c_10_vreg += a_0p_a_1p_vreg * b_p0_vreg;
    c_01_c_11_vreg += a_0p_a_1p_vreg * b_p1_vreg;
    c_02_c_12_vreg += a_0p_a_1p_vreg * b_p2_vreg;
    c_03_c_13_vreg += a_0p_a_1p_vreg * b_p3_vreg;

    /* Third and fourth rows */
    c_20_c_30_vreg += a_2p_a_3p_vreg * b_p0_vreg;
    c_21_c_31_vreg += a_2p_a_3p_vreg * b_p1_vreg;
    c_22_c_32_vreg += a_2p_a_3p_vreg * b_p2_vreg;
    c_23_c_33_vreg += a_2p_a_3p_vreg * b_p3_vreg;
  }

  /* The union is only used here, after the loop, to pull the two lanes of
     each accumulator apart; the hot loop works on plain __m128d values. */
  v2df_t
    c_00_c_10_vregx,    c_01_c_11_vregx,    c_02_c_12_vregx,    c_03_c_13_vregx,
    c_20_c_30_vregx,    c_21_c_31_vregx,    c_22_c_32_vregx,    c_23_c_33_vregx;

  c_00_c_10_vregx.v = c_00_c_10_vreg;
  c_01_c_11_vregx.v = c_01_c_11_vreg;
  c_02_c_12_vregx.v = c_02_c_12_vreg;
  c_03_c_13_vregx.v = c_03_c_13_vreg;
  c_20_c_30_vregx.v = c_20_c_30_vreg;
  c_21_c_31_vregx.v = c_21_c_31_vreg;
  c_22_c_32_vregx.v = c_22_c_32_vreg;
  c_23_c_33_vregx.v = c_23_c_33_vreg;

  C( 0, 0 ) += c_00_c_10_vregx.d[0];  C( 0, 1 ) += c_01_c_11_vregx.d[0];
  C( 0, 2 ) += c_02_c_12_vregx.d[0];  C( 0, 3 ) += c_03_c_13_vregx.d[0];

  C( 1, 0 ) += c_00_c_10_vregx.d[1];  C( 1, 1 ) += c_01_c_11_vregx.d[1];
  C( 1, 2 ) += c_02_c_12_vregx.d[1];  C( 1, 3 ) += c_03_c_13_vregx.d[1];

  C( 2, 0 ) += c_20_c_30_vregx.d[0];  C( 2, 1 ) += c_21_c_31_vregx.d[0];
  C( 2, 2 ) += c_22_c_32_vregx.d[0];  C( 2, 3 ) += c_23_c_33_vregx.d[0];

  C( 3, 0 ) += c_20_c_30_vregx.d[1];  C( 3, 1 ) += c_21_c_31_vregx.d[1];
  C( 3, 2 ) += c_22_c_32_vregx.d[1];  C( 3, 3 ) += c_23_c_33_vregx.d[1];
}

MMult_4x4_12.c (segmentation fault)

When I run with MMult_4x4_12.c optimization (packed A), I got Segmentation fault with the error on AddDot4x4 ().

This is output from make run:
echo "version = 'MMult_4x4_12';" > output_MMult_4x4_12.m
./test_MMult.x >> output_MMult_4x4_12.m
Segmentation fault (core dumped)
make: *** [run] Error 139

This is output from gdb:
Core was generated by `./test_MMult.x'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0 0x0000000000400b4f in AddDot4x4 ()
(gdb) backtrace
#0 0x0000000000400b4f in AddDot4x4 ()
#1 0x0000000000400eb8 in InnerKernel ()
#2 0x0000000000400fe6 in MY_MMult ()
#3 0x00000000004008ea in main ()

Thank you very much!
Le Van Duc

Why are LDA/LDB/LDC all set to 1000 in parameter.h?

Such a setting leaves each matrix scattered through a much larger double array and wastes a great deal of space storing zeros.

Won't compile under MINGW64 - drand48 is not part of stdlib

Trying to compile this under Windows 10 on a MINGW64 (which is actually part of the windows Octave package)

Downloaded and built pcg_random.

Added in the headers and suggested on pcg_random help pages to use ldexp(pcg32_random(), -32) - 1.0; (not after cryptographic quality random numbers).

It builds but I'm getting
40 1.#INF00e+000 0.000000e+000
80 1.#INF00e+000 0.000000e+000
120 1.#INF00e+000 0.000000e+000
in the output_MMult0.m file, which won't plot.

Suggestions on fixing for Windows MINGW64?

Thanks

Optimization_1x4_6

In this optimization, C(0, 1), ..., and C(0, 3) are replaced with register variables. I think that if this step has a benefit, it comes from the following reasons:

1) it avoids redundantly computing the index of C(0, x) and A(0, p);
2) it offers more chances to exploit data reuse for C(0, x);
3) it may come from the use of register variables.

Instead, I replace these variables with ordinary float/double c_00, c_01, ..., c_03, the result show we get more benefits.

So, I guess whether to use register variable is not the key optimization.

About Optimization_1x4_5 benefit

https://github.com/flame/how-to-optimize-gemm/wiki/Optimization_1x4_5

I think its benefit comes from "B( p, 0 ), B( p, 1 ), B( p, 2 ), B( p, 3 )": B( p, 0 ) is fetched only once, and the cache (or the cache line, if it is large enough) will then automatically hit for B( p, 1 ), B( p, 2 ), B( p, 3 ).

When I unroll the kernel function and use indirect addressing (1x4_10 or 1x4_9), I get a segmentation fault (core dumped)

When I unroll the kernel function (the third FOR loop in the function) and use indirect addressing (1x4_10 or 1x4_9), the build is okay.
However, it reports a segmentation fault (core dumped) when executing.

Wiki - Broken link

http://wiki.cs.utexas.edu/rvdg/HowToOptimizeGemm

how do you calculate the gflops ?

for example , you have 2 matrix A and B ,A.shape is 3X4 B.shape is 4X3, so the result is 3X3 ,the cost time is about 0.001ms, flops = 3X4x2/0.0001ms ?

if the matrix's width and height %4 is not zero ,how to optimize it ?

I do not see the optimize way about this ?

can you share it ?

Why are i and j flipped?

Are there any reasons why i and j are flipped? I thought it should be a[ (i)*lda + (j) ] instead of the code below:

#define A(i,j) a[ (j)*lda + (i) ]

Example:
Result of matrices a, b and cref (randomized integers):

// a
83.000000 86.000000 
77.000000 15.000000 
// b
93.000000 35.000000 
86.000000 92.000000 
// cref
10463.000000 8544.000000 
14284.000000 8803.000000

Missing semicolon in MMult_1x4_3.c on Wiki

Markup

https://github.com/flame/how-to-optimize-gemm#related-links

[BLISlab: A Sandbox for Optimizing GEMM] (https://github.com/flame/blislab)
[GEMM: From Pure C to SSE Optimized Micro Kernels] (http://apfel.mathematik.uni-ulm.de/~lehn/sghpc/gemm/)

You need to remove the space between [Text] to (url).

作者 MMult_4x4_11的代码中编译错误

原因是void InnerKernel（）声明写在了使用之后，改到前面就可以了。

Document error in Optimization_4x4_10

In Optimization_4x4_10 ,
I think OLD should be MMult_1x4_9, not MMult_4x4_9.

how to calculate the mc and mk in MMult_4x4_11.c

when i have cpu with 32k cache L2 , how to calculate the mc and mk in MMult_4x4_11.c . in the function MY_MMult i think it should be for ( p=0; p<k + kc ; p+=kc ) also for for ( i=0; i<m+mc ; i+=mc )

Inner-loop unrolling leads to wrong results

Hi, Thanks for this excellent work! I tried the unroll trick as Optimization2 suggests, which unrolls the outer loop, i.e., j. This works and gives me the right calculation result (although no speed boosts on my machine, too). However, when I tried further to unroll the inner loop, i.e., i or p, a calculation error occurs.

Here is my code:

// params.h
// Note that I re-implement in a [row-major] manner
// Row-major element accessors.  Every macro argument is parenthesized so an
// expression argument keeps its meaning: B(p + 1, j) must index row (p + 1),
// whereas an unparenthesized body would expand to B[p + 1*ldb + j].
#define A(i, j) A[(i)*(lda) + (j)]
#define B(i, j) B[(i)*(ldb) + (j)]
#define C(i, j) C[(i)*(ldc) + (j)]
#define mat(i, j) mat[(i)*(ldm) + (j)]
#define mat2(i, j) mat2[(i)*(ldm) + (j)]

// MMult_base.c
#include "params.h"
#include "MMult.h"

/* Reference triple loop: C += A * B for an m x k matrix A, a k x n matrix B
   and an m x n matrix C, all addressed through the A/B/C accessor macros.
   Each C(row, col) is updated in place, one product at a time. */
void MMult_base(int m, int k, int n, double *A, double *B, double *C, int lda, int ldb, int ldc)
{
  int row, col, inner;

  for (row = 0; row < m; row++) {
    for (col = 0; col < n; col++) {
      for (inner = 0; inner < k; inner++) {
        C(row, col) += A(row, inner) * B(inner, col);
      }
    }
  }
}

// MMult_unroll.c
#include "params.h"
#include "MMult.h"
#include <stdio.h>

// This gives a correct calculation result
/* C += A * B with the column loop unrolled by two: each pass over (i, p)
   updates columns j and j + 1 of C together.
   When n is odd the last pass has no second column; it is skipped instead
   of reading B and writing C one column past the end. */
void MMult_unroll(int m, int k, int n, double *A, double *B, double *C, int lda, int ldb, int ldc)
{
  for (int j = 0; j < n; j += 2){
    const int j_next = j + 1;           /* plain identifier, safe as a macro argument */
    const int has_next = j_next < n;    /* false only on the last pass of an odd n */

    for (int i  = 0; i < m; i += 1){
      for (int p = 0; p < k; ++p){
        C(i, j) += A(i, p) * B(p, j);
        if (has_next){
          C(i, j_next) += A(i, p) * B(p, j_next);
        }
      }
    }
  }
}

// However, when I tried inner loop, i or p, a wrong result occurs
/* C += A * B with the inner-product loop unrolled by two, tracing every
   accumulated term on stdout.
   The second index is held in a local (p_next) rather than written as
   "p + 1" inside the accessor macros: an expression argument is only safe
   if the macro parenthesizes it, and B(p + 1, j) otherwise selects the
   wrong element.  The second term is skipped when k is odd so that no
   element past the end of A or B is read. */
void MMult_unroll_inner(int m, int k, int n, double *A, double *B, double *C, int lda, int ldb, int ldc)
{
  for (int j = 0; j < n; j += 1){
    for (int i  = 0; i < m; i += 1){
      for (int p = 0; p < k; p += 2){
        const int p_next = p + 1;

        C(i, j) += A(i, p) * B(p, j);
        printf("C(%d, %d) added by [A(%d, %d)=%f] * [B(%d, %d)=%f]\n", i, j, i, p, A(i, p), p, j, B(p, j));

        if (p_next < k){
          C(i, j) += A(i, p_next) * B(p_next, j);
          printf("C(%d, %d) added by [A(%d, %d)=%f] * [B(%d, %d)=%f]\n", i, j, i, p_next, A(i, p_next), p_next, j, B(p_next, j));
        }
      }
      printf("\n");
    }
  }
}

// main.c
#include "params.h"
#include "utils.h"
#include "MMult.h"

/* Test driver: for each matrix size, fills A and B, computes the product
   with the baseline and with the unrolled routine, prints both results and
   asserts that they agree exactly.
   Returns 0 on success and 1 if a buffer cannot be allocated. */
int main(){
    // For the full sweep use: msize = 40; msize <= 800; msize += 40
    for (int msize = 4; msize <= 4; msize += 4){     // small matrix for debugging
        int m = msize, k = msize, n = msize;
        int lda = k, ldb = n, ldc = n;               // row-major: leading dimension = row length

        double *A       = malloc((size_t)m * k * sizeof *A);
        double *B       = malloc((size_t)k * n * sizeof *B);
        double *C_base  = malloc((size_t)m * n * sizeof *C_base);
        double *C_optim = malloc((size_t)m * n * sizeof *C_optim);

        if (!A || !B || !C_base || !C_optim){
            fprintf(stderr, "out of memory for msize=%d\n", msize);
            free(A);
            free(B);
            free(C_base);
            free(C_optim);
            return 1;
        }

        random_matrix(m, k, A, lda);
        random_matrix(k, n, B, ldb);
        zero_matrix(m, n, C_base, ldc);
        zero_matrix(m, n, C_optim, ldc);

        printf("A\n");
        print_matrix(m, k, A, lda);
        printf("B\n");
        print_matrix(k, n, B, ldb);

        MMult_base(m, k, n, A, B, C_base, lda, ldb, ldc);          // store the baseline result into C_base
        MMult_unroll_inner(m, k, n, A, B, C_optim, lda, ldb, ldc); // store the optimized result into C_optim

        printf("C_base\n");
        print_matrix(m, n, C_base, ldc);
        printf("C_optim\n");
        print_matrix(m, n, C_optim, ldc);

        double max_diff = compare_matrix(m, n, C_base, C_optim, ldc); // compare C_base and C_optim
        assert(max_diff == 0);

        // Release this size's buffers before the next iteration allocates new ones.
        free(A);
        free(B);
        free(C_base);
        free(C_optim);
    }

    return 0;
}

Then I got the output

A
1.000000        1.000000        1.000000        1.000000
2.000000        2.000000        2.000000        2.000000
3.000000        3.000000        3.000000        3.000000
4.000000        4.000000        4.000000        4.000000

B
1.000000        1.000000        1.000000        1.000000
2.000000        2.000000        2.000000        2.000000
3.000000        3.000000        3.000000        3.000000
4.000000        4.000000        4.000000        4.000000

C(0, 0) added by [A(0, 0)=1.000000] * [B(0, 0)=1.000000]
C(0, 0) added by [A(0, 1)=1.000000] * [B(1, 0)=2.000000]
C(0, 0) added by [A(0, 2)=1.000000] * [B(2, 0)=3.000000]
C(0, 0) added by [A(0, 3)=1.000000] * [B(3, 0)=2.000000]

C(1, 0) added by [A(1, 0)=2.000000] * [B(0, 0)=1.000000]
C(1, 0) added by [A(1, 1)=2.000000] * [B(1, 0)=2.000000]
C(1, 0) added by [A(1, 2)=2.000000] * [B(2, 0)=3.000000]
C(1, 0) added by [A(1, 3)=2.000000] * [B(3, 0)=2.000000]

C(2, 0) added by [A(2, 0)=3.000000] * [B(0, 0)=1.000000]
C(2, 0) added by [A(2, 1)=3.000000] * [B(1, 0)=2.000000]
C(2, 0) added by [A(2, 2)=3.000000] * [B(2, 0)=3.000000]
C(2, 0) added by [A(2, 3)=3.000000] * [B(3, 0)=2.000000]

C(3, 0) added by [A(3, 0)=4.000000] * [B(0, 0)=1.000000]
C(3, 0) added by [A(3, 1)=4.000000] * [B(1, 0)=2.000000]
C(3, 0) added by [A(3, 2)=4.000000] * [B(2, 0)=3.000000]
C(3, 0) added by [A(3, 3)=4.000000] * [B(3, 0)=2.000000]

C(0, 1) added by [A(0, 0)=1.000000] * [B(0, 1)=1.000000]
C(0, 1) added by [A(0, 1)=1.000000] * [B(1, 1)=2.000000]
C(0, 1) added by [A(0, 2)=1.000000] * [B(2, 1)=3.000000]
C(0, 1) added by [A(0, 3)=1.000000] * [B(3, 1)=2.000000]

C(1, 1) added by [A(1, 0)=2.000000] * [B(0, 1)=1.000000]
C(1, 1) added by [A(1, 1)=2.000000] * [B(1, 1)=2.000000]
C(1, 1) added by [A(1, 2)=2.000000] * [B(2, 1)=3.000000]
C(1, 1) added by [A(1, 3)=2.000000] * [B(3, 1)=2.000000]

C(2, 1) added by [A(2, 0)=3.000000] * [B(0, 1)=1.000000]
C(2, 1) added by [A(2, 1)=3.000000] * [B(1, 1)=2.000000]
C(2, 1) added by [A(2, 2)=3.000000] * [B(2, 1)=3.000000]
C(2, 1) added by [A(2, 3)=3.000000] * [B(3, 1)=2.000000]

C(3, 1) added by [A(3, 0)=4.000000] * [B(0, 1)=1.000000]
C(3, 1) added by [A(3, 1)=4.000000] * [B(1, 1)=2.000000]
C(3, 1) added by [A(3, 2)=4.000000] * [B(2, 1)=3.000000]
C(3, 1) added by [A(3, 3)=4.000000] * [B(3, 1)=2.000000]

C(0, 2) added by [A(0, 0)=1.000000] * [B(0, 2)=1.000000]
C(0, 2) added by [A(0, 1)=1.000000] * [B(1, 2)=2.000000]
C(0, 2) added by [A(0, 2)=1.000000] * [B(2, 2)=3.000000]
C(0, 2) added by [A(0, 3)=1.000000] * [B(3, 2)=3.000000]

C(1, 2) added by [A(1, 0)=2.000000] * [B(0, 2)=1.000000]
C(1, 2) added by [A(1, 1)=2.000000] * [B(1, 2)=2.000000]
C(1, 2) added by [A(1, 2)=2.000000] * [B(2, 2)=3.000000]
C(1, 2) added by [A(1, 3)=2.000000] * [B(3, 2)=3.000000]

C(2, 2) added by [A(2, 0)=3.000000] * [B(0, 2)=1.000000]
C(2, 2) added by [A(2, 1)=3.000000] * [B(1, 2)=2.000000]
C(2, 2) added by [A(2, 2)=3.000000] * [B(2, 2)=3.000000]
C(2, 2) added by [A(2, 3)=3.000000] * [B(3, 2)=3.000000]

C(3, 2) added by [A(3, 0)=4.000000] * [B(0, 2)=1.000000]
C(3, 2) added by [A(3, 1)=4.000000] * [B(1, 2)=2.000000]
C(3, 2) added by [A(3, 2)=4.000000] * [B(2, 2)=3.000000]
C(3, 2) added by [A(3, 3)=4.000000] * [B(3, 2)=3.000000]

C(0, 3) added by [A(0, 0)=1.000000] * [B(0, 3)=1.000000]
C(0, 3) added by [A(0, 1)=1.000000] * [B(1, 3)=2.000000]
C(0, 3) added by [A(0, 2)=1.000000] * [B(2, 3)=3.000000]
C(0, 3) added by [A(0, 3)=1.000000] * [B(3, 3)=3.000000]

C(1, 3) added by [A(1, 0)=2.000000] * [B(0, 3)=1.000000]
C(1, 3) added by [A(1, 1)=2.000000] * [B(1, 3)=2.000000]
C(1, 3) added by [A(1, 2)=2.000000] * [B(2, 3)=3.000000]
C(1, 3) added by [A(1, 3)=2.000000] * [B(3, 3)=3.000000]

C(2, 3) added by [A(2, 0)=3.000000] * [B(0, 3)=1.000000]
C(2, 3) added by [A(2, 1)=3.000000] * [B(1, 3)=2.000000]
C(2, 3) added by [A(2, 2)=3.000000] * [B(2, 3)=3.000000]
C(2, 3) added by [A(2, 3)=3.000000] * [B(3, 3)=3.000000]

C(3, 3) added by [A(3, 0)=4.000000] * [B(0, 3)=1.000000]
C(3, 3) added by [A(3, 1)=4.000000] * [B(1, 3)=2.000000]
C(3, 3) added by [A(3, 2)=4.000000] * [B(2, 3)=3.000000]
C(3, 3) added by [A(3, 3)=4.000000] * [B(3, 3)=3.000000]

C_base
10.000000       10.000000       10.000000       10.000000
20.000000       20.000000       20.000000       20.000000
30.000000       30.000000       30.000000       30.000000
40.000000       40.000000       40.000000       40.000000

C_optim
8.000000        8.000000        9.000000        9.000000
16.000000       16.000000       18.000000       18.000000
24.000000       24.000000       27.000000       27.000000
32.000000       32.000000       36.000000       36.000000

test_MMult.x: main.c:37: main: Assertion `max_diff == 0' failed.

This has confused me a lot. Can anyone help me?

Thanks in advance.

flame / how-to-optimize-gemm Goto Github PK

how-to-optimize-gemm's Issues

Recommend Projects

Recommend Topics

Recommend Org