/* matrix vector multiplication: c = A * b
 * A: m-by-n matrix
 * b: n elements vector
 * c: m elements result vector
 * Device code.
 * Author: Michel Steuwer (michel.steuwer@wwu.de)
 */
 
#ifndef _MATRIXVECMUL_KERNEL_H_
#define _MATRIXVECMUL_KERNEL_H_

#include <stdio.h>
#include "matrixVectorMul.h"

// Calculates one thread's part of the matrix-row-times-vector product (A_i * b).
//
// The row is selected by blockIdx.x; the thread with index threadIdx.x sums
// the products for the column range
//     [threadIdx.x * size, (threadIdx.x + 1) * size)
// clipped to the n columns of the row.
//
//   A    : matrix data, indexed as A[row * n + column]
//   b    : vector with (at least) n elements
//   size : number of consecutive columns assigned to each thread
//   n    : number of columns of A / elements of b
//
// Returns the partial sum for this thread; 0.0f if its range is empty.
__device__ float calculatePart( float* A, float* b, int size, int n )
{
    // The accumulator must start at zero: it was previously left
    // uninitialised, so every partial sum contained an indeterminate value.
    float result = 0.0f;

    // Compute the row offset in 64 bits; blockIdx.x * n in 32-bit arithmetic
    // wraps around once the matrix has more than 2^32 elements.
    const size_t rowStart = (size_t)blockIdx.x * (size_t)n;

    // Clip the range to the row length once instead of testing i < n on
    // every iteration.
    const int begin = threadIdx.x * size;
    const int limit = begin + size;
    const int end   = (limit < n) ? limit : n;

    for ( int i = begin; i < end; ++i ) {
        result += A[rowStart + i] * b[i];
    }
    return result;
}

// Collects all intermediate results and calculates the end result.
//
// Performs an in-place tree reduction over the first BLOCK_SIZE elements of
// `data`; when the function returns, data[0] holds their sum. The other
// elements of `data` are overwritten with intermediate partial sums.
//
// Requirements on the caller:
//   * every thread of the block must call this function, because it
//     contains __syncthreads() barriers;
//   * all BLOCK_SIZE elements of `data` must have been written and made
//     visible (e.g. by a __syncthreads()) before the call.
//
// BLOCK_SIZE does not have to be a power of two.
__device__ void sumUpAllParts( float* data )
{
    // `active` is the number of leading elements that still carry partial
    // sums. Each step folds the upper floor(active/2) elements onto the lower
    // ones; for an odd count the middle element is carried over untouched.
    for ( int active = BLOCK_SIZE; active > 1; ) {
        const int half = (active + 1) / 2;   // elements remaining after this step
        const int fold = active - half;      // number of additions in this step

        // Reads touch [half, active), writes touch [0, fold) with fold <= half,
        // so no thread writes an element another thread reads in this step.
        if ( threadIdx.x < fold ) {
            data[threadIdx.x] = data[threadIdx.x] + data[threadIdx.x + half];
        }

        // Executed by all threads (outside the branch): make this step's
        // results visible before the next step reads them.
        __syncthreads();

        active = half;
    }
}

// Simple matrix vector multiplication (c = A * b).
//
// One block executes one matrix-row-times-vector multiplication, divided
// into multiple "parts" calculated by the threads inside the block.
// Afterwards the intermediate results of all threads are collected and
// thread 0 stores the row result in c[blockIdx.x].
//
//   c : result vector, m elements
//   A : m-by-n matrix, indexed as A[row * n + column]
//   b : input vector, n elements
//   m : number of rows of A
//   n : number of columns of A
//
// Expected launch layout: a 1D grid with one block per row (gridDim.x >= m)
// and 1D blocks. The shared buffer has BLOCK_SIZE entries and the reduction
// sums exactly BLOCK_SIZE partial results, so the block is expected to have
// BLOCK_SIZE threads.
__global__ void matrixVectorMul( float* c, float* A, float* b, int m, int n)
{
    __shared__ float data[BLOCK_SIZE];

    // Blocks beyond the last row have nothing to do and must not touch A or c.
    // The condition is uniform for the whole block and is evaluated before
    // any barrier, so leaving here cannot stall a __syncthreads().
    if ( (int)blockIdx.x >= m ) {
        return;
    }

    // Number of columns handled per thread: n / BLOCK_SIZE, rounded up.
    const int size = (n / BLOCK_SIZE) + ( ((n % BLOCK_SIZE) != 0) ? 1 : 0 );

    data[threadIdx.x] = calculatePart(A, b, size, n);

    // All partial sums must be in shared memory before they are combined.
    __syncthreads();

    sumUpAllParts( data );

    // After the reduction data[0] holds the complete dot product of the row.
    if (threadIdx.x == 0) {
        c[blockIdx.x] = data[0];
    }
}

#endif // #ifndef _MATRIXVECMUL_KERNEL_H_
