/* simple matrix vector multiplication: c = A * b
 * A: m-by-n matrix
 * b: n elements vector
 * c: m elements result vector
 * Host code.
 * Author: Michel Steuwer (michel.steuwer@wwu.de)
 */
#include <stdlib.h>
#include <stdio.h>
// include math.h (note: rand()/srand() used for the random init are declared in stdlib.h)
#include <math.h>
// include cuda utilities, for time measurement
#include <cutil.h>
// include my own kernel
#include <matrixVectorMul_kernel.cu>

// Forward declaration: fills an array of the given length with pseudo-random
// floats; the definition follows main() in this file.
void randomInit(float*, int);

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation for the error message.
static void checkCudaCall(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n",
                what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Allocates `count` floats on the host; terminates the program with a
// diagnostic if the allocation fails. `what` names the buffer for the message.
static float* hostAllocFloats(size_t count, const char* what)
{
    float* buffer = (float*) malloc(sizeof(float) * count);
    // malloc(0) may legitimately return NULL, so only a non-empty request
    // that yields NULL is treated as an error.
    if (buffer == NULL && count > 0) {
        fprintf(stderr, "Out of host memory while allocating %s\n", what);
        exit(EXIT_FAILURE);
    }
    return buffer;
}

/* Host driver: computes c = A * b on the GPU and prints the elapsed time
 * reported by the cutil timer.
 *
 * The timed region covers the host-to-device copies of A and b, the
 * allocation of the result buffers, the kernel launch and the device-to-host
 * copy of c. Every host allocation and every CUDA runtime call is checked;
 * on failure a message is written to stderr and the program exits with
 * EXIT_FAILURE.
 *
 * Returns 0 on success.
 */
int main(int argc, char** argv)
{
    // set seed for rand() (fixed seed => reproducible input data)
    srand(2009);

    // allocate host memory for matrix A and vector b
    float* host_A = hostAllocFloats((size_t) M * N, "host matrix A");
    float* host_b = hostAllocFloats((size_t) N, "host vector b");

    // initialize host memory with random values
    randomInit(host_A, M * N);
    randomInit(host_b, N);

    // allocate device memory for matrix A
    float* device_A;
    checkCudaCall(cudaMalloc( (void**) &device_A, sizeof(float) * M * N ),
                  "cudaMalloc(device_A)");

    // allocate device memory for vector b
    float* device_b;
    checkCudaCall(cudaMalloc( (void**) &device_b, sizeof(float) * N ),
                  "cudaMalloc(device_b)");

    // create and start timer for measurement
    unsigned int timer = 0;
    cutCreateTimer(&timer);
    cutStartTimer(timer);

    // copy host memory to device (both matrix A and vector b)
    checkCudaCall(cudaMemcpy(device_A, host_A, sizeof(float) * M * N,
                             cudaMemcpyHostToDevice),
                  "cudaMemcpy(host_A -> device_A)");
    checkCudaCall(cudaMemcpy(device_b, host_b, sizeof(float) * N,
                             cudaMemcpyHostToDevice),
                  "cudaMemcpy(host_b -> device_b)");

    // allocate device memory for result vector c
    // NOTE(review): unlike the buffers for A and b, the result buffers are
    // allocated inside the timed region -- confirm this is intended.
    float* device_c;
    checkCudaCall(cudaMalloc( (void**) &device_c, sizeof(float) * M ),
                  "cudaMalloc(device_c)");

    // allocate host memory for the result
    float* host_c = hostAllocFloats((size_t) M, "host vector c");

    // setup execution parameters: M blocks of BLOCK_SIZE threads each
    // (presumably one block per row of A -- verify against the kernel)
    dim3 blockSize(BLOCK_SIZE);
    dim3 gridSize(M);

    // execute the kernel
    matrixVectorMul<<<gridSize, blockSize>>>(device_c, device_A, device_b, M, N);
    // a kernel launch returns no status itself; configuration errors are
    // only visible through cudaGetLastError()
    checkCudaCall(cudaGetLastError(), "matrixVectorMul launch");

    // copy result from device to host; this blocking copy also waits for the
    // kernel and reports any error raised while it was executing
    checkCudaCall(cudaMemcpy(host_c, device_c, sizeof(float) * M,
                             cudaMemcpyDeviceToHost),
                  "cudaMemcpy(device_c -> host_c)");

    // stop and destroy timer and produce output
    cutStopTimer(timer);
    printf("%f\n", cutGetTimerValue(timer));
    cutDeleteTimer(timer);

    // clean up memory
    free(host_A);
    free(host_b);
    free(host_c);
    checkCudaCall(cudaFree(device_A), "cudaFree(device_A)");
    checkCudaCall(cudaFree(device_b), "cudaFree(device_b)");
    checkCudaCall(cudaFree(device_c), "cudaFree(device_c)");

    return 0;
}

// Fills the first `size` entries of `data` with pseudo-random floats in the
// range [0, 1], drawn from rand() in ascending index order. The buffer must
// already be allocated by the caller; nothing is written when size <= 0.
void randomInit(float* data, int size)
{
    // divide in float so the quotient is not truncated to an integer
    const float upperBound = (float)RAND_MAX;

    int index = 0;
    while (index < size) {
        const int sample = rand();
        data[index] = sample / upperBound;
        ++index;
    }
}