// simple Sobel Filter example
// Author: Michel Steuwer (michel.steuwer@wwu.de)
// The whole code is based on the Sobel Filter example of the NVIDIA CUDA SDK

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>

#include "SobelFilter_kernels.h"

// Texture reference for reading the original image.
// The kernels in this file sample it with tex2D(); sobelFilter() binds it to
// `array` before launching a kernel and unbinds it again afterwards.
texture<unsigned char, 2> tex;
// Device-side image storage: allocated and filled by setupTexture(),
// released by deleteTexture(). NULL until setupTexture() has been called.
static cudaArray *array = NULL;

// Performs the actual Sobel convolution for one pixel.
//
// The nine unsigned char arguments are the pixel's 3x3 neighbourhood, row by
// row from upper left to lower right. The centre sample (mm) is accepted so
// that callers can pass the full neighbourhood, but it does not contribute
// to the result.
//
// brightness scales the summed gradient magnitude |Horz| + |Vert|.
//
// Returns the scaled magnitude, truncated towards zero and clamped to the
// range [0, 255].
//
// Callable from host code as well as device code, so the same routine can
// serve as a CPU reference for the kernels.
__host__ __device__ unsigned char
ComputeSobel(unsigned char ul, // upper left
             unsigned char um, // upper middle
             unsigned char ur, // upper right
             unsigned char ml, // middle left
             unsigned char mm, // middle (unused)
             unsigned char mr, // middle right
             unsigned char ll, // lower left
             unsigned char lm, // lower middle
             unsigned char lr, // lower right
             float brightness )
{
    // Horizontal and vertical gradients; each lies within [-1020, 1020].
    const int Horz = ur + 2*mr + lr - ul - 2*ml - ll;
    const int Vert = ul + 2*um + ur - ll - 2*lm - lr;

    // Clamp while the value is still a float. The scaled sum can be far
    // larger than any small integer type can hold (e.g. 2040 * brightness),
    // and converting an out-of-range float to an integer is undefined, so
    // the range check must happen before the conversion, not after it.
    const float scaled = brightness * (float)(abs(Horz) + abs(Vert));
    return (unsigned char) fminf( fmaxf( scaled, 0.0f ), 255.0f );
}

// Applies the Sobel filter to every column of an image row.
//
// Launch layout (see sobelFilter): one block per image row, with blockIdx.x
// selecting the row. The threads of a block walk across the row in steps of
// blockDim.x, so a block that is narrower than the image still processes
// every column -- some threads simply perform several Sobel operations.
//
// oImage      output buffer; the result for (row, column) is written to
//             oImage[row * w + column]
// w           number of columns to process per row
// brightness  scale factor forwarded to ComputeSobel
//
// Neighbours are fetched through the texture `tex`; how coordinates outside
// the image are resolved depends on the texture's addressing mode, which is
// not configured in this file.
//
// Original author's note: this kernel uses 21 registers.
__global__ void 
FullImageSobel( unsigned char *oImage, int w, float brightness )
{
    unsigned char *outRow = oImage + blockIdx.x * w;
    const float y = (float) blockIdx.x;

    for( int col = threadIdx.x; col < w; col += blockDim.x ) {
        const float x = (float) col;

        // 3x3 neighbourhood around (col, row), fetched row by row
        const unsigned char upLeft   = tex2D( tex, x - 1.0f, y - 1.0f );
        const unsigned char upMid    = tex2D( tex, x,        y - 1.0f );
        const unsigned char upRight  = tex2D( tex, x + 1.0f, y - 1.0f );
        const unsigned char midLeft  = tex2D( tex, x - 1.0f, y );
        const unsigned char centre   = tex2D( tex, x,        y );
        const unsigned char midRight = tex2D( tex, x + 1.0f, y );
        const unsigned char lowLeft  = tex2D( tex, x - 1.0f, y + 1.0f );
        const unsigned char lowMid   = tex2D( tex, x,        y + 1.0f );
        const unsigned char lowRight = tex2D( tex, x + 1.0f, y + 1.0f );

        outRow[col] = ComputeSobel( upLeft,  upMid,  upRight,
                                    midLeft, centre, midRight,
                                    lowLeft, lowMid, lowRight, brightness );
    }
}

// Applies the Sobel filter to the leftmost columns of an image row.
//
// Launch layout (see sobelFilter): one block per image row, with blockIdx.x
// selecting the row. Each thread handles exactly one column, threadIdx.x,
// so only the first blockDim.x columns of the row are filtered; the
// remaining columns of oImage are left untouched.
//
// oImage      output buffer; the result for (row, column) is written to
//             oImage[row * w + column]
// w           number of columns in a row
// brightness  scale factor forwarded to ComputeSobel
//
// Threads whose column lies at or beyond w do nothing, so a block that is
// wider than the image cannot write past the end of its row.
//
// Original author's note: this kernel uses 14 registers.
__global__ void 
VariableSobel( unsigned char *oImage, int w, float brightness)
{
	const int i = threadIdx.x;

	// Guard against a block wider than the image: without it the store
	// below would land in the next row or beyond the end of the buffer.
	if ( i >= w ) return;

	unsigned char *oRow = oImage + blockIdx.x * w;

	// 3x3 neighbourhood around (i, row), fetched row by row
	unsigned char pix00 = tex2D( tex, (float) i-1, (float) blockIdx.x-1 );
	unsigned char pix01 = tex2D( tex, (float) i+0, (float) blockIdx.x-1 );
	unsigned char pix02 = tex2D( tex, (float) i+1, (float) blockIdx.x-1 );
	unsigned char pix10 = tex2D( tex, (float) i-1, (float) blockIdx.x+0 );
	unsigned char pix11 = tex2D( tex, (float) i+0, (float) blockIdx.x+0 );
	unsigned char pix12 = tex2D( tex, (float) i+1, (float) blockIdx.x+0 );
	unsigned char pix20 = tex2D( tex, (float) i-1, (float) blockIdx.x+1 );
	unsigned char pix21 = tex2D( tex, (float) i+0, (float) blockIdx.x+1 );
	unsigned char pix22 = tex2D( tex, (float) i+1, (float) blockIdx.x+1 );
	oRow[i] = ComputeSobel( pix00, pix01, pix02, 
	                        pix10, pix11, pix12,
	                        pix20, pix21, pix22, brightness );
}

// Copies the image from the texture `tex` into the buffer oImage so that it
// can be drawn, scaling every sample by brightness on the way.
//
// Launch layout (see sobelFilter): one block per image row, with blockIdx.x
// selecting the row; the threads of a block walk across the row in steps of
// blockDim.x.
//
// oImage      output buffer; the value for (row, column) is written to
//             oImage[row * w + column]
// w           number of columns to copy per row
// h           image height; not used by the kernel body
// brightness  factor applied to each sample; the product is limited to
//             [0, 255] before it is stored
__global__ void 
CopyImage( unsigned char *oImage, int w, int h, float brightness )
{
    unsigned char *dstRow = oImage + blockIdx.x * w;
    const float y = (float) blockIdx.x;

    for ( int col = threadIdx.x; col < w; col += blockDim.x ) {
        const float scaled = tex2D( tex, (float) col, y ) * brightness;
        dstRow[col] = min( max( scaled, 0.f ), 255.f );
    }
}

// Allocates the device array that holds the image and copies the host
// pixels into it.
//
// iw, ih   image width and height, used as the array dimensions
// pixels   host buffer holding the image; Bpp * iw * ih bytes are copied
// Bpp      bytes per pixel: 1 selects an unsigned char channel format,
//          any other value selects uchar4
//
// The result is stored in the file-level `array`. An array left over from an
// earlier call is not released here, so each call should be paired with
// deleteTexture().
// NOTE(review): `tex` is declared with unsigned char elements, so only
// Bpp == 1 appears to match the texture the kernels read from -- confirm
// whether the uchar4 path is ever used.
extern "C" void setupTexture(int iw, int ih, unsigned char *pixels, int Bpp)
{
    const cudaChannelFormatDesc desc = (Bpp == 1)
        ? cudaCreateChannelDesc<unsigned char>()
        : cudaCreateChannelDesc<uchar4>();
    const size_t imageBytes = Bpp*sizeof(unsigned char)*iw*ih;

    cutilSafeCall(cudaMallocArray(&array, &desc, iw, ih));
    cutilSafeCall(cudaMemcpyToArray(array, 0, 0, pixels, imageBytes, cudaMemcpyHostToDevice));
}

// Clean up: releases the device array that holds the image.
//
// The file-level `array` pointer is reset to NULL afterwards so that it
// never dangles. A repeated call therefore passes NULL to cudaFreeArray,
// which performs no operation for a null array, instead of freeing the
// same array twice.
extern "C" void deleteTexture(void)
{
    cutilSafeCall(cudaFreeArray(array));
    array = NULL;
}

// Binds the image stored in `array` to the texture `tex`, runs the kernel
// that belongs to the requested display mode and unbinds the texture again.
// While the texture is bound, the kernels read the image through the
// texture unit of the device.
//
// oImage      device output buffer, written as oImage[row * iw + column]
// iw, ih      image width and height; every kernel is launched with one
//             block per row (ih blocks)
// mode        SOBELDISPLAY_IMAGE       brightness-scaled copy of the image
//             SOBELDISPLAY_SOBEL       copy of the image, with the leftmost
//                                      sobelWidth columns Sobel-filtered
//             SOBELDISPLAY_SOBEL_FULL  Sobel filter over the whole image
// brightness  scale factor forwarded to the kernels
// sobelWidth  number of columns to filter in SOBELDISPLAY_SOBEL mode; it is
//             limited to the image width and to 512 threads per block, and
//             a value of zero or less leaves the plain copy untouched
//
// Launch or execution errors are reported on stdout; the function itself
// returns nothing.
extern "C" void sobelFilter(unsigned char *oImage, int iw, int ih, enum SobelDisplayMode mode, float brightness, int sobelWidth) 
{
    cutilSafeCall(cudaBindTextureToArray(tex, array));

    switch ( mode ) {
        case SOBELDISPLAY_IMAGE:
            CopyImage<<<ih, min(iw, 512)>>>( oImage, iw, ih, brightness );
            break;
        case SOBELDISPLAY_SOBEL: {
            CopyImage<<<ih, min(iw, 512)>>>( oImage, iw, ih, brightness );

            // VariableSobel handles one column per thread, so the block must
            // never be wider than the image (that would write past the end
            // of a row) and must contain at least one thread (a zero-sized
            // block is an invalid launch configuration).
            const int sobelThreads = min(min(sobelWidth, iw), 512);
            if ( sobelThreads > 0 ) {
                VariableSobel<<<ih, sobelThreads>>>( oImage, iw, brightness );
            }
            break;
        }
        case SOBELDISPLAY_SOBEL_FULL:
            FullImageSobel<<<ih, min(iw, 384)>>>( oImage, iw, brightness ); // 384 ~ 8192 / 21
            break;
    }

    // Wait for the kernels to finish, then collect (and clear) any error.
    // The result of the synchronisation is kept as well, so that a failure
    // reported only through its return value is not lost.
    const cudaError_t syncError = cudaThreadSynchronize();
    cudaError_t error = cudaGetLastError();
    if (error == cudaSuccess)
        error = syncError;
    if (error != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(error));

    cutilSafeCall(cudaUnbindTexture(tex));
}
