

#include "nefft.h"
#include <math.h>





/* internally used subroutine to get the
   integer equispaced node for a given nonequispaced node
   input: nonequispaced node x in [-M_PI,M_PI]
   restult: neaarest in [-FFT_N/2-L/2..FFT_N/2+L/2]
*/
int getintminnode(double x,int FFT_N ,int L) {
  if(L&1) /* odd length like in paper:  -L/2..L/2  */
    return (int)rint(.5*x*FFT_N/M_PI)-L/2;
  /* even L: -L/2..L/2-1 */
  return (int)ceil(.5*x*FFT_N/M_PI)-L/2;
}

/* NAG Bessel_io Routine: */
double s18aef_(double *,int *);

/* bessel routine from natlib */
double i0(double x);

double bessel_i0(double x)
{
// /* use io from bessel.c / netlib */

  return i0(x);

/* uncomment the following if you are using NAG: */
/*
  int ifail;  
  return s18aef_(&x,&ifail);
*/
}

/*
 * used internally to copute the Kaise Bessel
 * (kb) weight
 */

double kb_weight(double x,double a,double b) {

  if(fabs(x)>fabs(a))
    return 0;
  x=b*sqrt(a*a-x*x);
  return bessel_i0(x);
}

/*
 * used internally to compute the Kaisser Bessel
 * (kb) window
 */
double kb_coef(double y,double a, double b) {

  double scale=M_PI;
  if(fabs(y)==b)
    return a/scale;
  if(fabs(y)>b)
    return sin(a*sqrt(y*y-b*b))/sqrt(y*y-b*b)/scale;
  else
    return sinh(a*sqrt(b*b-y*y))/sqrt(b*b-y*y)/scale;
}

/*
 * compute the approximation error
 * for our expansion of e^(ikx_0)
 */
double nefft_calc_approx_err(int N,int FFT_N,int L,
			     REALTYPE x0,REALTYPE *WeightDiv,REALTYPE *coef
			     )
{
  double sr,si,e;
  double e_max;
  int i,j,k,l0;

  sr=si=0.0;
  e_max=0.0;
  for(i=0;i!=N;++i) {
    k=i-N/2; 
    l0=getintminnode(x0,FFT_N,L);
    for(j=0;j!=L;++j) {
      sr+=(double)coef[j]*(double)WeightDiv[i]*cos((double)(2.0*M_PI*k*(l0+j)/FFT_N));
      si+=(double)coef[j]*(double)WeightDiv[i]*sin((double)(2.0*M_PI*k*(l0+j)/FFT_N));
    }
    sr-=cos((double)x0*k);
    si-=sin((double)x0*k);
    e=sr*sr+si*si;
    if(e>e_max)
      e_max=e;
  }
  e_max=sqrt(e_max);
  return e_max; 
}

/*
 * compute the total approximatino error
 */
double nefft_approx_err(int N,int FFT_N,int L,
				   int M,REALTYPE *Nodes,
				   int *IntNodes,
				   REALTYPE *WeightDiv,
				   REALTYPE *Coef)
{

  int l;
  double e_max,e;
  e_max=0;
  for(l=0 ; l!=M ; ++l) {
    e=nefft_calc_approx_err(N,FFT_N,L,Nodes[l],WeightDiv,Coef+l*L);
    if(e>e_max)
      e_max=e;
  }
  return e_max;
}

/*
 * creates the weight vector \phi_k
 * k=0...N-1
 * k must be of size N
 * FFT_N is the size of the FFT: FFT_N = cN
 * where c is the oversampling
 * alpha can be set to 0 for default choice
 * L is the total length of the interpolation (=2K)
 * typically 7, 13 for float, double resp.
 */

void init_nefft_kb_weight(int N,int FFT_N,int L,
			  REALTYPE *WeightDiv,
			  REALTYPE alpha)
{

  int i;
  REALTYPE L2=(REALTYPE)L/2.0;
  REALTYPE scale=kb_weight(0,M_PI*alpha,L2);

  /*  default alpha if alpha=0.0 */
  REALTYPE defalpha=2.0-1.0*N/FFT_N-.01;
  if(alpha<=.1 || alpha >defalpha)
    alpha=defalpha;

  for(i=0;i!=N;++i)
    WeightDiv[i]=1.0/(kb_weight( 2.0*M_PI*(i-N/2)/FFT_N , M_PI*alpha,1.0*L2)/scale);

}

/*
 * used internally to
 * create the window vector \phi_{l,m}
 * nodes is nodes[0...M-1]
 * IntNodes is IntNodes[0..M-1]
 * Coef is Coef[0...LM-1]
 * FFT_N is the size of the FFT: FFT_N = cN
 * where c is the oversampling
 * alpha can be set to 0 for default choice
 * L is the total length of the interpolation (=2K)
 * typically 7, 13 for float, double resp.
 */

void init_nefft_kb_coef(int N,int FFT_N,int L,
			int M,REALTYPE *Nodes,
			int *IntNodes,
			REALTYPE *Coef,
			REALTYPE alpha) {
  int i,j,l;
  REALTYPE L2=(REALTYPE)L/2.0;
  REALTYPE scale=kb_weight(0,M_PI*alpha,L2);

  /*  default alpha if alpha=0.0 */
  REALTYPE defalpha=2.0-1.0*N/FFT_N-.01;
  if(alpha<=.1 || alpha >defalpha)
    alpha=defalpha;
  
  for(i=0;i!=M;++i) {
    IntNodes[i]=getintminnode(Nodes[i],FFT_N,L);
    if(IntNodes[i]<0) /* bereich <0 nach >N/2 spiegeln (fr 0..N FFT) */
      IntNodes[i]+=FFT_N;
  }

  for(l=0;l!=M;++l) {
    for(j=0;j!=L;++j) {
      REALTYPE dx=.5*FFT_N/M_PI*Nodes[l]-(getintminnode(Nodes[l],FFT_N,L)+j);
      /*if(fabs(dx)>L2)
	cerr << "ERROR! unexpected dx: " << dx << endl; */
      Coef[l*L+j]=kb_coef(dx,M_PI*alpha,L2)/scale;
    }
  }

}

/*
 * this routine does the initalisation
 * for the nonequispaced fourier transform
 * (ned as well as ner)
 * it basically calls init_nefft_kb_coef and
 * init_nefft_kb_weight
 *
 * input parameters
 * N: the size of fourier transform
 * FFT_N: sizeof the internally used
 * fft (see below)
 * L: length of approximatoin, L=2K+1 
 * (see below)
 * M: number of nonequispaced nodes.
 * typically similar to N
 
 * Output parameters: coefficients
 * used for nonequispaced fast fourier transform
 * IntNodes,WeightDiv,Coef
 *
 * the approximation accuracy is
 * determined by the oversamping 
 * FFT_N/N and the expansion size L
 * a good choice is FFT_N about 2*N
 * with FFT_N FFT-friendly and
 * L=7,13 for single, double precision resp.
 * This routine is also used for 2d neffts.
 * In this case it has to be called twice,
 * once for the x coordinates and once for the
 * y coordinates of the nonequispaced nodes.
 * In this case M is most likely about N*N
 */

void init_nefft_kb(int N,int FFT_N,int L,
		   int M,REALTYPE *Nodes,
		   int *IntNodes,
		   REALTYPE *WeightDiv,
		   REALTYPE *Coef) {

  int i,j,l;
  REALTYPE L2=(REALTYPE)L/2.0;
  /*  use default alpha  */
  REALTYPE alpha=2.0-1.0*N/FFT_N-.01;

  REALTYPE scale=kb_weight(0,M_PI*alpha,L2);


  init_nefft_kb_weight(N,FFT_N,L,WeightDiv,alpha);

  init_nefft_kb_coef(N,FFT_N,L,M,Nodes,IntNodes,
		     Coef,alpha);

}

/*********************************** end of init part **********/

/*
  rtemp must be of size 2(FFT_N+L)
*/

void nedgridding(REALTYPE *rtemp,const REALTYPE *rinput,
		 const int M,const int FFT_N,const int L,
		 const int *IntNodes,const REALTYPE *P
		 )
     
{
  int l,j,jc;
  const REALTYPE *PP;
  REALTYPE ir,ii;
  REALTYPE *rUU;
  for(l=0;l!=(FFT_N+L)*2;++l) /* will working space with zeros */
    rtemp[l]=0.0;
  if(L==7) { /* loop unrolling for L=7 */
    for(l=0;l<M;++l) {
      PP=P+l*L;
      rUU=rtemp+IntNodes[l]*2;
      ir=rinput[2*l];
      ii=rinput[2*l+1];
      rUU[0]+=PP[0]*ir;
      rUU[1]+=PP[0]*ii;
      rUU[2]+=PP[1]*ir;
      rUU[3]+=PP[1]*ii;
      rUU[4]+=PP[2]*ir;
      rUU[5]+=PP[2]*ii;
      rUU[6]+=PP[3]*ir;
      rUU[7]+=PP[3]*ii;
      rUU[8]+=PP[4]*ir;
      rUU[9]+=PP[4]*ii;
      rUU[10]+=PP[5]*ir;
      rUU[11]+=PP[5]*ii;
      rUU[12]+=PP[6]*ir;
      rUU[13]+=PP[6]*ii;

    }
  }
  else {
    for(l=0;l<M;++l) {
      PP=P+l*L;
      rUU=rtemp+IntNodes[l]*2;
      ir=rinput[2*l];
      ii=rinput[2*l+1];
      for(j=jc=0;j!=L;++j) {
	rUU[jc++]+=PP[j]*ir;
	rUU[jc++]+=PP[j]*ii;
      }
    }
  }

  for(j=0;j!=2*L;++j)             /*  copy wrap-around buffer at end back */
    rtemp[j]+=rtemp[2*FFT_N+j];    /* to beginning (did no modulo calc in loop */
}

void nedscaling(REALTYPE *routput,const REALTYPE *rtemp,
		const REALTYPE *weight,
		const int n,const int FFT_N)
{
  int i;
  for(i=0;i<n-n/2;++i) {
    routput[n+2*i  ]=rtemp[2*i]*weight[n/2+i];
    routput[n+2*i+1]=rtemp[2*i+1]*weight[n/2+i];
  }
  for(i=0;i<n/2;++i) {
    routput[2*i]  =rtemp[2*FFT_N-n+2*i    ]*weight[i]; 
    routput[2*i+1]=rtemp[2*FFT_N-n+2*i+1]*weight[i]; 
  }
}


void nergridding(REALTYPE *routput,REALTYPE *rtemp,
		 int M,int FFT_N,int L,
		 const int *IntNodes,const REALTYPE *coef)
{

  int j,l,lc;
  const REALTYPE *QQ;
  const REALTYPE *xtemp;
  REALTYPE sr,si;

  for(j=0;j!=2*L;++j)
    rtemp[(2*FFT_N)+j]=rtemp[j];

  /* compute gridding (convolution) 
     and swap [0..N] to [-N/2,N/2] to convert
     symmetric FFT to zero based FFT as provided
     by external FFT package
  */

  if(L==7) { /* loop unrolling for L=7 */
    for(j=0;j<M;++j) {
      QQ=coef+j*L;
      xtemp=rtemp+2*IntNodes[j];
      routput[2*j]  
	=QQ[0]*xtemp[0]
	+QQ[1]*xtemp[2]
	+QQ[2]*xtemp[4]
	+QQ[3]*xtemp[6]
	+QQ[4]*xtemp[8]
	+QQ[5]*xtemp[10]
	+QQ[6]*xtemp[12];
      
      routput[2*j+1]
	=QQ[0]*xtemp[1]
	+QQ[1]*xtemp[3]
	+QQ[2]*xtemp[5]
	+QQ[3]*xtemp[7]
	+QQ[4]*xtemp[9]
	+QQ[5]*xtemp[11]
	+QQ[6]*xtemp[13];

    }
  } else {
    if(L==13) { /* loop unrolling for L=13 */
      for(j=0;j<M;++j) {
	QQ=coef+j*L;
	xtemp=rtemp+2*IntNodes[j];
	routput[2*j]  
	  =QQ[0]*xtemp[0]
	  +QQ[1]*xtemp[2]
	  +QQ[2]*xtemp[4]
	  +QQ[3]*xtemp[6]
	  +QQ[4]*xtemp[8]
	  +QQ[5]*xtemp[10]
	  +QQ[6]*xtemp[12]
	  +QQ[7]*xtemp[14]
	  +QQ[8]*xtemp[16]
	  +QQ[9]*xtemp[18]
	  +QQ[10]*xtemp[20]
	  +QQ[11]*xtemp[22]
	  +QQ[12]*xtemp[24];
	
	routput[2*j+1]
	  =QQ[ 0]*xtemp[ 1]
	  +QQ[ 1]*xtemp[ 3]
	  +QQ[ 2]*xtemp[ 5]
	  +QQ[ 3]*xtemp[ 7]
	  +QQ[ 4]*xtemp[ 9]
	  +QQ[ 5]*xtemp[11]
	  +QQ[ 6]*xtemp[13]
	  +QQ[ 7]*xtemp[15]
	  +QQ[ 8]*xtemp[17]
	  +QQ[ 9]*xtemp[19]
	  +QQ[10]*xtemp[21]
	  +QQ[11]*xtemp[23]
	  +QQ[12]*xtemp[25];
      }
    } 
    else {
      for(j=0;j<M;++j) {
	QQ=coef+j*L;
	xtemp=rtemp+2*IntNodes[j];
	sr=si=0.0;
	for(l=lc=0;l<L;++l) {
	  sr+=QQ[l]*xtemp[lc++];
	  si+=QQ[l]*xtemp[lc++]; 
	}
	routput[2*j]  =sr;
	routput[2*j+1]=si;
      }
    }
  }
}
void nerscaling(REALTYPE *rtemp,const REALTYPE *rinput,
		const REALTYPE *weight,
		int n,int FFT_N
		) {
  REALTYPE *rdest;

  int i,j,k,l;
  const REALTYPE *rcsrc;
  const REALTYPE *rrsrc;

  rdest=rtemp;
  rcsrc=rinput+n/2*2;
  rrsrc=weight+n/2;
  for(i=0;i!=n-n/2;++i) {
    rdest[2*i  ]=rcsrc[2*i  ]*rrsrc[i];
    rdest[2*i+1]=rcsrc[2*i+1]*rrsrc[i];
  }
  
  rdest=rtemp+(FFT_N-n/2)*2;  /*(REAL*)(Temp1+FFT_N-n/2); */
  rcsrc=rinput;
  rrsrc=weight;
  for(i=0;i!=n/2;++i) {
    rdest[2*i  ]=rcsrc[2*i  ]*rrsrc[i];
    rdest[2*i+1]=rcsrc[2*i+1]*rrsrc[i];
  }
  
  rdest=rtemp+(n-n/2)*2; /*(REAL*)(Temp1+n-n/2); */
  for(i=0;i!=(FFT_N-n)*2;++i)   /* fill in between with zeroes */
    rdest[i]=0.0;      
}

/* like nerscaling, but with real valued input data */
void nerscaling_real(REALTYPE *rtemp,const REALTYPE *rinput,
		     const REALTYPE *weight,
		     int n,int FFT_N
		     ) {
  REALTYPE *rdest;

  int i,j,k,l;
  const REALTYPE *rcsrc;
  const REALTYPE *rrsrc;

  rdest=rtemp;
  rcsrc=rinput+n/2;
  rrsrc=weight+n/2;
  for(i=0;i!=n-n/2;++i) {
    rdest[i]=rcsrc[i]*rrsrc[i];
  }
  
  rdest=rtemp+(FFT_N-n/2);  /*(REAL*)(Temp1+FFT_N-n/2); */
  rcsrc=rinput;
  rrsrc=weight;
  for(i=0;i!=n/2;++i) {
    rdest[i]=rcsrc[i]*rrsrc[i];
  }
  
  rdest=rtemp+(n-n/2); /*(REAL*)(Temp1+n-n/2); */
  for(i=0;i!=(FFT_N-n);++i)   /* fill in between with zeroes */
    rdest[i]=0.0;      
}


void ner2gridding(REALTYPE *rResult,const REALTYPE *rTempBase,
		  const int *xIntNodesPtr,const int *yIntNodesPtr,
		  const REALTYPE *xCoef,const REALTYPE *yCoef,
		  int FFT_Nx,int FFT_Ny,int M,int L) {
  
  REALTYPE sr,si;
  REALTYPE ssr,ssi;
  const REALTYPE *rtemp;
  int ll1,ll2,j,l1,l2,l1_0,l2_0;

  for(j=0;j!=M;++j) {
    sr=si=0.0;
    l1_0=xIntNodesPtr[j];
    l2_0=yIntNodesPtr[j]*2;
    ll1=l1_0;
    rtemp=rTempBase+(ll1*FFT_Ny*2);
    for(l1=0;l1!=L;++l1) {
      ssr=ssi=0.;
      if(ll1>=FFT_Nx) {   
	ll1-=FFT_Nx;
	rtemp=rTempBase+(ll1*FFT_Ny*2);
      }
      ll2=l2_0;
      for(l2=0;l2!=L;++l2) {
	if(ll2>=FFT_Ny*2)  /* entfernen aus der inneren Schleife bringt 10%! */
	  ll2-=FFT_Ny*2;	
	ssr+=yCoef[l2]*rtemp[ll2++];
	ssi+=yCoef[l2]*rtemp[ll2++];	    
      }
      sr+=xCoef[l1]*ssr;
      si+=xCoef[l1]*ssi;
      ++ll1;
      rtemp+=FFT_Ny*2;
    }
    rResult[2*j  ]=sr;
    rResult[2*j+1]=si;	
    xCoef+=L;
    yCoef+=L;
  }
}



void ned2gridding(REALTYPE *rtemp,const REALTYPE *rinput,
		  const int *xIntNodesPtr,const int *yIntNodesPtr,
		  const REALTYPE *xCoef,const REALTYPE *yCoef,		  
		  int FFT_Nx,int FFT_Ny,int M,int L)
{


  REALTYPE *TempPtr;
  int TempLength=2*FFT_Nx*FFT_Ny;
  REALTYPE *rtempEnd=rtemp+TempLength;
  REALTYPE tr,ti;
  int x0,jj,y0,kk,j,l,k;

  for(j=0;j!=FFT_Nx*FFT_Ny*2;++j) {
    rtemp[j]=0.0;
  }

  for(l=0;l!=M;++l) {
    x0=xIntNodesPtr[l];
    y0=yIntNodesPtr[l]*2;
    TempPtr=rtemp+2*FFT_Ny*x0; 

    for(j=0;j!=L;j++) {
      tr=rinput[2*l  ]*xCoef[j];
      ti=rinput[2*l+1]*xCoef[j];
      kk=y0;
      for(k=0;k!=L;++k) {	  
	if(kk>=2*FFT_Ny)
	  kk-=2*FFT_Ny;
	TempPtr[kk++]+=tr*yCoef[k];
	TempPtr[kk++]+=ti*yCoef[k];
      }
      TempPtr+=2*FFT_Ny;
      if(TempPtr>=rtempEnd)
	TempPtr-=TempLength;
    }
    yCoef+=L;
    xCoef+=L;
  }
}

void ner2scalling(REALTYPE *rTemp,REALTYPE *rInput,
		  const REALTYPE *xWeight,const REALTYPE *yWeight,
		  int Nx,int Ny,int FFT_Nx,int FFT_Ny
		  ) {
  REALTYPE *t1,*t2,*t3,*t4;
  const REALTYPE *i1,*i2,*i3,*i4;
  int i,j;
  REALTYPE xw1,xw2;

  for(i=0;i!=FFT_Nx*FFT_Ny*2;++i) {
    rTemp[i]=0.0;
  }

  for(i=0;i!=Nx/2;++i) {
    t1=rTemp+(i*FFT_Ny)*2;
    t2=rTemp+((FFT_Nx-Nx/2+i)*FFT_Ny)*2;
    t3=rTemp+(i*FFT_Ny+FFT_Ny-Ny/2)*2;
    t4=rTemp+((FFT_Nx-Nx/2+i)*FFT_Ny+FFT_Ny-Ny/2)*2;
    i1=rInput+((Nx/2+i)*Ny+Ny/2)*2;
    i2=rInput+(i*Ny+Ny/2)*2;
    i3=rInput+((Nx/2+i)*Ny)*2;
    i4=rInput+(i*Ny)*2;
    xw1=xWeight[Nx/2+i];
    xw2=xWeight[i];
    for(j=0;j!=Ny/2;++j) {
      *(t1++)=*(i1++)*xw1*yWeight[Ny/2+j];
      *(t1++)=*(i1++)*xw1*yWeight[Ny/2+j];
      *(t2++)=*(i2++)*xw2*yWeight[Ny/2+j];
      *(t2++)=*(i2++)*xw2*yWeight[Ny/2+j];
      *(t3++)=*(i3++)*xw1*yWeight[j];
      *(t3++)=*(i3++)*xw1*yWeight[j];
      *(t4++)=*(i4++)*xw2*yWeight[j];
      *(t4++)=*(i4++)*xw2*yWeight[j];
    }
  }

}

void ned2scalling(REALTYPE *rOutput,const REALTYPE *rTemp,
		  const REALTYPE *xWeight,const REALTYPE *yWeight,
		  int Nx,int Ny,int FFT_Nx,int FFT_Ny
		  ) {
  const REALTYPE *t1,*t2,*t3,*t4;
  REALTYPE *o1,*o2,*o3,*o4;
  int i,j;
  REALTYPE xw1,xw2;


  for(i=0;i!=Nx/2;++i) {
    t1=rTemp+(i*FFT_Ny)*2;
    t2=rTemp+((FFT_Nx-Nx/2+i)*FFT_Ny)*2;
    t3=rTemp+(i*FFT_Ny+FFT_Ny-Ny/2)*2;
    t4=rTemp+((FFT_Nx-Nx/2+i)*FFT_Ny+FFT_Ny-Ny/2)*2;
    o1=rOutput+((Nx/2+i)*Ny+Ny/2)*2;
    o2=rOutput+(i*Ny+Ny/2)*2;
    o3=rOutput+((Nx/2+i)*Ny)*2;
    o4=rOutput+(i*Ny)*2;
    xw1=xWeight[Nx/2+i];
    xw2=xWeight[i];
    for(j=0;j!=Ny/2;++j) {
      *(o1++)=*(t1++)*xw1*yWeight[Ny/2+j];
      *(o1++)=*(t1++)*xw1*yWeight[Ny/2+j];
      *(o2++)=*(t2++)*xw2*yWeight[Ny/2+j];
      *(o2++)=*(t2++)*xw2*yWeight[Ny/2+j];
      *(o3++)=*(t3++)*xw1*yWeight[j];
      *(o3++)=*(t3++)*xw1*yWeight[j];
      *(o4++)=*(t4++)*xw2*yWeight[j];
      *(o4++)=*(t4++)*xw2*yWeight[j];
    }
  }

}

/* dim of temp array:
   at least N*(2*L+6), i.e. 2NL for Matrix C, 2N for vector b and 4*N for nag_temp
   */

