/* ----------------------------------------------------------------------
 * onearray_run_scg_fast_logsph_c.c
 *
 * C implementation of the "run_scg_fast.m" function,
 *   fully compatible with MEX.
 *
 * It is used for example in "FULL_detect_locate.m" and
 * "FAST_detect_locate.m".
 *
 * This is an implementation of the Scaled Conjugate Gradient (SCG)
 * algorithm (NN 1993, vol. 6 article from M. Moller).
 *
 * The SCG part was written in a completely generic manner.
 * The cost/gradient part was written for a microphone array application.
 *
 * -> for another application, just rewrite "dE_E_fun()" in this file,
 *    and recompile with mex.
 *
 * General note: I tried using "*ptr++" instead of "array_ptr[ index ]",
 * but it did not give me any speed improvement at all (GCC on pentium4).
 % -> so I chose "array_ptr[ index ]" for clarity.
 * 
 * Specific note: dE_E_fun() can deal with increase or decrease of
 * dimensionality while doing the SCG descent.  In the case of
 * increase, it will re-allocate memory automatically.
 *
 * Finally, "log spherical" means that radius is expressed in log domain,
 * which is a more convenient domain to model a strictly positive variable.
 * Hence the space is:  ( azimuth in radians,
 *                        elevation in radians,
 *                        log( radius ) where radius is in meters )
 * 
 *   
 * ----------
 *
 * MATLAB syntax: 
 *
 * ws = onearray_run_scg_fast_logsph_c( ws )
 *
 * where "ws" is a MATLAB structure containing parameters, inputs and outputs.
 *
 * The initialization is "ws.w_1", the output is "ws.w_kp1".
 * See "run_scg_fast.m" for more info.
 *
 * 
 * 
 * G. Lathoud 2005
 * lathoud@idiap.ch
 */

#include <math.h>
#include <string.h>
#include "mex.h"

/* 1 or 0 */
#define VERBOSE             0

#if     VERBOSE == 1
#include "stdio.h"
#endif

/* Input Argument */

#define  N_INPUTS        1

#define  WS_IN           prhs[0]


/* Output Argument */

#define  N_OUTPUTS       1

#define  WS_OUT          plhs[0]


/* Misc */

#if !defined(MAX)
#define MAX(A, B)       ((A) > (B) ? (A) : (B))
#endif

#if !defined(MIN)
#define MIN(A, B)       ((A) < (B) ? (A) : (B))
#endif

#define PI 3.14159265


void dE_E_fun( const double* logsph_ptr, const unsigned int logsph_numel, double* dE_ptr, double* E_ptr, const unsigned short compute_E, short force_init_or_destroy );


#if VERBOSE == 1
/* ---------------------------------------------------------------------- */
/* debugging functions */

void print_double_array( double* doublePtr, unsigned int N, const char * a_string ) {

  unsigned int n;

  printf( "\n" );
  printf( a_string );
  printf( "[]: " );

  for( n = 0; n < N; n++ ) {
    printf( " [%d]:%g,", n, doublePtr[n] );
  }

  printf( "\n" );
  
}

void print_unsigned_short_array( unsigned short* usPtr, unsigned int N, const char * a_string ) {

  unsigned int n;
  unsigned short * usmax;

  printf( "\n" );
  printf( a_string );
  printf( "[]: " );

  for( n = 0; n < N; n++ ) {
    printf( " [%d]:%d,", n, usPtr[n] );
  }

  printf( "\n" );
  
}
#endif


/* ----------------------------------------------------------------------
 * Wrapper: this is the gateway function to MATLAB
 * ---------------------------------------------------------------------- */

void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray*prhs[] )

{

  unsigned int   N_sizeof_double;
  
  double          *w_k_ptr, *p_k_ptr, *r_k_ptr, *w_k_sigma_ptr, *w_k_alpha_ptr, *s_k_ptr;
  double          *dE_w_k_ptr, *E_w_k_ptr, *dE_w_k_sigma_ptr, *dE_w_k_alpha_ptr, *E_w_k_alpha_ptr;
  double          *dE_w_kp1_ptr, *E_w_kp1_ptr, *r_kp1_ptr, *p_kp1_ptr;
  

  /* Input: one mandatory parameter */
  mxArray   *w_1_mx;
  
  double    *w_1_ptr;
  
  /* Output: one parameter */

  mxArray   *w_kp1_mx;
  double    *w_kp1_ptr;

  /* Other parameters */
  
  mxArray   *sigma_mx,  *lambda_1_mx,  *min_iter_mx,  *max_iter_mx,  *max_iter_nosuccess_mx;
  double    sigma, lambda_1, min_iter, max_iter, max_iter_nosuccess;
  
  mxArray   *verbose_mx,    *cv_thr_mx;
  double    verbose, cv_thr;
  
  /* Variables used internally, during processing */

  
  double   lambda_k, lambda_bar_k, norm_p_k_square, norm_p_k, sigma_k;
  double   delta_k, alpha_k, DELTA_k, mu_k, beta_k;
  
  unsigned int N;  /* Dimensionality */
  unsigned int success;  /* 0 or 1 */
  unsigned int k, iter_nosuccess;  /* Count of iterations */

  unsigned int has_converged, do_continue;  /* For the termination test */

  /* crappy temporary variables */


  double *        doublePtr;  
  char *          c_string;
  unsigned int    a,b,c,d,e, i,j, m,n,o,p,q, t, u, v;
  double          aa,bb,cc,dd,ee, rr, ss,tt,uu,vv, xx, yy, zz;
  int  * tmp_int_array;
  int  * tmp_int_array2;
  
  c_string = (char*)malloc(2000*sizeof(char));
  
  /*---------------------------------------------------------------------- */
  /* ( 1 ) Check that we have enough input arguments */
  
  if ( nrhs != N_INPUTS ) {
    sprintf( c_string, "onearray_run_scg_fast_logsph_c.c: %d input argument(s) required.",N_INPUTS );
    mexErrMsgTxt( c_string );
  }
  
  /* Check we have at least one output argument */

  if ( nlhs != N_OUTPUTS ) {
    sprintf( c_string, "onearray_run_scg_fast_logsph_c.c: %d output argument(s) required.",N_OUTPUTS );
    mexErrMsgTxt( c_string );
  }
  
  
  /* Check the type of the input argument */
  
  if ( !mxIsStruct( WS_IN ) ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c: input argument 'ws' must be a structure." );
  }
    
  /* ---------------------------------------------------------------------- */
  /* ( 2 ) Get the various dimensions of the input parameters
   * as well as a pointer access to each of them. */

  /* Mandatory input parameter */

  w_1_mx = mxGetField( WS_IN, 0, "w_1" );
  if ( w_1_mx == NULL ) { mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c: could not access mandatory parameter 'ws.w_1'." ); }
  if ( mxGetClassID( w_1_mx ) != mxDOUBLE_CLASS ) {  mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c: 'ws.w_1' must be an array of doubles." ); }
  
  w_1_ptr = mxGetPr( w_1_mx );
  
#if VERBOSE == 1
  print_double_array( w_1_ptr, mxGetNumberOfElements( w_1_mx ), "w_1" );
#endif

  /* Allocate memory for the output, initialize it with the input (deep copy) */
  
  WS_OUT = mxDuplicateArray( WS_IN );
  
  /* Default parameters */
  
  /*    sigma   */
  
  sprintf( c_string, "sigma" );
  sigma_mx = mxGetField( WS_OUT, 0, c_string );
  if ( sigma_mx == NULL ) { 
    
    sigma_mx = mxCreateDoubleScalar( 1e-5 );
    mxAddField( WS_OUT, c_string );
    mxSetField( WS_OUT, 0, c_string, sigma_mx );
    
  } else if ( mxGetClassID( sigma_mx ) != mxDOUBLE_CLASS ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c: 'ws.sigma' must be a double." );
  }
  sigma = mxGetScalar( sigma_mx );
    
  
  /*   lambda_1   */

  sprintf( c_string, "lambda_1" );
  lambda_1_mx = mxGetField( WS_OUT, 0, c_string );
  if ( lambda_1_mx == NULL ) { 
    
    lambda_1_mx = mxCreateDoubleScalar( 1e-7 );
    mxAddField( WS_OUT, c_string );
    mxSetField( WS_OUT, 0, c_string, lambda_1_mx );
    
  } else if ( mxGetClassID( lambda_1_mx ) != mxDOUBLE_CLASS ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c: 'ws.lambda_1' must be a double." );
  }
  lambda_1 = mxGetScalar( lambda_1_mx );
  
  
  /*   min_iter   */

  sprintf( c_string, "min_iter" );
  min_iter_mx = mxGetField( WS_OUT, 0, c_string );
  if ( min_iter_mx == NULL ) { 
    
    min_iter_mx = mxCreateDoubleScalar( -1 );
    mxAddField( WS_OUT, c_string );
    mxSetField( WS_OUT, 0, c_string, min_iter_mx );
    
  } else if ( mxGetClassID( min_iter_mx ) != mxDOUBLE_CLASS ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c: 'ws.min_iter' must be a double." );
  }
  min_iter = mxGetScalar( min_iter_mx );
  
  /*   max_iter   */

  sprintf( c_string, "max_iter" );
  max_iter_mx = mxGetField( WS_OUT, 0, c_string );
  if ( max_iter_mx == NULL ) { 
    
    max_iter_mx = mxCreateDoubleScalar( mxGetInf() );
    mxAddField( WS_OUT, c_string );
    mxSetField( WS_OUT, 0, c_string, max_iter_mx );
    
  } else if ( mxGetClassID( max_iter_mx ) != mxDOUBLE_CLASS ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c: 'ws.max_iter' must be a double." );
  }
  max_iter = mxGetScalar( max_iter_mx );
  

  /*   max_iter_nosuccess   */

  sprintf( c_string, "max_iter_nosuccess" );
  max_iter_nosuccess_mx = mxGetField( WS_OUT, 0, c_string );
  if ( max_iter_nosuccess_mx == NULL ) { 
    
    max_iter_nosuccess_mx = mxCreateDoubleScalar( mxGetInf() );
    mxAddField( WS_OUT, c_string );
    mxSetField( WS_OUT, 0, c_string, max_iter_nosuccess_mx );
    
  } else if ( mxGetClassID( max_iter_nosuccess_mx ) != mxDOUBLE_CLASS ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c: 'ws.max_iter_nosuccess' must be a double." );
  }
  max_iter_nosuccess = mxGetScalar( max_iter_nosuccess_mx );
  

  /*   verbose   */

  sprintf( c_string, "verbose" );
  verbose_mx = mxGetField( WS_OUT, 0, c_string );
  if ( verbose_mx == NULL ) { 
    
    verbose_mx = mxCreateDoubleScalar( 0 );
    mxAddField( WS_OUT, c_string );
    mxSetField( WS_OUT, 0, c_string, verbose_mx );
    
  } else if ( mxGetClassID( verbose_mx ) != mxDOUBLE_CLASS ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c: 'ws.verbose' must be a double." );
  }
  verbose = mxGetScalar( verbose_mx );
  
  /*   cv_thr   */

  sprintf( c_string, "cv_thr" );
  cv_thr_mx = mxGetField( WS_OUT, 0, c_string );
  if ( cv_thr_mx == NULL ) { 
    
    cv_thr_mx = mxCreateDoubleScalar( 1e-10 );
    mxAddField( WS_OUT, c_string );
    mxSetField( WS_OUT, 0, c_string, cv_thr_mx );
    
  } else if ( mxGetClassID( cv_thr_mx ) != mxDOUBLE_CLASS ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c: 'ws.cv_thr' must be a double." );
  }
  cv_thr = mxGetScalar( cv_thr_mx );



  /* Dimensionality */
  N = mxGetNumberOfElements( w_1_mx );
  
  
  if ( verbose ) {
    printf( "onearray_run_scg_fast_logsph_c.c: dimensionality N:%d\n", N );
  }

  
  /*    WS_OUT.w_kp1   (returned value)   */
  
  sprintf( c_string, "w_kp1" );
  w_kp1_mx = mxGetField( WS_OUT, 0, c_string );
  if ( w_kp1_mx != NULL ) {
    mxFree( w_kp1_mx );
  } else {
    mxAddField( WS_OUT, "w_kp1" );
  }
  w_kp1_mx  = mxCreateDoubleMatrix( N, 1, mxREAL );
  mxSetField( WS_OUT, 0, "w_kp1", w_kp1_mx );

  w_kp1_ptr = mxGetPr( w_kp1_mx );
  
  /* ---------------------------------------------------------------------- */
  /* ( 3 ) Initialize the Scaled Conjugate Gradient (SCG) descent. */

  /* Note that here we differ a bit from "run_scg_fast.m":
   * to make things lighter, we won't define additional fields within "ws". 
   * -> lighter and faster code. */

  /* Allocate some memory for gradient and cost,
   * as well as the various vectors. */
  
  N_sizeof_double = N * sizeof( double );
  
  w_k_ptr = ( double* ) malloc( N_sizeof_double );
  
  w_k_sigma_ptr = ( double* ) malloc( N_sizeof_double );
  w_k_alpha_ptr = ( double* ) malloc( N_sizeof_double );
  
  s_k_ptr = ( double* ) malloc( N_sizeof_double );
  
  E_w_k_ptr = ( double* ) malloc( sizeof( double ) );
  E_w_k_alpha_ptr = ( double* ) malloc( sizeof( double ) );
  E_w_kp1_ptr = ( double* ) malloc( sizeof( double ) );
  
  dE_w_k_ptr = ( double* ) malloc( N_sizeof_double );
 
  dE_w_k_sigma_ptr = ( double* ) malloc( N_sizeof_double );
  dE_w_k_alpha_ptr = ( double* ) malloc( N_sizeof_double );
  dE_w_kp1_ptr = ( double* ) malloc( N_sizeof_double );
  
  r_k_ptr = ( double* ) malloc( N_sizeof_double );
  p_k_ptr = ( double* ) malloc( N_sizeof_double );
  
  r_kp1_ptr = ( double* ) malloc( N_sizeof_double );
  p_kp1_ptr = ( double* ) malloc( N_sizeof_double );
  
  /* Step "1." in the article */

  /* iteration count */
  k = 1;
  
  /* Starting point in dimension N */
  memcpy( w_k_ptr, w_1_ptr, N_sizeof_double );
  
  lambda_k     = lambda_1;
  lambda_bar_k = 0;
  
  /* Evaluate gradient and cost */
  /* Also force initialization */
  dE_E_fun( w_k_ptr, N, dE_w_k_ptr, E_w_k_ptr, 1, 1 );


  for ( n = 0; n < N; n++ ) {
    p_k_ptr[ n ] = -dE_w_k_ptr[ n ];
  }
  
  memcpy( r_k_ptr, p_k_ptr, N_sizeof_double );
  
  success = 1;
  iter_nosuccess = 0;

  while( 1 ) {

    if ( verbose ) {
      printf( "\n[          onearray_run_scg_fast_logsph_c.c: starting iteration #%d.          ]\n", k );
    }

#if VERBOSE == 1
  print_double_array( w_k_ptr, N, "w_k" );
  print_double_array( w_kp1_ptr, N, "w_kp1" );
#endif

    /* Step "2." in the article. */
    if ( success ) {
      
      norm_p_k_square = 0;
      for ( n = 0; n < N; n++ ) {
	aa = p_k_ptr[ n ];
	norm_p_k_square += aa * aa;
      }
      
      norm_p_k = sqrt( norm_p_k_square );
      
      sigma_k = sigma / norm_p_k;
      
      for ( n = 0; n < N; n++ ) {
	w_k_sigma_ptr[ n ] = w_k_ptr[ n ] + sigma_k * p_k_ptr[ n ];
      }
      
      /* Only the gradient is needed here */
      dE_E_fun( w_k_sigma_ptr, N, dE_w_k_sigma_ptr, NULL, 0, 0 );
      
      delta_k = 0;
      for ( n = 0; n < N; n++ ) {
	aa = ( dE_w_k_sigma_ptr[ n ] - dE_w_k_ptr[ n ] ) / sigma_k;
	s_k_ptr[ n ] = aa;
	delta_k += p_k_ptr[ n ] * aa;
	
      }
      
    } /* end of if success */

    /* Step "3." in the article */
    delta_k = delta_k + ( lambda_k - lambda_bar_k ) * norm_p_k_square;

    /* Step "4." in the article */
    if ( delta_k <= 0 ) {
      
      lambda_bar_k = 2 * ( lambda_k - delta_k / norm_p_k_square );
      delta_k      = - delta_k + lambda_k * norm_p_k_square;
      lambda_k     = lambda_bar_k;
      
    }

    /* Step "5." in the article */
    mu_k = 0;
    for ( n = 0; n < N; n++ ) {
      mu_k += p_k_ptr[ n ] * r_k_ptr[ n ];
    }
    alpha_k = mu_k / delta_k;

    /* Step "6." in the article */
    
    /* Tentative w_k_alpha_ptr (will be kept as "w_kp1" only if success in step 7.) */
    
    for ( n = 0; n < N; n++ ) {
      w_k_alpha_ptr[ n ] = w_k_ptr[ n ] + alpha_k * p_k_ptr[ n ];
    }

    /* Evaluate both cost and gradient */
    dE_E_fun( w_k_alpha_ptr, N, dE_w_k_alpha_ptr, E_w_k_alpha_ptr, 1, 0 );
    

    DELTA_k = 2 * delta_k * ( *E_w_k_ptr - *E_w_k_alpha_ptr ) / ( mu_k * mu_k );

    
    /* Step "7." in the article */
    if ( DELTA_k >= 0 ) {
      
      /* success */

      if ( verbose ) {
	printf( "onearray_run_scg_fast_logsph_c.c: success at iter %d\n", k );
      }
      
      memcpy( w_kp1_ptr, w_k_alpha_ptr, N_sizeof_double );

      *E_w_kp1_ptr   = *E_w_k_alpha_ptr;

      memcpy( dE_w_kp1_ptr, dE_w_k_alpha_ptr, N_sizeof_double );
      
      for ( n = 0; n < N; n++ ) {
	r_kp1_ptr[ n ] = -dE_w_kp1_ptr[ n ];
      }
      
      lambda_bar_k = 0;

      success = 1;
      iter_nosuccess = 0;
      
      /* Restart or continue */
      
      if ( k % N == 0 ) {
	
	if ( verbose ) {
	  printf( "onearray_run_scg_fast_logsph_c.c: restarting the algorithm at iter %d\n", k );
	}
	
	memcpy( p_kp1_ptr, r_kp1_ptr, N_sizeof_double );
	
      } else {

	beta_k = 0;
	for( n = 0; n < N; n++ ) {
	  aa = r_kp1_ptr[ n ];
	  beta_k += aa * ( aa - r_k_ptr[ n ] );
	}
	beta_k /= mu_k;

	for( n = 0; n < N; n++ ) {
	  p_kp1_ptr[ n ] = r_kp1_ptr[ n ] + beta_k * p_k_ptr[ n ];
	}
	
      }

      /* Reduce the scale parameter if necessary */

      if ( DELTA_k >= 0.75 ) {
	
	if ( verbose ) {
	  printf( "onearray_run_scg_fast_logsph_c.c: reducing the scale parameter at iter %d\n", k );
	}
	
	lambda_k = lambda_k / 4;

      }

    } else {

      /* Failure */

      if ( verbose ) {
	printf( "onearray_run_scg_fast_logsph_c.c: failure at iter %d\n", k );
      }
      
      lambda_bar_k = lambda_k;

      success = 0;
      iter_nosuccess++;

    }  /* if DELTA_k >= 0 */



    /* Step "8." in the algorithm
     * Increase the scale parameter if necessary */
    
    if ( DELTA_k < 0.25 ) {

      if ( verbose ) { 
	printf( "onearray_run_scg_fast_logsph_c.c: increasing the scale parameter at iter %d\n", k );
      }
      
      lambda_k = lambda_k + delta_k * ( 1 - DELTA_k ) / norm_p_k_square;

    }


    if ( verbose ) {
      printf( "s051102_run_scg_fast_c: finishing iteration #%d. About to do the termination test.\n", k );
    }


    /* Step "9." in the article */
    /* Termination test. */
    
    aa = abs( r_k_ptr[ 0 ] );
    for ( n = 1; n < N; n++ ) {
      aa = MAX( aa, fabs( r_k_ptr[ n ] ) );
    }
    has_converged = ( aa < cv_thr );

    if ( verbose ) {
      printf( "s051102_run_scg_fast_c:  aa:%g  cv_thr:%g  has_converged:%d\n\n", aa, cv_thr, has_converged );
    }
    
    do_continue = ( !has_converged ) && ( k < max_iter ) && ( iter_nosuccess < max_iter_nosuccess );
    
    do_continue = do_continue || ( k < min_iter );


    if ( do_continue ) {

      k++;
      
      if ( success ) {

	memcpy( w_k_ptr,      w_kp1_ptr,    N_sizeof_double );
	*E_w_k_ptr = *E_w_kp1_ptr;
	memcpy( dE_w_k_ptr,   dE_w_kp1_ptr, N_sizeof_double );
	memcpy( r_k_ptr,      r_kp1_ptr,    N_sizeof_double );
	memcpy( p_k_ptr,      p_kp1_ptr,    N_sizeof_double );
		
      }

    } else {

      if ( verbose ) {
	if ( has_converged ) {
	  printf( "onearray_run_scg_fast_logsph_c.c: finished converging at iter %d\n", k );
	} else {
	  printf( "onearray_run_scg_fast_logsph_c.c: stopped at iter %d, before convergence.\n", k );
	}
      }
      
      break;
    }

  } /* while( 1 ) */

  /* ---------------------------------------------------------------------- */
  /* ( end ) Free memory */
  
  if ( verbose ) {
    printf( "onearray_run_scg_fast_logsph_c.c: about to free allocated memory.\n" );
  }
    
  free( c_string );

  free( w_k_ptr );  
  free( w_k_sigma_ptr );
  free( w_k_alpha_ptr );
  free( s_k_ptr );
    
  free( E_w_k_ptr );
  free( E_w_k_alpha_ptr );
  free( E_w_kp1_ptr );

  free( dE_w_k_ptr );
  free( dE_w_k_sigma_ptr );
  free( dE_w_k_alpha_ptr );
  free( dE_w_kp1_ptr );
    
  free( r_k_ptr );
  free( p_k_ptr );
  free( r_kp1_ptr );
  free( p_kp1_ptr );

  /* includes "destroy" call to dE_E_fun() */
  dE_E_fun( NULL, 0, NULL, NULL, 0, -1 );
  
  if ( verbose ) {
    printf( "onearray_run_scg_fast_logsph_c.c: done.\n" );
  }
  
}

  

/* ====================================================================== *
 * function "dE_E_fun()": Computation of the gradient and the cost.
 *
 * We assume that the outputs dE_ptr and E_ptr are already allocated,
 * with respective sizes equal to w_k_numel and 1.
 *
 * 3 cases:
 * 1) if  force_init_or_destroy == 0:  Default behavior, (re)init only at first call or when dimensionality increases.
 * 2) if  force_init_or_destroy == 1:  Force initialization, useful when changing some parameters.
 * 3) if  force_init_or_destroy == -1: clear allocated memory (static variables).
 * ====================================================================== */

void dE_E_fun( const double* logsph_ptr, const unsigned int logsph_numel, double* dE_ptr, double* E_ptr, const unsigned short compute_E, short force_init_or_destroy ) {
  
  static unsigned short is_initialized = 0;

  mxArray          *fs_in, *c_in, *geometry_in, *pairs_in, *w_i_in, *theta_in, *weight_in;
  static double           weight_sum;
  
  static double       fs_val, c_val;
  static double       *geometry_ptr, *xyz_ptr, *w_i_ptr, *theta_ptr, *weight_ptr;

  static unsigned int     memory_logsph_numel;
  static unsigned int     nPairs, nChannels, nLocations, nDimensions, nFreq;
  static double        *D_ptr;
  static double           *t_ptr, *d1_ptr, *d2_ptr, *dxyz1_ptr, *dxyz2_ptr;
  static unsigned short   *pairs_ptr;
  
  /* These two are used to simplify pair indexing */
  static int  * tmp_int_array;
  static int  * tmp_int_array2;

  short   do_create_pairs_ptr;  /* 0 or 1 */
  
  double dE_x, dE_y, dE_z;

  
  
  /* global variable containing all parameters given to us by the user */
  
  const mxArray  *dE_E_par;
  
  /* crappy temporary variables */

  static char *          c_string;
  
  double *        doublePtr;  
  unsigned int    a,b,c,d,e, i,j,k, m,n,o,p,q, t, u, v, nPairs_nLocations;
  unsigned short  do_clear;
  double          aa,bb,cc,dd,ee, rr, ss,tt,uu,vv, xx, yy, zz;

  /* --- the work starts here --- */

  /* Check that flags are within set of defined values */

  if ( ( compute_E != 0 ) && ( compute_E != 1 ) ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): compute_E must be 0 or 1.\n" );
  }

  if ( ( force_init_or_destroy < -1 ) || (  force_init_or_destroy > 1 ) ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): force_init_or_destroy must be -1 (destroy), 0 (normal) or 1 (force init).\n" );
  }

 
  if ( is_initialized ) {
    
    /* In case the user wants to increase the dimensionality, we
     * have to redo the initialization. */
    
    if ( memory_logsph_numel < logsph_numel ) {
      force_init_or_destroy = 1;
      fprintf( stderr, "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): WARNING! Dimensionality has increased. Forcing reinitialization.\n" );
    }
    
  }


 
  /* There are three reasons why we would like to clear the memory:
   *
   * 1)  because the user is asking for destruction ( force_init_or_destroy == -1 ).
   *
   * 2)  because the user is asking for initialization: if already allocated,
   *     the memory must be cleared first.
   *     ( is_initialized && ( force_init_or_destroy == 1 ) )
   *
   * 3)  because the user is asking for dimensionality increase (similar to 2.).
   */
  
  do_clear = ( force_init_or_destroy == -1 ) || ( is_initialized && ( force_init_or_destroy == 1 ) );

  
  if ( do_clear ) {

    /* ( 1 ) Destruction */
    
    if ( !is_initialized ) {
      mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): philosophical issue: cannot destroy something that does not exist.\n" );
    }

#if VERBOSE == 1    
    printf( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): starting destruction.\n" );
#endif
    free( c_string );


    free( pairs_ptr );

    free( t_ptr );
    free( d1_ptr );
    free( d2_ptr );

    free( dxyz1_ptr );
    free( dxyz2_ptr );
    free( D_ptr );
    free( tmp_int_array );
    free( tmp_int_array2 );
    

    
#if VERBOSE == 1
    printf( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): destruction done.\n" );
#endif

    /* Not initialized anymore */

    is_initialized = 0;
    
    if ( force_init_or_destroy == -1 ) {
      /* In the destruction case, we are done. */
      return;
    }

  }
  
  
  /* Get access to the global variable "dE_E_par" */
  /* we do it every time because it might change */

  dE_E_par = ( mxArray * ) mexGetVariablePtr( "global", "dE_E_par" );
  if ( dE_E_par == NULL ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): could not get a pointer to the global variable 'dE_E_par'." );
  }
  
  if ( !mxIsStruct( dE_E_par ) ) {
      mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): the global variable 'dE_E_par' must be a structure." );
  }



  
  if ( ( !is_initialized ) || ( force_init_or_destroy == 1 ) ) {

#if VERBOSE == 1
      printf( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): starting intialization.\n" );
#endif
      
    c_string = ( char* ) malloc( 2000 * sizeof( char ) );

    
    /* ( 3 ) Initialization: retrieve parameters from "global" MATLAB
     * workspace ( done only the first time we are called ) */

    
    /* Get access to various fields of "dE_E_par" 
    * Get the various dimensions.
    * Define the pairs of microphones if not done yet. */

  
    geometry_in  = mxGetField( dE_E_par, 0, "geometry" );
    if ( geometry_in == NULL ) { mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): could not access 'dE_E_par.geometry'." );  }
    if ( mxGetClassID( geometry_in ) != mxDOUBLE_CLASS ) { 
      mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): 'dE_E_par.geometry' must be an array of doubles." );
    }

#if VERBOSE == 1
    print_double_array( mxGetPr( geometry_in ), mxGetNumberOfElements( geometry_in ), "geometry" );
#endif

    pairs_in     = mxGetField( dE_E_par, 0, "pairs" );
    if ( pairs_in == NULL ) { mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): could not access 'dE_E_par.pairs'." );  }  
    if ( mxGetClassID( pairs_in ) != mxUINT16_CLASS ) { 
      mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): 'dE_E_par.pairs' must be an array of uint16." );
    }    


#if VERBOSE == 1
    print_double_array( mxGetPr( pairs_in ), mxGetNumberOfElements( pairs_in ), "pairs" );
#endif

    fs_in        = mxGetField( dE_E_par, 0, "fs" ); 
    if ( fs_in == NULL ) { mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): could not access 'dE_E_par.fs'." );  }  
    if ( mxGetClassID( fs_in ) != mxDOUBLE_CLASS ) { 
      mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): 'dE_E_par.fs' must be a double." );
    }    


#if VERBOSE == 1
    print_double_array( mxGetPr( fs_in ), mxGetNumberOfElements( fs_in ), "fs" );
#endif

    
    c_in         = mxGetField( dE_E_par, 0, "c" ); 
    if ( c_in == NULL ) { mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): could not access 'dE_E_par.c'." );  }  
    if ( mxGetClassID( c_in ) != mxDOUBLE_CLASS ) { 
      mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): 'dE_E_par.c' must be a double." );
    }
    
#if VERBOSE == 1
    print_double_array( mxGetPr( c_in ), mxGetNumberOfElements( c_in ), "c" );
#endif

    nPairs      = mxGetM( pairs_in );
    nDimensions = mxGetM( geometry_in );
    nChannels   = mxGetN( geometry_in );
    
    /* Get access to input data/parameters */
    geometry_ptr  = ( double * ) mxGetData( geometry_in );
    
    fs_val = mxGetScalar( fs_in );
    c_val  = mxGetScalar( c_in );



    do_create_pairs_ptr = ( nPairs < 1 );
    
    /* List of the pairs of microphones: 2 cases */
    if ( do_create_pairs_ptr ) {
      
      /* Create a default list of pairs */
      nPairs = nChannels * ( nChannels - 1 ) / 2;
      
      pairs_ptr = ( unsigned short* ) malloc( nPairs * 2 * sizeof( unsigned short ) );
      
      c = 0;  /* Pair index */
      
      for ( a = 1; a <= nChannels; a++ ) {
	
	for ( b = a+1; b <= nChannels; b++ ) {
	  
	  pairs_ptr[ c ] =  a;
	  pairs_ptr[ c + nPairs ] =  b;
	  
	  c++;
	  
	}
	
      }
      
    } else {
      
      a =  nPairs * 2 * sizeof( unsigned short ) ;
      pairs_ptr = ( unsigned short * ) malloc( a );
      memcpy( pairs_ptr, mxGetData( pairs_in ), a );
      
    }
    
#if VERBOSE == 1
    print_unsigned_short_array( pairs_ptr, 2 * nPairs, "pairs_ptr" );
#endif


    w_i_in      = mxGetField( dE_E_par, 0, "w_i" );
    if ( w_i_in == NULL ) { mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): could not access 'dE_E_par.w_i'." );  }  
    if ( mxGetClassID( w_i_in ) != mxDOUBLE_CLASS ) { 
      mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): 'dE_E_par.w_i' must be an array of double." );
    }    

    w_i_ptr     = mxGetPr( w_i_in );
    

#if VERBOSE == 1
    print_double_array( w_i_ptr, mxGetNumberOfElements( w_i_in ), "w_i_ptr" );
#endif


    nFreq       = mxGetNumberOfElements( w_i_in );
    
    theta_in    = mxGetField( dE_E_par, 0, "theta" );
    if ( theta_in == NULL ) { mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): could not access 'dE_E_par.theta'." );  }  
    if ( mxGetClassID( theta_in ) != mxDOUBLE_CLASS ) { 
      mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): 'dE_E_par.theta' must be an array of double." );
    }

    /* Classical error: passing the complex (wrong) instead of its argument (correct). */
    if ( mxIsComplex( theta_in ) ) {
      mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): 'dE_E_par.theta' must be real, not complex. Maybe you passed me Z instead of angle(Z)." );
    }
    

    if ( ( mxGetM( theta_in ) != nFreq ) || ( mxGetN( theta_in ) != nPairs ) ) {
      mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): inconsistent dimensions ('dE_E_par.theta' must be nFreq by nPairs)." );
    }
    
    theta_ptr   = mxGetPr( theta_in );

#if VERBOSE == 1
    print_double_array( theta_ptr, mxGetNumberOfElements( theta_in ), "theta_ptr" );
#endif

    


  
    /* To detect possible dimensionality increase 
     * next time this dE_E_fun() is called. */
    memory_logsph_numel = logsph_numel;
     
    nLocations = logsph_numel / nDimensions;
   
    /* where to store the time delays */
    t_ptr = ( double* ) malloc( nPairs * nLocations * sizeof( double ) );
    
    d1_ptr = ( double* ) malloc( nPairs * nLocations * sizeof( double ) );
    d2_ptr = ( double* ) malloc( nPairs * nLocations * sizeof( double ) );
    
    dxyz1_ptr = ( double* ) malloc( nPairs * nLocations * nDimensions * sizeof( double ) );
    dxyz2_ptr = ( double* ) malloc( nPairs * nLocations * nDimensions * sizeof( double ) );

    D_ptr = ( double* ) malloc( nFreq * nPairs * nLocations * sizeof( double ) );

    
    /* saving a bit of time */
    tmp_int_array  = ( int* ) malloc( nPairs * sizeof( int ) );
    tmp_int_array2 = ( int* ) malloc( nPairs * sizeof( int ) );
    for ( a = 0; a < nPairs ; a++ ) {
      tmp_int_array[ a ]  = ( pairs_ptr[ a          ] - 1 ) * nDimensions;
      tmp_int_array2[ a ] = ( pairs_ptr[ a + nPairs ] - 1 ) * nDimensions;
    }
    
    
    /* Mark that the initialization is finished, so that we would not do it again */
    /* ( unless the user sets "force_initialization" to 1 in the next call to us ) */
    is_initialized = 1;
    
#if VERBOSE == 1
    printf( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): intialization done.\n" );
#endif
    
  }



  
  /* ---------------------------------------------------------------------- */
  /* ( 4 ) Check dimensionality
   *       Normalize weights.
   *       Convert coordinates from logspherical to Euclidean. */
  
  if ( logsph_numel % nDimensions != 0 ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): inconsistent dimensions ('logsph' and 'dE_E_par.geometry')." );
  }
  
  /* In future versions of the SCG descent, the weights might change over time
   * ( as with "collapse()" in "run_scg_fast.m" )
   */

  weight_in   = mxGetField( dE_E_par, 0, "weight" );
  if ( weight_in == NULL ) { mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): could not access 'dE_E_par.weight'." );  }
  if ( mxGetClassID( weight_in ) != mxDOUBLE_CLASS ) { 
      mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): 'dE_E_par.weight' must be an array of double." );
  }
  
  weight_ptr  = mxGetPr( weight_in );
  
  if ( mxGetNumberOfElements( weight_in ) != nLocations ) {
    mexErrMsgTxt( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): inconsistent dimensions ('dE_E_par.weight' must have nLocations elements)." );
  }

  


  /* Sum the location weights */
  
  weight_sum = 0;
  for ( a = 0; a < nLocations; ) {
    weight_sum += weight_ptr[ a++ ];
  }
  
  /* Convert logspherical coordinates ( azimuth, elevation, log( radius ) ) 
   * contained by "logsph_ptr"
   * into Euclidean coordinates, stored into "xyz_ptr". */
  
  xyz_ptr = ( double* ) malloc( logsph_numel * sizeof( double ) );
  
  for ( c = 0; c < logsph_numel; ) {
    
    d = c;

    aa = logsph_ptr[ c++ ];  /* azimuth */
    bb = logsph_ptr[ c++ ];  /* elevation */

    /* This is the difference with classical spherical coordinates */
    rr = exp( logsph_ptr[ c++ ] );  /* radius, note the "exp" */
    
    xyz_ptr[ d++ ] = rr * cos( bb ) * cos( aa );  /* x */
    xyz_ptr[ d++ ] = rr * cos( bb ) * sin( aa );  /* y */
    xyz_ptr[ d++ ] = rr * sin( bb );              /* z */
    
  }
  
  

  /* ---------------------------------------------------------------------- */
  /* ( 5 ) Compute dxyz and time-delays */

  nPairs_nLocations = nPairs * nLocations;

  memset( d1_ptr, 0, nPairs_nLocations * sizeof( double ) );
  memset( d2_ptr, 0, nPairs_nLocations * sizeof( double ) );
  




  /* m: element index in dxyz1 and dxyz2 arrays */
  m = 0;
  
  for( c = 0; c < nDimensions; c++ ) {
    
    /* i: element index in d1 and d2 arrays */
    i = 0;
    
    for( b = 0; b < nLocations; b++ ) {
      
      d = b * nDimensions;

      for ( a = 0; a < nPairs; a++ ) {
	
	aa = xyz_ptr[ c + d  ] - geometry_ptr[ c + tmp_int_array[ a ]  ];
	bb = xyz_ptr[ c + d  ] - geometry_ptr[ c + tmp_int_array2[ a ] ];
	
	dxyz1_ptr[ m ] = aa; 
	dxyz2_ptr[ m ] = bb; 
	
	/* The next two lines are okay since
	 * both 'd1' and 'd2' were initialized with zeroes. */
	d1_ptr[ i ] += aa * aa;
	d2_ptr[ i ] += bb * bb;
	
	m++;
	i++;
	
      }  /* for a */

    } /* for b */
  
  } /* for c */
  




  /* Finish computing the Euclidean distances + time-delays */

  cc = fs_val / c_val;
  
  /* i: element index in d1, d2 and t arrays */
  i = 0;
  
  for ( i = 0; i < nPairs_nLocations; i++ ) {
      
    aa = sqrt( d1_ptr[ i ] );
    bb = sqrt( d2_ptr[ i ] );
    
    d1_ptr[ i ] = aa;
    d2_ptr[ i ] = bb;
    
    t_ptr[ i ] = ( bb - aa ) * cc;
    
  }


  /* ---------------------------------------------------------------------- */
  /* ( 6 ) Compute the gradient dE:
   * we do it in Euclidean coordinates, then move back to logspherical coordinates */


  /* Difference between measured and theoretical phase */
  /* ( this is also used in the cost computation ). */

  a = 0;  /* index in 'D_ptr' */
  c = 0;  /* index in 't_ptr' */

  for ( n = 0; n < nLocations; n++ ) {
    
    b = 0; /* index in 'theta_ptr' */

    for ( p = 0; p < nPairs; p++ ) {
    
      /* Get the next time-delay in the 't_ptr' matrix */
      tt = t_ptr[ c++ ];
      
      for ( i = 0; i < nFreq; i++ ) {
	
	D_ptr[ a++ ] = theta_ptr[ b++ ] - w_i_ptr[ i ] * tt;
	
      }
      
    }
    
  }
  
  /* Now compute each component of the gradient dE */
  
  aa = - fs_val / ( 2 * c_val * nPairs * nFreq * weight_sum );
  
  /* index in 'xyz_ptr' */
  a = 0; /* x */
  b = 1; /* y */
  c = 2; /* z */

  t = 0;   /* index in 'd1' and 'd2', as well as x in 'dxyz1' and 'dxyz2' */
  u =     nPairs_nLocations;       /* index of y in 'dxyz1' and 'dxyz2' */
  v = 2 * nPairs_nLocations;       /* index of z in 'dxyz1' and 'dxyz2' */

  /* index in D */
  d = 0;

  for( k = 0; k < nLocations; k++ ) {
    
    /* Where we'll temporarily store the gradient
     * in Euclidean coordinates, for location #k. */

    dE_x = 0;
    dE_y = 0;
    dE_z = 0;

    for( p = 0; p < nPairs; p++ ) {
      
      /* Compute the factor in sine, which is common to x,y,z */
      
      cc = 0;
      for ( i = 0; i < nFreq; i++ ) {
	
	cc += w_i_ptr[ i ] * sin( D_ptr[ d++ ] );
	
      }
      
      /* Update the sum over pairs, for each coordinate x,y,z */
      
      dd = cc / d2_ptr[ t ];
      ee = cc / d1_ptr[ t ];
      
      dE_x += dxyz2_ptr[ t ] * dd - dxyz1_ptr[ t ] * ee;        /* x */
      dE_y += dxyz2_ptr[ u ] * dd - dxyz1_ptr[ u ] * ee;        /* y */
      dE_z += dxyz2_ptr[ v ] * dd - dxyz1_ptr[ v ] * ee;        /* z */
      
      /* move to next element in 'd1', 'd2', 'dxyz1', 'dxyz2' */
      t++;
      u++;
      v++;
      
    } /* for p */

    /* Scaling factor */
    
    bb = aa * weight_ptr[ k ];

    dE_x *= bb;
    dE_y *= bb;
    dE_z *= bb;

    /* Finally translate back to logspherical coordinates */
    
    xx = xyz_ptr[ a ]; /* x */
    yy = xyz_ptr[ b ]; /* y */
    zz = xyz_ptr[ c ]; /* z */

    tt = logsph_ptr[ a ]; /* azimuth    */
    uu = logsph_ptr[ b ]; /* elevation  */
    vv = logsph_ptr[ c ]; /* log radius */
    
 
    dE_ptr[ a ] = -yy               * dE_x + xx        * dE_y;
    dE_ptr[ b ] = -zz * ( cos( tt ) * dE_x + sin( tt ) * dE_y ) + vv * cos( uu ) * dE_z;

    /* Note: compared to classical spherical coordinates, */
    /* only this line differs */
    /* It is the definition of ( dE / ds ) where s = log( radius ) */
    dE_ptr[ c ] = xx                * dE_x + yy        * dE_y     + zz           * dE_z;
    

    /* Move to the next location in 'xyz_ptr', 'logsph_ptr' and 'dE_ptr' */
    a = ++c;
    b = ++c;
    c++;
    
  } /* for k */
  

  
  
  /* ---------------------------------------------------------------------- */
  /* ( 7 ) If relevant, compute the cost E */

  if ( compute_E ) {
    
    /* Where we'll store the partial sum */
    aa = 0;
    
    /* Index in 'D_ptr' */
    a = 0;
    
    for ( n = 0; n < nLocations; n++ ) {

      bb = 0;

      q = nPairs * nFreq;
      for ( p = 0; p < q; p++ ) {
	
	bb += cos( D_ptr[ a++ ] );

      }

      /* scale with the weight of this location,
       * and update the partial sum. */
      aa += bb * weight_ptr[ n ];
      
    }
    
    
    *E_ptr = 0.5 * ( 1 - aa / ( nFreq * nPairs * weight_sum ) );

  }


  /* ---------------------------------------------------------------------- */
  /* ( 8 ) Free some memory */

  free( xyz_ptr );

#if VERBOSE == 1
  printf( "\n" );
  print_double_array( dE_ptr, memory_logsph_numel, "dE_E_fun(end): dE_ptr" );

  if ( compute_E ) {
    print_double_array( E_ptr, 1, "dE_E_fun(end) E_ptr" );
  }

  printf( "onearray_run_scg_fast_logsph_c.c.dE_E_fun(): done.\n" );
#endif
  
} /* end of function "dE_E_fun()" */



