/*

 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.

 *

 * NOTICE TO USER:  

 *

 * This source code is subject to NVIDIA ownership rights under U.S. and

 * international Copyright laws.  Users and possessors of this source code

 * are hereby granted a nonexclusive, royalty-free license to use this code

 * in individual and commercial software.

 *

 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE

 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

 * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH

 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF

 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.

 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,

 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS

 * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE

 * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE

 * OR PERFORMANCE OF THIS SOURCE CODE. 

 *

 * U.S. Government End Users.   This source code is a "commercial item" as

 * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of

 * "commercial computer  software"  and "commercial computer software

 * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)

 * and is provided to the U.S. Government only as a commercial end item. 

 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through

 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the

 * source code with only those rights set forth herein.

 *

 * Any use of this source code in individual and commercial software must

 * include, in the user documentation and internal comments to the code,

 * the above Disclaimer and U.S. Government End Users Notice.

 */

 

#if !defined(CU_COMPLEX_H_)

#define CU_COMPLEX_H_

 

#if defined(__cplusplus)

extern "C" {

#endif /* __cplusplus */

 

#include <math.h>       /* import fabsf, sqrt */

#include "vector_types.h"

 

 

/* versions for hosts without native support for 'complex' */

 

#if (!defined(__CUDACC__) && defined(CU_USE_NATIVE_COMPLEX))

#include <complex.h>

 

/* wrapper functions around C99 native complex support. NOTE: Untested! */

 

/* -- Single Precision -- */

typedef complex cuFloatComplex;

__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x) {

    return crealf(x);

}

 

__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x) {

    return cimagf(x);

}

 

__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex

                                                             (float x, float y)

{

    return x + I * y;

}

 

__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)

{

    return conjf (x);

}

 

__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,

                                                              cuFloatComplex y)

{

    return x + y;

}

 

__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,

                                                              cuFloatComplex y)

{

    return x - y;

}

 

__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,

                                                              cuFloatComplex y)

{

    return x * y;

}

 

__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,

                                                              cuFloatComplex y)

{

    return x / y;

}

 

__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)

{

    return cabsf (x);

}

 

/* -- Double Precision -- */

typedef double complex cuDoubleComplex;

__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)

{

    return creal(x);

}

 

__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)

{

    return cimag(x);

}

 

__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex

                                                           (double x, double y)

{

    return x + I * y;

}

 

__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)

{

    return conj (x);

}

 

__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,

                                                             cuDoubleComplex y)

{

    return x + y;

}

 

__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,

                                                             cuDoubleComplex y)

{

    return x - y;

}

 

__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,

                                                             cuDoubleComplex y)

{

    return x * y;

}

 

__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,

                                                             cuDoubleComplex y)

{

    return x / y;

}

 

__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)

{

    return cabs (x);

}

 

 

       

/* versions for target or hosts without native support for 'complex' */

 

#else /* (defined(__CUDACC__) || (!(defined(CU_USE_NATIVE_COMPLEX)))) */

 

typedef float2 cuFloatComplex;

 

__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)

{

    return x.x;

}

 

__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)

{

    return x.y;

}

 

__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex

                                                             (float r, float i)

{

    cuFloatComplex res;

    res.x = r;

    res.y = i;

    return res;

}

 

__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)

{

    return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));

}

__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,

                                                              cuFloatComplex y)

{

    return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),

                                cuCimagf(x) + cuCimagf(y));

}

 

__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,

                                                              cuFloatComplex y)

{

        return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),

                                    cuCimagf(x) - cuCimagf(y));

}

 

/* This implementation could suffer from intermediate overflow even though

 * the final result would be in range. However, various implementations do

 * not guard against this (presumably to avoid losing performance), so we

 * don't do it either to stay competitive.

 */

__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,

                                                              cuFloatComplex y)

{

    cuFloatComplex prod;

    prod = make_cuFloatComplex  ((cuCrealf(x) * cuCrealf(y)) -

                                 (cuCimagf(x) * cuCimagf(y)),

                                 (cuCrealf(x) * cuCimagf(y)) +

                                 (cuCimagf(x) * cuCrealf(y)));

    return prod;

}

 

/* This implementation guards against intermediate underflow and overflow

 * by scaling. Such guarded implementations are usually the default for

 * complex library implementations, with some also offering an unguarded,

 * faster version.

 */

__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,

                                                              cuFloatComplex y)

{

    cuFloatComplex quot;

    float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));

    float oos = 1.0f / s;

    float ars = cuCrealf(x) * oos;

    float ais = cuCimagf(x) * oos;

    float brs = cuCrealf(y) * oos;

    float bis = cuCimagf(y) * oos;

    s = (brs * brs) + (bis * bis);

    oos = 1.0f / s;

    quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,

                                ((ais * brs) - (ars * bis)) * oos);

    return quot;

}

 

/*

 * We would like to call hypotf(), but it's not available on all platforms.

 * This discrete implementation guards against intermediate underflow and

 * overflow by scaling. Otherwise we would lose half the exponent range.

 * There are various ways of doing guarded computation. For now chose the

 * simplest and fastest solution, however this may suffer from inaccuracies

 * if sqrt and division are not IEEE compliant.

 */

__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)

{

    float a = cuCrealf(x);

    float b = cuCimagf(x);

    float v, w, t;

    a = fabsf(a);

    b = fabsf(b);

    if (a > b) {

        v = a;

        w = b;

    } else {

        v = b;

        w = a;

    }

    t = w / v;

    t = 1.0f + t * t;

    t = v * sqrtf(t);

    if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {

        t = v + w;

    }

    return t;

}

 

/* Double precision */

typedef double2 cuDoubleComplex;

 

__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)

{

    return x.x;

}

 

__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)

{

    return x.y;

}

 

__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex

                                                           (double r, double i)

{

    cuDoubleComplex res;

    res.x = r;

    res.y = i;

    return res;

}

 

__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)

{

    return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));

}

 

__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,

                                                             cuDoubleComplex y)

{

    return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),

                                 cuCimag(x) + cuCimag(y));

}

 

__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,

                                                             cuDoubleComplex y)

{

    return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),

                                 cuCimag(x) - cuCimag(y));

}

 

/* This implementation could suffer from intermediate overflow even though

 * the final result would be in range. However, various implementations do

 * not guard against this (presumably to avoid losing performance), so we

 * don't do it either to stay competitive.

 */

__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,

                                                             cuDoubleComplex y)

{

    cuDoubleComplex prod;

    prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -

                                 (cuCimag(x) * cuCimag(y)),

                                 (cuCreal(x) * cuCimag(y)) +

                                 (cuCimag(x) * cuCreal(y)));

    return prod;

}

 

/* This implementation guards against intermediate underflow and overflow

 * by scaling. Such guarded implementations are usually the default for

 * complex library implementations, with some also offering an unguarded,

 * faster version.

 */

__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,

                                                             cuDoubleComplex y)

{

    cuDoubleComplex quot;

    double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));

    double oos = 1.0 / s;

    double ars = cuCreal(x) * oos;

    double ais = cuCimag(x) * oos;

    double brs = cuCreal(y) * oos;

    double bis = cuCimag(y) * oos;

    s = (brs * brs) + (bis * bis);

    oos = 1.0 / s;

    quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,

                                 ((ais * brs) - (ars * bis)) * oos);

    return quot;

}

 

/* This implementation guards against intermediate underflow and overflow

 * by scaling. Otherwise we would lose half the exponent range. There are

 * various ways of doing guarded computation. For now chose the simplest

 * and fastest solution, however this may suffer from inaccuracies if sqrt

 * and division are not IEEE compliant.

 */

__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)

{

    double a = cuCreal(x);

    double b = cuCimag(x);

    double v, w, t;

    a = fabs(a);

    b = fabs(b);

    if (a > b) {

        v = a;

        w = b;

    } else {

        v = b;

        w = a;

    }

    t = w / v;

    t = 1.0 + t * t;

    t = v * sqrt(t);

    if ((v == 0.0) ||

        (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {

        t = v + w;

    }

    return t;

}

 

#endif /* (!defined(__CUDACC__) && defined(CU_USE_NATIVE_COMPLEX))) */

 

#if defined(__cplusplus)

}

#endif /* __cplusplus */

 

/* aliases */

typedef cuFloatComplex cuComplex;

__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,

                                                                float y)

{

    return make_cuFloatComplex (x, y);

}

 

/* float-to-double promotion */

__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble

                                                      (cuFloatComplex c)

{

    return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));

}

 

__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat

(cuDoubleComplex c)

{

        return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));

}

 

 

__host__ __device__ static __inline__  cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)

{

    float real_res;

    float imag_res;

   

    real_res = (cuCrealf(x) *  cuCrealf(y)) + cuCrealf(d);

    imag_res = (cuCrealf(x) *  cuCimagf(y)) + cuCimagf(d);

           

    real_res = -(cuCimagf(x) * cuCimagf(y))  + real_res; 

    imag_res =  (cuCimagf(x) *  cuCrealf(y)) + imag_res;         

    

    return make_cuComplex(real_res, imag_res);

}

 

__host__ __device__ static __inline__  cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)

{

    double real_res;

    double imag_res;

   

    real_res = (cuCreal(x) *  cuCreal(y)) + cuCreal(d);

    imag_res = (cuCreal(x) *  cuCimag(y)) + cuCimag(d);

           

    real_res = -(cuCimag(x) * cuCimag(y))  + real_res; 

    imag_res =  (cuCimag(x) *  cuCreal(y)) + imag_res;    

    

    return make_cuDoubleComplex(real_res, imag_res);

}

 

#endif /* !defined(CU_COMPLEX_H_) */