#ifndef DSVDC_CPP
#define DSVDC_CPP 1


/*
Code for singular value decomposition of a matrix. This code modified
by Dave Curtis, 1998 from the clinpack code. Original code
by Jack Dongarra, converted to ANSI C by SV Schell.

Modifications by Dave Curtis:

Use doubles instead of complex numbers.

Index all vectors/matrices from 0, all increment by 1.

Add new test for success as suggested by Allan Curtis, since
apparently the original test could fail with some optimising compilers.

Put all the necessary BLAs into this single source file.

The code has been designed to work with the C++ wrapper svd::dcmp(). I have no
idea if there are redundancies which were needed for the complex numbers but
which could now be dealt with more efficiently. I do not know if I may have lost
any original functionality which is not used by svd::dcmp() (e.g. if job!=21). The
emphasis has been on convenience (for me) not efficiency or flexibility.

*/

#ifndef NODCURTISCHANGES
#define FRACT_TOL 1e-15
#endif

#define fsign(z1,z2) (z1*(z2==0?0:z2>0?1:-1))

double fdotf( int n, double *cx, double *cy)
/*
 *  n     int number of elements to process
 *  cx    double vector, indexed from 0
 *  cy    double vector, indexed from 0
 *      forms the dot product of two vectors, conjugating the first vector.
*/
{
    int     i;
    double ctemp;

    ctemp = 0.0;
    for ( i = 0;  i < n;  i++ )
            ctemp += cy[i] * cx[i];
    return( ctemp );
}

void faxpy( int n, double ca, double *cx, double *cy)
/*
 *  n     int number of elements to process
 *  ca    double scalar
 *  cx    double vector, indexed from 0
 *  cy    double vector, indexed from 0
       add cx multiplied by ca to cy
 */
{
    int i;
    if (ca == 0.0 ) return;
    for ( i = 0;  i < n;  i++ )
            cy[i] += ca * cx[i];
}

void fscal( int n, double ca, double *cx)
/*
 *  n     int number of elements to process
 *  ca    double scalar
 *  cx    double vector, indexed from 0
 *
 *     scales a vector by a constant.
*/
{
    int     i;
    if ( ca == 1.0 ) return;
/*  DC - I am changing behaviour here. Previously it returned if cabs1(ca) was zero.
    Now if ca==1, which seems more sensible. However it does differ from
    the original. - DC
*/
    for ( i = 0;  i < n;  i++ ) cx[i] *= ca;
}

void fswap( int n, double *cx, double *cy)
/*
 *  n     int number of elements to process
 *  cx    double vector, indexed from 0
 *  cy    double vector, indexed from 0
 *    swap two vectors
 */
{
    int     i;
    double ctemp;
        for ( i = 0;  i < n;  i++ )
          { ctemp  = cx[i]; cx[i] = cy[i]; cy[i] = ctemp; }
}

void fsrot( int n, double *cx,double *cy,double c, double s )
/*
 *  n     int number of elements to process
 *  cx    double vector, indexed from 1
 *  cy    double vector, indexed from 1
 *    rotate points (cx,cy) by c,s
 */
{
    int     i;
    double ctemp;
        for ( i = 0;  i < n;  i++ ) {
            ctemp  = c * cx[i] + s * cy[i];
            cy[i] = c * cy[i] - s * cx[i];
            cx[i] = ctemp;
        }
}

void srotg( double *sa, double *sb, double *c, double *s)
/*
 *    construct given plane rotation.
 */
{
    double   roe, scale, r, z;

    roe = *sb;
    if ( fabs(*sa) > fabs(*sb) )
        roe = *sa;
    scale = fabs(*sa) + fabs(*sb);
    if ( scale ==  0.0 ) {
       *c = 1.0;
       *s = 0.0;
       r = 0.0;
    } else {
        r = scale * sqrt( (*sa * *sa + *sb * *sb) / (scale*scale) );
        if ( roe < 0.0 )
            r = -r;
        *c = *sa / r;
        *s = *sb / r;
    }
    z = 1.0;
    if ( fabs(*sa) > fabs(*sb) )
        z = *s;
    if ( (fabs(*sb) >= fabs(*sa))  &&  (*c !=  0.0) )
        z = 1.0 / *c;
    *sa = r;
    *sb = z;
}

#ifdef NODCURTISCHANGES
#if defined(apollo) || defined(sun) || defined(alliant)
#define FLT_MIN     1.1754943508222875e-38   /* min decimal value of a double */
#define FLT_MAX     3.4028234663852886e+38   /* max decimal value of a double */
#define FLT_EPSILON 1.1920928955078125e-7
#else
#include <double.h>
#endif
#else
/* I am just going to use these constants for now, even though code is now double precision */
//#define FLT_MIN     1.1754943508222875e-38   /* min decimal value of a double */
//#define FLT_MAX     3.4028234663852886e+38   /* max decimal value of a double */
//#define FLT_EPSILON 1.1920928955078125e-7
#endif

double snrm2( int n, double sx[])
/*
 *  n     int number of elements to process
 *  sx    double vector, indexed from 0
 *
 *    euclidean norm of the n-vector stored in sx() with storage
 *    increment incx .
 *    if    n .le. 0 return with result = 0.
 *    if n .ge. 1 then incx must be .ge. 1
 *
 *          c.l.lawson, 1978 jan 08
 *
 *    four phase method     using two built-in constants that are
 *    hopefully applicable to all machines.
 *        cutlo = maximum of  sqrt(u/eps)  over all known machines.
 *        cuthi = minimum of  sqrt(v)      over all known machines.
 *    where
 *        eps = smallest no. such that eps + 1. .gt. 1.
 *        u   = smallest positive no.   (underflow limit)
 *        v   = largest  no.            (overflow  limit)
 *
 *    brief outline of algorithm..
 *
 *    phase 1    scans zero components.
 *    move to phase 2 when a component is nonzero and .le. cutlo
 *    move to phase 3 when a component is .gt. cutlo
 *    move to phase 4 when a component is .ge. cuthi/m
 *    where m = n for x() real and m = 2*n for double.
 *-------------------------------------------------------------------
 * Converted to ANSI C by S.V. Schell, 1990 Sep 14
 * Notable changes:
 *    1. cutlo and cuthi are computed from constants in <double.h>
 *    2. the structure of the code is radically different
 *       (read: death to all Fortran programmers, especially the ones
 *        who use the "assign" and computed goto statements)
 *-------------------------------------------------------------------
*/
{
    int     i;
    double   cutlo, cuthi, hitest, sum, absx, xmax, stemp;


    cutlo = sqrt( FLT_MIN / FLT_EPSILON );
    cuthi = sqrt( FLT_MAX );
    hitest = cuthi / (double) n;
    sum = 0.0;
    i = 0;

    /* phase 1: scan for zero components */
    while ( (i < n)  &&  (fabs(sx[i]) == 0.0) )
        ++i;
    if ( i >= n )
        return( sum );

    /* phase 2: components in the range (0,cutlo) */
    absx = fabs(sx[i]);
    if ( absx < cutlo ) {
        sum = 1.0;
        xmax = absx;
        ++i;
        while ( (i < n)  &&  ((absx = fabs(sx[i])) < cutlo) ) {
            stemp = absx / xmax;
            sum += stemp * stemp;
            ++i;
        }
        if ( i >= n )
            return( xmax * sqrt(sum) );
        sum  *=  (xmax * xmax);
    }

    /* phase 3: components in the "nice" range [cutlo, hitest) */
    absx = fabs(sx[i]);
    if ( absx < hitest ) {
        while( (i < n)  &&  ((absx = fabs(sx[i])) < hitest) ) {
            sum += absx * absx;
            ++i;
        }
        if ( i >= n )
            return( sqrt(sum) );
    }

    /* phase 4: components greater than hitest */
    absx = fabs(sx[i]);
    sum = (sum / absx) / absx;
    xmax = absx;
    ++i;
    while ( i < n ) {
        if ( (absx = fabs(sx[i])) < xmax ) {
            stemp = absx / xmax;
            sum += stemp * stemp;
        } else {
            stemp = xmax / absx;
            sum = 1.0 + sum * (stemp * stemp);
            xmax = absx;
        }
        ++i;
    }

    return( xmax * sqrt(sum) );
}

void dsvdc( double **x, int n, int p,
            double *s, double *e, double **u, double **v,
            double *work, int job, int *info )
/* DC - Note that these matrices are in column-row rather than
   row column form, i.e. elements are indexed as x[c][r]. The C++
   wrapper therefore passes the transpose of x and accepts back
   the transpose of U and V
*/
/*
 *
 *
 *     dsvdc is a subroutine to reduce a double nxp matrix x by
 *     unitary transformations u and v to diagonal form.  the
 *     diagonal elements s[i] are the singular values of x.  the
 *     columns of u are the corresponding left singular vectors,
 *     and the columns of v the right singular vectors.
 *
 *     on entry
 *
 *         x         double[n][p], indexed from 0.
 *                   x contains the matrix whose singular value
 *                   decomposition is to be computed.
 *                   x is destroyed by dsvdc.
 *
 *         n         integer.
 *                   n is the number of rows of the matrix x.
 *
 *         p         integer.
 *                   p is the number of columns of the matrix x.
 *
 *         work      double[n], indexed from 0.
 *                   work is a scratch array.
 *
 *         job       integer.
 *                   job controls the computation of the singular
 *                   vectors.  it has the decimal expansion ab
 *                   with the following meaning
 *
 *                        a == 0    do not compute the left singular
 *                                  vectors.
 *                        a == 1    return the n left singular vectors
 *                                  in u.
 *                        a >= 2    returns the first min(n,p)
 *                                  left singular vectors in u.
 *                        b == 0    do not compute the right singular
 *                                  vectors.
 *                        b == 1    return the right singular vectors
 *                                  in v.
 *
 *     on return
 *
 *         s         double[mm], indexed from 0, where mm=min(n+1,p).
 *                   the first min(n,p) entries of s contain the
 *                   singular values of x arranged in descending
 *                   order of magnitude.
 *
 *         e         double[p], indexed from 0.
 *                   e ordinarily contains zeros.  however see the
 *                   discussion of info for exceptions.
 *
 *         u         double[n][k], indexed from 0.
 *                   if (joba == 1) then k = n
 *                   if (joba >= 1) then k = min(n,p).
 *                   u contains the matrix of left singular vectors.
 *                   u is not referenced if (joba == 0).
 *                   if ((n <= p) || (joba > 2)), then u may be
 *                   identified with x in the subroutine call.
 *
 *         v         double[p][p], indexed from 0.
 *                   v contains the matrix of right singular vectors.
 *                   v is not referenced if (jobb == 0).
 *                   if (p <= n), then v may be identified with x
 *                   in the subroutine call.
 *
 *         info      integer *.
 *                   the singular values (and their corresponding
 *                   singular vectors) s[info+1], s[info+2], ..., s[m]
 *                   are correct (here m=min(n,p)).  thus if
 *                   (info == 0), all the singular values and their
 *                   vectors are correct.  in any event, the matrix
 *                   b = ctrans(u)*x*v is the bidiagonal matrix
 *                   with the elements of s on its diagonal and the
 *                   elements of e on its super-diagonal (ctrans(u)
 *                   is the conjugate-transpose of u).  thus the
 *                   singular values of x and b are the same.
 *
 *     linpack. this version dated 03/19/79 .
 *              correction to shift calculation made 2/85.
 *     g.w. stewart, university of maryland, argonne national lab.
 *
 *-------------------------------------------------------------------
 * Converted to ANSI C by S.V. Schell, 1990 Sep 14
 *-------------------------------------------------------------------
*/
{

#define MIN(a,b)  ( (a) < (b) ? (a) : (b) )
#define MAX(a,b)  ( (a) > (b) ? (a) : (b) )

    int     i, iter, j, jobu, k, kase, kk, l, ll, lls, lm1, lp1, ls, lu,
            m,  maxit,  mm, mm1, mp1, nct, nctp1, ncu, nrt, nrtp1,
            OK, OK_main;
    double t, r, ctemp;
    double   b, c, cs, el, emm1, f, g, scale, shift, sl, sm, sn, smm1,
            t1, test, ztest;
    int     wantu, wantv;


/* set the maximum number of iterations. */

    maxit = 30;

/* determine what is to be computed. */

    wantu = 0;
    wantv = 0;
    jobu = (job % 100) / 10;
    ncu = n;
    if ( jobu > 1 )
        ncu = MIN( n, p );
    if ( jobu != 0 )
        wantu = 1;
    if ( (job % 10) != 0 )
        wantv = 1;

/*
 * reduce x to bidiagonal form, storing the diagonal elements
 * in s and the super-diagonal elements in e.
*/
    *info = 0;
    nct = MIN( n-1, p );
    nrt = MAX( 0, MIN(p-2, n) );
    lu = MAX( nct, nrt );
    if (lu >= 1) {
        for ( l = 0;  l < lu;  l++ ) {
            lp1 = l + 1;
            if (l < nct) {
                /*
                 * compute the transformation for the l-th column and
                 * place the l-th diagonal in s(l).
                */
                s[l] = snrm2( n-l, &x[l][l]);
                if ( s[l]  !=  0.0 ) {
                    if ( x[l][l]  !=  0.0 )
                        s[l] = fsign(s[l], x[l][l]);
                    fscal( n-l, 1.0/s[l], &x[l][l] );
                    x[l][l] += 1.0;
                }
                s[l] = -s[l];
            }

            if (lp1<p) {
                for ( j = lp1;  j < p;  j++ ) {
                    if (l < nct) {
                        if (s[l] != 0.0) {

                            /* apply the transformation. */
                            t = fdotf(n-l, &x[l][l], &x[j][l]);
                            faxpy( n-l, -t/x[l][l], &x[l][l], &x[j][l]);
                        }
                    }

                    /*
                    * place the l-th row of x into  e for the
                    * subsequent calculation of the row transformation.
                    */
                    e[j] = x[j][l];
                }
            }

            if ( wantu  &&  l < nct ) {
                /*
                * place the transformation in u for
                * subsequent back multiplication.
                */

                for ( i = l;  i < n;  i++ )
                    u[l][i] = x[l][i];
            }

            if ( l < nrt ) {
                /*
                 * compute the l-th row transformation and place the
                 *l-th super-diagonal in e(l).
                */
                e[l] = snrm2( p-l-1, &e[lp1]);
                if (e[l] != 0.0) {
                    if ( e[lp1] != 0.0 )
                        e[l] = fsign(e[l], e[lp1]);
                    fscal( p-l-1, 1/e[l], &e[lp1] );
                    e[lp1] += 1.0;
                }
                e[l] = -e[l];

                if ( (lp1 < n)  &&  (e[l] != 0.0) ) {

                    /* apply the transformation. */
                    for ( i = lp1;  i < n;  i++ )
                        work[i] = 0.0;
                    for ( j = lp1;  j < p;  j++ )
                        faxpy( n-l-1, e[j], &x[j][lp1], &work[lp1]);
                    for ( j = lp1;  j < p;  j++ ) {
                        faxpy( n-l-1, -e[j]/e[lp1], &work[lp1], &x[j][lp1]);                    }
                }

                if ( wantv ) {

                    /*
                     * place the transformation in v for subsequent
                     * back multiplication.
                    */
                    for ( i = lp1;  i < p;  i++ )
                        v[l][i] = e[i];
                }
            }
        }
    }

    /* set up the final bidiagonal matrix or order m. */

    m = MIN( p, n+1 );
    nctp1 = nct + 1;
    nrtp1 = nrt + 1;
    if ( nct < p )
        s[nctp1-1] = x[nctp1-1][nctp1-1];
    if ( n < m )
        s[m-1] = 0.0;
    if ( nrtp1 < m )
        e[nrtp1-1] = x[m-1][nrtp1-1];
    e[m-1] = 0.0;

    /* if required, generate u. */

    if ( wantu ) {
        if ( ncu >= nctp1) {
            for ( j = nctp1-1;  j < ncu;  j++ ) {
                for ( i = 0;  i < n;  i++ )
                    u[j][i] = 0.0;
                u[j][j] = 1.0;
            }
        }
        if ( nct >= 1 ) {
            for ( ll = 1;  ll <= nct;  ll++ ) {
                l = nct - ll;
                if (s[l] != 0.0) {
                    lp1 = l + 1;
                    if (lp1<ncu) {
                        for ( j = lp1;  j < ncu;  j++ ) {
                            t = fdotf(n-l, &u[l][l], &u[j][l]);
                            faxpy(n-l, -t/u[l][l], &u[l][l], &u[j][l]);
                        }
                    }
                    fscal(n-l, -1.0, &u[l][l]);
                    u[l][l] += 1.0;
                    lm1 = l - 1;
                    if (lm1 >= 0) {
                        for ( i = 0;  i <= lm1;  i++ )
                            u[l][i] = 0.0;
                    }
                } else {
                    for ( i = 0;  i < n;  i++ )
                        u[l][i] = 0.0;
                    u[l][l] = 1.0;
                }
            }
        }
    }

    /* if it is required, generate v. */

    if ( wantv ) {
        for ( ll = 1;  ll <= p;  ll++ ) {
            l = p - ll + 1-1;
            lp1 = l + 1;
            if ( l < nrt ) {
                if (e[l] != 0.0) {
                    for ( j = lp1;  j < p;  j++ ) {
                        t = fdotf(p-l-1, &v[l][lp1], &v[j][lp1]);
                        faxpy(p-l-1, -t/v[l][lp1], &v[l][lp1], &v[j][lp1]);
                    }
                }
            }
            for ( i = 0;  i < p;  i++ )
                v[l][i] = 0.0;
            v[l][l] = 1.0;
        }
    }

    /* main iteration loop for the singular values. */

    OK_main = 1;
    mm = m;
    iter = 0;
    while ( (m != 0)  &&  OK_main ) {
        /* quit if all the singular values have been found. */

        /* if too many iterations have been performed, set flag and return. */

        if (iter >= maxit) {
            *info = m;
            OK_main = 0;
        } else {
            /*
             * this section of the program inspects for
             * negligible elements in the s and e arrays.  on
             * completion the variables kase and l are set as follows.
             *
             * kase = 1     if s(m) and e(l-1) are negligible and l.lt.m
             * kase = 2     if s(l) is negligible and l.lt.m
             * kase = 3     if e(l-1) is negligible, l.lt.m, and
             *              s(l), ..., s(m) are not negligible (qr step).
             * kase = 4     if e(m-1) is negligible (convergence).
            */

            OK = 1;
            for ( ll = 1;  (ll <= m) && OK;  ll++ ) {
                l = m - ll-1;
                if ( (OK = (l >= 0)) !=0 ) {
                    test = fabs(s[l]) + fabs(s[l+1]);
                    ztest = test + fabs(e[l]);
#ifdef NODCURTISCHANGES
                    if ( !(OK = (ztest != test)) )
#else
                    if ( (OK = (ztest-test>test*FRACT_TOL)) ==0)
/*
new test suggested by my father, Allan Curtis, as apparently
original test could fail with optimising compilers
*/
#endif
                        e[l] = 0.0;
                }
            }

            if (l == m - 1-1)
                kase = 4;
            else {
                lp1 = l + 1;
                mp1 = m + 1;
                OK = 1;
                for ( lls = lp1+1;  (lls <= mp1) && OK;  lls++ ) {
                    ls = m - lls + lp1;
                    if ( (OK = (ls != l)) !=0) {
                        test = 0.0;
                        if (ls < m)
                            test += fabs(e[ls]);
                        if (ls != l + 1)
                            test += fabs(e[ls-1]);
                        ztest = test + fabs(s[ls]);
#ifdef NODCURTISCHANGES
                        if ( !(OK = (ztest != test)) )
#else
                        if ( (OK = (ztest-test>test*FRACT_TOL)) ==0)
/*
new test suggested by my father, Allan Curtis, as apparently
original test could fail with optimising compilers
*/
#endif
                            s[ls] = 0.0;
                    }
                }

                if (ls == l)
                    kase = 3;
                else if (ls == m-1)
                    kase = 1;
                else {
                    kase = 2;
                    l = ls;
                }
            }
            l++;


            /* perform the task indicated by kase. */

            switch ( kase ) {

                /* deflate negligible s(m). */
                case 1:
                    mm1 = m - 1;
                    f = e[m-1-1];
                    e[m-1-1] = 0;
                    for ( kk = l+1;  kk <= mm1;  kk++ ) {
                        k = mm1 - kk + l;
                        t1 = s[k];
                        srotg( &t1, &f, &cs, &sn );
                        s[k] = t1;
                        if ( k != l ) {
                            f = -sn * e[k-1];
                            e[k-1] *= cs;
                        }
                        if (wantv)
                            fsrot(p, v[k], v[m-1], cs, sn);
                    }
                    break;

                /* split at negligible s(l). */
                case 2:
                    f = e[l-1];
                    e[l-1] = 0.0;
                    for ( k = l;  k < m;  k++ ) {
                        t1 = s[k];
                        srotg(&t1, &f, &cs, &sn);
                        s[k] = t1;
                        f = -sn * e[k];
                        e[k] *= cs;
                        if (wantu)
                            fsrot(n, u[k], u[l-1], cs, sn);
                    }
                    break;

                /* perform one qr step. */
                case 3:
                    /* calculate the shift. */
                    scale = MAX( fabs(s[m-1]), fabs(s[m-1-1]) );
                    scale = MAX( scale, fabs(e[m-1-1]) );
                    scale = MAX( scale, fabs(s[l]) );
                    scale = MAX( scale, fabs(e[l]) );
                    sm = s[m-1] / scale;
                    smm1 = s[m-1-1] / scale;
                    emm1 = e[m-1-1] / scale;
                    sl = s[l] / scale;
                    el = e[l] / scale;
                    b = ((smm1 + sm) * (smm1 - sm) + emm1 * emm1) * 0.5;
                    c = sm * emm1;
                    c = c * c;
                    shift = 0.0;
                    if ( (b != 0.0)  ||  (c != 0.0) ) {
                        shift = sqrt(b * b + c);
                        if ( b < 0.0 )
                            shift = -shift;
                        shift = c / (b + shift);
                    }
                    f = (sl + sm) * (sl - sm) + shift;
                    g = sl * el;

                    /* chase zeros. */
                    mm1 = m - 1;
                    for ( k = l;  k< mm1;  k++ ) {
                        srotg(&f, &g, &cs, &sn);
                        if (k != l) {
                            e[k-1] = f;
                        }
                        f = cs*s[k] + sn*e[k];
                        e[k] = cs*e[k] - sn*s[k];
                        g = sn * s[k+1];
                        s[k+1] *= cs;
                        if (wantv)
                            fsrot(p, v[k], v[k+1], cs, sn);
                        srotg(&f, &g, &cs, &sn);
                        s[k] = f;
                        f = cs * e[k] + sn * s[k+1];
                        s[k+1] = -sn * e[k] + cs * s[k+1];
                        g = sn * e[k+1];
                        e[k+1] *= cs;
                        if (wantu  &&  (k < n))
                            fsrot(n, u[k], u[k+1], cs, sn);
                    }
                    e[m-1-1] = f;
                    iter++;
                    break;


                /* convergence. */
                case 4:
                    /* make the singular value  positive */
                    if (s[l] < 0.0) {
                        s[l] = -s[l];
                        if (wantv) {
                            fscal(p, -1.0, v[l]);
                        }
                    }

                    /* order the singular value. */

                    while ( (l < mm-1)  &&  (s[l] < s[l+1]) ) {
                        t = s[l];
                        s[l] = s[l+1];
                        s[l+1] = t;
                        if (wantv  &&  (l < p-1))
                            fswap(p, v[l], v[l+1]);
                        if (wantu  &&  (l < n-1))
                            fswap(n, u[l], u[l+1]);
                        l++;
                    }
                    iter = 0;
                    m--;
                    break;

                default:
                    *info = -1;
                    return;
                    break;

            }
        }
    }
}


#endif
