/***************************************************************************
 *   Copyright (C) 2010 by María del Mar Abad Grau   *
 *   mabad@ugr.es   *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/


#include "../FachadeGenoma.cpp"


using namespace BIOS;


/**
        @memo It computes the (Spearman) correlation between transcriptome and genome

	@doc
        Definition: for each gene expression variable and each SNP, it computes the correlation, across individuals, between the transcription level (gene expression) and the genotype
				Input: A file with the list of SNPs to be used and another file with the normalized transcription levels for a list of individuals (columns) and a list of gene expression variables (rows)
				Output: A list of correlation values in descending order, together with other information besides the correlation, such as the p value, the gene expression code, the p value obtained using permutations, etc.




        @author María M. Abad-Grau
				@version 1.0
*/

/*___________________________________________________________________________________________________________________*/

/**
 * Classifies the individuals named in the expression file according to
 * whether they can be located in the phenotype sample of genomaSample.
 *
 * @param individualList  codes of the individuals to look up
 * @param genomaSample    sample whose phenotypeSample is searched for every code
 * @param inputFormat     1: the code is searched with findIndividualByCode2();
 *                        any other value: findIndividualByCode() is used
 * @return a newly allocated pair. Its first component receives the positions,
 *         within individualList, of the codes for which getPosition() gave -1;
 *         its second component receives, in list order, the positions
 *         returned for all the other codes.
 * @throws BasicException rethrown after this function has been appended to its message
 */
HeteroPair<intSet*, intList*>* getRemovedPos ( stringList* individualList, GenomaSample* genomaSample, int inputFormat )
{
	try
	{
		intSet* removedFromIndividualList=new intSet();
		intList* usedFromGenoma=new intList();
		int position;
		for ( stringList::iterator it2=individualList->begin(); it2<individualList->end(); it2++ )
		{
			// The C string is handed over inside the lookup expression itself, so it
			// can never outlive the string object it points into (the previous code
			// kept the pointer in a local, which dangles if getElement() returns by value).
			if ( inputFormat==1 )
				position=genomaSample->phenotypeSample->getPosition ( genomaSample->phenotypeSample->findIndividualByCode2 ( individualList->getElement ( it2 ).c_str() ) );
			else
				position=genomaSample->phenotypeSample->getPosition ( genomaSample->phenotypeSample->findIndividualByCode ( individualList->getElement ( it2 ).c_str() ) );

			// -1 is presumably the "not found" answer of getPosition() -- TODO confirm
			if ( position==-1 )
				removedFromIndividualList->insertElement ( individualList->getPosition ( it2 ) );
			else
				usedFromGenoma->insertElement ( position );
		}
		HeteroPair<intSet*, intList*>* result=new HeteroPair<intSet*, intList*> ( removedFromIndividualList, usedFromGenoma );
		// NOTE(review): the two containers are released right after building the pair,
		// as the original code did; this relies on HeteroPair keeping its own copies.
		zap ( removedFromIndividualList );
		zap ( usedFromGenoma );
		return result;
	}
	catch ( BasicException& be ) {be.addMessage ( "\ncalled from void transcriptomeAssociation::getRemovedPos()" ); throw;}
}
/*___________________________________________________________________________________*/


/**
 * Builds one ranking list per SNP by applying Spearman::setRankingList() to
 * each entry of arrayOfGenotypeList.
 *
 * @param arrayOfGenotypeList  array with one list per SNP; no entry may be NULL
 * @param size                 number of entries of arrayOfGenotypeList
 * @return a newly allocated array of size entries holding the ranking lists
 * @throws NullValue if an entry of arrayOfGenotypeList is NULL
 */
doubleList** getArrayOfSecondRankList ( doubleList**  arrayOfGenotypeList, int size )
{
	try
	{
		doubleList** rankLists=new doubleList*[size];
		int snp=0;
		while ( snp<size )
		{
			const int processed=snp+1;
			// progress report every 1000 SNPs
			if ( processed%1000==0 )
				cout <<"snp " << processed <<" out of " << size  << " has the rank list computed\n";
			doubleList* const genotypes=arrayOfGenotypeList[snp];
			if ( NULL==genotypes )
				throw NullValue ( "doubleList** getArrayOfSecondRankList (doubleList**  arrayOfGenotypeList, int size)" );
			rankLists[snp]=Spearman::setRankingList ( genotypes );
			snp=processed;
		}
		return rankLists;
	}
	catch ( BasicException& be ) {be.addMessage ( "\ncalled from doubleList** getArrayOfSecondRankList (doubleList**  arrayOfGenotypeList, int size)" ); throw;}
}
/*___________________________________________________________________________________*/


/**
 * Collects, for every SNP, the value returned by Spearman::ties() for its
 * rank list.
 *
 * @param arrayOfRankList  array with one rank list per SNP; no entry may be NULL
 * @param size             number of entries of arrayOfRankList
 * @return a newly allocated list with one value per SNP, in array order
 * @throws NullValue if an entry of arrayOfRankList is NULL
 */
intList* getListOfTies ( doubleList**  arrayOfRankList, int size )
{
	try
	{
		intList* tiesPerSnp=new intList();
		for ( int snpNumber=1; snpNumber<=size; ++snpNumber )
		{
			doubleList* const ranks=arrayOfRankList[snpNumber-1];
			// progress report every 1000 SNPs
			if ( snpNumber%1000==0 )
				cout <<"snp " << snpNumber <<" out of " << size  << " has the ties computed\n";
			if ( ranks==NULL )
				throw NullValue ( "boolList* getListOfTies (doubleList**  arrayOfRankList, int size)" );
			tiesPerSnp->insertElement ( Spearman::ties ( ranks ) );
		}
		return tiesPerSnp;
	}
	catch ( BasicException& be ) {be.addMessage ( "\ncalled from boolList* getListOfTies (doubleList**  arrayOfRankList, int size)" ); throw;}
}

/*___________________________________________________________________________________*/


/**
 * Flags, for every SNP, whether its list contains at least one NaN value.
 *
 * @param arrayOfRankList  array with one list per SNP; no entry may be NULL
 * @param size             number of entries of arrayOfRankList
 * @return a newly allocated list with one element per SNP, in array order:
 *         1 when isNAN() holds for some element of the SNP list, 0 otherwise
 * @throws NullValue if an entry of arrayOfRankList is NULL
 */
intList* getListOfMissingVars ( doubleList**  arrayOfRankList, int size )
{
	try
	{
		intList* result=new intList();
		for ( int i=0; i<size; i++ )
		{
			// progress report every 1000 SNPs
			if ( ( i+1 ) %1000==0 )
				cout <<"snp " << i +1   <<" out of " << size  << " has missing computed\n";
			if ( arrayOfRankList[i]==NULL )
				throw NullValue ( "intList* getListOfMissingVars (doubleList**  arrayOfRankList, int size)" );
			// the scan stops at the first NaN: one is enough to flag the SNP
			bool hasMissing=false;
			for ( int j=0; !hasMissing && j<arrayOfRankList[i]->size(); j++ )
				if ( isNAN ( arrayOfRankList[i]->getElement ( j ) ) )
					hasMissing=true;
			result->insertElement ( hasMissing );
		}
		return result;
	}
	// the messages used to name getListOfTies (copy-paste slip), which made traces misleading
	catch ( BasicException& be ) {be.addMessage ( "\ncalled from intList* getListOfMissingVars (doubleList**  arrayOfRankList, int size)" ); throw;}
}


/*___________________________________________________________________________________*/

/**
 * Encodes the genotypes of the selected individuals as numbers, one list per SNP.
 *
 * For every SNP of rsList a reference allele is chosen: entry [0] of allAlleles
 * in referenceGenomaSample when the SNP name is found in referenceRSList,
 * otherwise entry [0] of allAlleles in genomaSample. Each individual then gets:
 *   NaN  if the genotype is missing,
 *   0.0  if isHomozygous1() holds for the reference allele,
 *   1.0  if the genotype is heterozygous,
 *   2.0  otherwise.
 *
 * @param usedPositions          positions, within genomaSample->genotypeSample, of the individuals to encode
 * @param individualList         not used by this function (kept for interface compatibility)
 * @param rsList                 names of the SNPs to encode, in the SNP order of genomaSample
 * @param referenceRSList        names of the SNPs of the reference sample
 * @param referenceGenomaSample  sample supplying the reference allele when the SNP is known to it
 * @param genomaSample           sample supplying the genotypes
 * @return a newly allocated array with rsList->size() entries, each a newly
 *         allocated list with one value per element of usedPositions
 * @throws OutOfBounds if a list does not end up with usedPositions->size() values
 */
doubleList** getListOfGenotypeValues ( intList* usedPositions, stringList* individualList, stringList* rsList, stringList* referenceRSList, GenomaSample* referenceGenomaSample, GenomaSample* genomaSample )
{
	try
	{
		doubleList** result=new doubleList*[rsList->size() ];
		allele al1;
		int j=0;
		doubleList* genotypeValues;
		stringList::iterator refIt;
		for ( stringList::iterator it=rsList->begin(); it<rsList->end(); it++ )
		{
			// looked up once per SNP instead of once per use inside the individual loop
			const int snpPosition=rsList->getPosition ( it );
			if ( ( j+1 ) %1000==0 )
				cout <<"snp " << snpPosition+1 <<": " << rsList->getElement ( it )  <<" out of " << rsList->size() << " has been read\n";
			genotypeValues=new doubleList();
			refIt=referenceRSList->findElement ( rsList->getElement ( it ) );
			if ( refIt==referenceRSList->end() )
				// SNP unknown to the reference sample: fall back to the sample's own allele
				al1=genomaSample->genotypeSample->allAlleles[snpPosition][0];
			else
				al1=referenceGenomaSample->genotypeSample->allAlleles[referenceRSList->getPosition ( refIt ) ][0];
			for ( intList::iterator it2=usedPositions->begin(); it2!=usedPositions->end(); it2++ )
			{
				if ( genomaSample->genotypeSample->getElement ( *it2 )->isMissing ( snpPosition ) )
					genotypeValues->insertElement ( std::numeric_limits<double>::quiet_NaN() );
				else if ( genomaSample->genotypeSample->getElement ( *it2 )->isHomozygous1 ( snpPosition, al1 ) )
					genotypeValues->insertElement ( 0.0 );
				else if ( genomaSample->genotypeSample->getElement ( *it2 )->isHeterozygous ( snpPosition ) )
					genotypeValues->insertElement ( 1.0 );
				else
					genotypeValues->insertElement ( 2.0 );
			}
			if ( genotypeValues->size() !=usedPositions->size() )
				throw OutOfBounds ( genotypeValues->size(), usedPositions->size() );
			result[j]=genotypeValues;
			j++;
		}
		return result;
	}
	catch ( BasicException& be ) {be.addMessage ( "\ncalled from void transcriptomeAssociation::getListOfGenotypeValues()" ); throw;}
}

/*-------------------------------------------------------------------------*/

/**
 * Program entry point.
 *
 * Command line (see the usage text below):
 *   argv[1]  file with gene expression profiles (first line: heading fields followed by individual codes)
 *   argv[2]  gou/ped file with the genotypes
 *   argv[3]  reference gou/ped file, used to choose the same reference allele
 *   argv[4]  name of the output file
 *   argv[5]  optional, number of heading fields (default 1)
 *   argv[6]  optional, number of permutations (default 0)
 *   argv[7]  optional, input format 0, 1 or 2 (default 0)
 *
 * For every expression line and every SNP one row is written to the output
 * file with the heading fields, the chromosome (formats 0 and 2 only), the
 * SNP name, the Spearman measure, its p value, -log10 of it, the p value of
 * the permutation-enabled computation and -log10 of it. The null array of
 * every expression line goes to "<output>.null.csv" and notes about p values
 * that could not be computed go to "<output>.log".
 *
 * @return 0 in every case that reaches the end of main; usage and argument
 *         errors terminate through exit(-1)
 */
int main ( int argc, char*argv[] )
{
	try
	{
		try
		{

			if ( argc<5 )
			{
				cerr<<"\nOnly " << argc << " arguments were introduced.";
				cerr << "\nError: you have to specify the following information:" << endl;
				cerr  << argv[0] << "<file with gene expression profiles> <gou/ped file> <reference gou/ped file to use same major allele><name to be used for the output file> <number of heading fields (default is 1)> <number of permutations to be used (0: no permutation test, default)><0:makeped (PLINK) format with individual ID read from the second column, 1: extended makeped format with individual ID read from the seventh column, 2: vcf? format with only one column for phenotype (first column, individual ID) (default is 0)>\n";

				exit ( -1 );
			}
			srand ( 1000 ); // fixed seed, as in the original code

			// The derived names get room for the suffix or extension appended to them
			// (outputFileName2/3 used to be 256 bytes although a 1024-byte name was copied into them).
			char inputFileName1[1024], gouFile[1024], referenceGouFile[1024], rsFile[1040], referenceRSFile[1040], outputFileName[1024]="\0", outputFileName2[1040]="\0", outputFileName3[1040]="\0", ext[4];

			// The four file names are copied with strcpy, so their length is checked first.
			for ( int arg=1; arg<=4; arg++ )
				if ( strlen ( argv[arg] ) >=sizeof ( outputFileName ) )
				{
					cerr << "\nError: argument " << arg << " is too long (at most " << sizeof ( outputFileName ) -1 << " characters are allowed)\n";
					exit ( -1 );
				}

			stringList *rsList=NULL, *referenceRSList=NULL, *chromosomes=NULL, *referenceChromosomes=NULL;
			stringSample *positions=NULL, *referencePositions=NULL;
			strcpy ( inputFileName1, argv[1] );
			strcpy ( gouFile, argv[2] );
			strcpy ( referenceGouFile, argv[3] );
			strcpy ( outputFileName, argv[4] );
			strcpy ( outputFileName2, outputFileName );
			strcat ( outputFileName2, ".null.csv" );
			strcpy ( outputFileName3, outputFileName );
			strcat ( outputFileName3, ".log" );


			int headings=1;
			int permutations=0;
			int inputFormat=0;
			if ( argc>=6 )
				headings=atoi ( argv[5] );

			if ( argc>=7 )
				permutations=atoi ( argv[6] );

			if ( argc>=8 )
				inputFormat=atoi ( argv[7] );

			stringList* row;

			stringList* individualList;

			// extension of the file with the SNP list, which depends on the input format
			switch ( inputFormat )
			{
				case 0: strcpy ( ext,"map\0" ); break;
				case 1: strcpy ( ext, "rs\0" ); break;
				case 2: strcpy ( ext, "pos\0" ); break;
				default:
					// any other value used to leave ext and the SNP lists uninitialised
					cerr << "\nError: unknown input format " << inputFormat << " (valid values are 0, 1 and 2)\n";
					exit ( -1 );
			}

			changeExtension ( gouFile, rsFile, ext );
			changeExtension ( referenceGouFile, referenceRSFile, ext );

			switch ( inputFormat )
			{
				case 1: rsList=new stringList ( rsFile ); referenceRSList=new stringList ( referenceRSFile ); break;
				case 0:
				case 2: positions=new stringSample ( rsFile ); referencePositions=new stringSample ( referenceRSFile );
					chromosomes=positions->getColumn ( 0 ); referenceChromosomes=referencePositions->getColumn ( 0 );
					rsList=positions->getColumn ( 1 ); referenceRSList=referencePositions->getColumn ( 1 );
					// These two statements used to sit after the break and were never executed.
					// NOTE(review): assumes getColumn() returns lists that do not depend on the sample -- confirm
					zap ( positions ); zap ( referencePositions );
					break;
			}



			GenomaSample* genomaSample=new GenomaSample ( gouFile, NULL, 0, parent, MajorFirst, inputFormat ), *referenceGenomaSample=new GenomaSample ( referenceGouFile, NULL, 0, parent, MajorFirst, inputFormat );
			BivariateCountsAlongSecondVar* counts=NULL;
			HeteroPair<intSet*, intList*>* listsOfRemovedPositions;
			doubleList* transcriptionLevels, *currentTranscriptionLevels;
			doubleList** arrayOfSecondRankList=NULL;
			doubleList** arrayOfGenotypeValueList=NULL;
			intList* listOfSecondTies=NULL, *hasMissing=NULL;
			intList* usedPositions;
			intSet*removePos;
			int totalLines;
			TextFile* tf=new TextFile ( inputFileName1 );
			totalLines=tf->getTotalLines();
			individualList=tf->readLine(); // first line: heading fields followed by the individual codes

			OpenOutput ( outputFileName, &OutputFile );
			OpenOutput ( outputFileName2, &OutputFile2 );
			OpenOutput ( outputFileName3, &OutputFile3 );

			// The heading fields become the first column titles and are dropped from
			// the list, which afterwards contains the individual codes only.
			for ( int i=0; i<headings;i++ )
			{
				OutputFile << individualList->getElement ( 0 ) << "\t";
				individualList->removeNode ( 0 );
			}
			if ( inputFormat==1 )
				OutputFile <<"rs number\tSpearmanCorr\trawPVal\t-log10(rawPVal)\t" << permutations << "permutationsAdjustedPVal\t-log10(adjustedPVal)\n";
			else
				OutputFile <<"chromosome\trs number\tSpearmanCorr\trawPVal\t-log10(rawPVal)\t" << permutations << "permutationsAdjustedPVal\t-log10(adjustedPVal)\n";

			row=tf->readLine(); // first expression line
			double measure, pVal;
			Spearman*spearman;
			int line=0;

			listsOfRemovedPositions=getRemovedPos ( individualList, genomaSample, inputFormat );
			removePos=listsOfRemovedPositions->First;
			usedPositions=listsOfRemovedPositions->Second;

			if ( usedPositions->size() ==0 )
			{
				cout << "There are no individuals in common between genotypes and transcriptome files\n";
				exit ( 0 );
			}

			cout << "individual lis is: now " << *usedPositions <<"\n";
			arrayOfGenotypeValueList=getListOfGenotypeValues ( usedPositions, individualList, rsList,  referenceRSList, referenceGenomaSample, genomaSample );
			// only the list of the first SNP is printed; with no SNP there is nothing to dereference
			if ( rsList->size() >0 )
				cout << *arrayOfGenotypeValueList <<"\n";
			arrayOfSecondRankList=getArrayOfSecondRankList ( arrayOfGenotypeValueList, rsList->size() );
			listOfSecondTies=getListOfTies ( arrayOfSecondRankList, rsList->size() );

			hasMissing=getListOfMissingVars ( arrayOfSecondRankList, rsList->size() );

			double *nullArray=NULL;

			while ( row!=NULL ) // for each expression line
			{
				transcriptionLevels=new doubleList();
				cout <<"line (transcription var) " << line+1 <<" out of " << totalLines-1 << " has been processed\n";
				for ( stringList::iterator it=row->getNode ( headings ); it<row->end(); it++ ) // reading elements after heading values
					if ( row->getElement ( it ) ==string ( "NA" ) )  transcriptionLevels->insertElement ( numeric_limits<double>::quiet_NaN( ) );
					else transcriptionLevels->insertElement ( atof ( row->getElement ( it ).c_str() ) );
				// drop the values of the individuals that were not found in the genotype sample
				currentTranscriptionLevels=transcriptionLevels->clone();
				currentTranscriptionLevels->removeElementsWithPositionsIn ( removePos );
				if ( usedPositions->size() !=currentTranscriptionLevels->size() )
				{
					cout << "There are " << currentTranscriptionLevels->size() << " transcription levels for gene " << line+1 << " while there are " << usedPositions->size() << " individuals in the first row\nMake sure separation character is always a comma, a blank space or a tab";
					throw OutOfBounds ( usedPositions->size(), currentTranscriptionLevels->size() );
				}
				counts=new BivariateCountsAlongSecondVar ( currentTranscriptionLevels, arrayOfGenotypeValueList, permutations, rsList->size() );
				nullArray=NULL;
				for ( stringList::iterator it=rsList->begin(); it<rsList->end(); it++ )
				{
					// looked up once per SNP instead of once per use
					const int snpIndex=rsList->getPosition ( it );
					if ( snpIndex%1000==0 )
						cout << "snp " << snpIndex+1 <<" has the statistic computed\n";
					for ( int i=0; i<headings;i++ )
						OutputFile << row->getElement ( i ) << "\t";
					if ( inputFormat!=1 )
						OutputFile << chromosomes->getElement ( snpIndex ) << "\t";
					OutputFile << *it << "\t";
					spearman=new Spearman ( counts, snpIndex, arrayOfSecondRankList , listOfSecondTies, hasMissing, false, NULL ); // no permutations
					measure=spearman->getMeasure();
					OutputFile << measure <<"\t";
					pVal=spearman->getPVal();
					OutputFile << pVal <<"\t";
					if ( pVal==1.0 ) OutputFile << 0 << "\t";
					else OutputFile << -std::log10 ( pVal ) << "\t";
					if ( pVal==-1 ) OutputFile3 << "p val for snp "  << snpIndex+1 << " was not computed because there was no tie and the required algorithm AS89 is not impleented\n";
					zap ( spearman );
					// From the second SNP on, the null array taken from the previous SNP must be available.
					if ( nullArray==NULL && snpIndex>0 )
						throw NullValue ( "transcriptomeAssociation::main()" );
					// presumably the permutation-enabled computation (sixth argument true); the null array is passed along so it can be reused
					spearman=new Spearman ( counts, snpIndex, arrayOfSecondRankList, listOfSecondTies, hasMissing, true, nullArray );
					pVal=spearman->getPVal();
					OutputFile << pVal <<"\t";
					if ( pVal==1.0 ) OutputFile << 0 << "\n";
					else OutputFile << -std::log10 ( pVal ) << "\n";
					nullArray=spearman->nullArray;
					zap ( spearman );
				} // for each snp

				OutputFile2 << row->getElement ( 0 ) << "\t";
				for ( int i=0; i<permutations;i++ )
					if ( i< ( permutations-1 ) ) OutputFile2 << nullArray[i]<< "\t";
					else OutputFile2 << nullArray[i]<< "\n";
				zaparr ( nullArray );
				zap ( transcriptionLevels );
				zap ( currentTranscriptionLevels );
				zap ( row );
				zap ( counts );
				row=tf->readLine(); // next expression line
				line++;
			} // for each row

			if ( inputFormat!=1 )
			{
				zap ( chromosomes );
				zap ( referenceChromosomes );
			}
			// removePos and usedPositions are obtained from this pair and are not released separately
			zap ( listsOfRemovedPositions );
			zaparr ( arrayOfGenotypeValueList, rsList->size() );
			zaparr ( arrayOfSecondRankList, rsList->size() );
			zap ( listOfSecondTies );
			zap ( hasMissing );
			OutputFile.close();
			OutputFile2.close();
			OutputFile3.close();
			zap ( genomaSample );
			zap ( referenceGenomaSample );
			zap ( rsList );
			zap ( referenceRSList );
			zap ( individualList );
			zap ( tf );
			cout << "Results have been written in file " << outputFileName <<"\n";
		}
		catch ( BasicException& be ) {be.addMessage ( "\ncalled from void transcriptomeAssociation::main()" ); throw;}
	}
	// Exceptions are caught by reference so that the thrown object is neither copied nor sliced.
	catch ( NonProb& np ) {np.PrintMessage();}
	catch ( NoMemory& ) {cerr << "Error: not enough memory\n";}
	catch ( OutOfRange<int>& rr ) {rr.PrintMessage();}
	catch ( OutOfRange<long long int>& rl ) {rl.PrintMessage();}
	catch ( NullValue& nv ) {nv.PrintMessage();}
	catch ( NonInteger& nv ) {nv.PrintMessage();}
	catch ( Inconsistent& in ) {in.PrintMessage();}
	catch ( BadFormat& bf ) {bf.PrintMessage();}
	catch ( NonSNP& ns ) {ns.PrintMessage();}
	catch ( NonDefined& nf ) {nf.PrintMessage();}
	catch ( OverflowedSNP& os ) {os.PrintMessage();}
	catch ( OutOfBounds& ob ) {ob.PrintMessage();}
	catch ( NonImplemented& ni ) {ni.PrintMessage();}
	catch ( bad_alloc& ) {cout <<"bad alloc\n"; exit ( 0 );}
	catch ( ZeroValue& zv ) {zv.PrintMessage();}
	catch ( NoWindow& nw ) {nw.PrintMessage();}
	catch ( EmptyFile& ef ) {ef.PrintMessage();}
	catch ( AlreadyOpen& ao ) {ao.PrintMessage();}
	catch ( ErrorFile & eef ) {eef.PrintMessage();}
	catch ( NoEOL & ne ) {ne.PrintMessage();}
	catch ( AlreadyExist & aee ) {aee.PrintMessage();}
	catch ( NanValue & nv ) {nv.PrintMessage();}
	catch ( BasicException & be ) {be.PrintMessage();}
	return 0;
}






