/* File: subset.cpp */


#ifndef __subset_cpp__
#define __subset_cpp__


#include "subset.h"// //




namespace BIOS {
	
/*************************************************************************/// 
/*									 */// 
/*      Evaluation of the subsetting of a discrete attribute		 */// 
/*      ----------------------------------------------------		 */// 
/*									 */// 
/*************************************************************************/// 
// 
// 
/*************************************************************************/
/*                                                                       */
/*  Merge the distribution figures of discrete value y into those of x. */
/*                                                                       */
/*  The figures held for x and y before the merge are parked in Slice1   */
/*  and Slice2 respectively (per-class counts in [0..MaxClass], the      */
/*  value total in [MaxClass+1]) so that Uncombine() can put them back.  */
/*  Slot y is then overwritten with the figures of value Last.           */
/*                                                                       */
/*************************************************************************/


void Combine(DiscrValue x, DiscrValue y, DiscrValue Last)
/*  -------  */
{
    ClassNo Cls;

    /*  Value totals: remember both, fold y into x, refill y from Last  */

    Slice1[MaxClass+1] = ValFreq[x];
    Slice2[MaxClass+1] = ValFreq[y];

    ValFreq[x] = ValFreq[x] + ValFreq[y];
    ValFreq[y] = ValFreq[Last];

    /*  Per-class counts: the same three steps for every class  */

    ForEach(Cls, 0, MaxClass)
    {
        Slice1[Cls] = Freq[x][Cls];
        Slice2[Cls] = Freq[y][Cls];

        Freq[x][Cls] = Freq[x][Cls] + Freq[y][Cls];
        Freq[y][Cls] = Freq[Last][Cls];
    }
}
// 
// 
// 
/*************************************************************************/
/*                                                                       */
/*  Undo a Combine(): copy the figures saved in Slice1 back to value x   */
/*  and those saved in Slice2 back to value y (per-class counts from     */
/*  [0..MaxClass], the value total from [MaxClass+1]).                   */
/*                                                                       */
/*************************************************************************/


void Uncombine(DiscrValue x, DiscrValue y)
/*  ---------  */
{
    ClassNo Cls;

    /*  Value totals first ...  */

    ValFreq[x] = Slice1[MaxClass+1];
    ValFreq[y] = Slice2[MaxClass+1];

    /*  ... then the count for each class  */

    ForEach(Cls, 0, MaxClass)
    {
        Freq[x][Cls] = Slice1[Cls];
        Freq[y][Cls] = Slice2[Cls];
    }
}
// 
/*************************************************************************/
/*                                                                       */
/*  Print, on standard output, the names of those values of attribute    */
/*  Att (values 1..MaxAttVal[Att]) that are members of the set Ss.       */
/*  Names are separated by ", "; nothing is printed before the first     */
/*  name or after the last one.                                          */
/*                                                                       */
/*************************************************************************/


void PrintSubset(AttributeC45 Att, Setc45 Ss)
/*  -----------  */
{
    DiscrValue Val;
    Boolean NeedSeparator=false;

    ForEach(Val, 1, MaxAttVal[Att])
    {
        if ( In(Val, Ss) )
        {
            /*  Every name but the first is preceded by a separator  */

            if ( NeedSeparator ) printf(", ");
            NeedSeparator = true;

            printf("%s", AttValName[Att][Val]);
        }
    }
}
// 
// 
// 
// 
// 
// 
/*************************************************************************/
/*                                                                       */
/*  Evaluate subsetting a discrete attribute and form the chosen         */
/*  subsets Subset[Att][], setting Subsets[Att] to the number of         */
/*  subsets, and the Info[], Gain[] and EmpRisk[] of a test on the       */
/*  attribute.                                                           */
/*                                                                       */
/*  Att        attribute being evaluated                                 */
/*  Fp, Lp     first and last item, passed to ComputeFrequencies()       */
/*  Items      total item weight; divisor for the split information      */
/*  criterion  -5, -3, -1 or 1 select the gain/worth path; any other     */
/*             value takes the risk path in the merge loop (the final    */
/*             rejection test recognises -4, -2 and 0 as risk values)    */
/*                                                                       */
/*  Outline:                                                             */
/*    1. compute frequencies and the figures of the full split;          */
/*    2. give every represented value its own block, collecting the      */
/*       unrepresented values in Subset[Att][0];                         */
/*    3. merge single-class blocks of the same class;                    */
/*    4. hill-climb: repeatedly merge the best pair of blocks while      */
/*       more than two blocks remain and the merge is acceptable;        */
/*    5. store the results, or mark the attribute as unusable            */
/*       (Gain = -Epsilon, Info = 0, EmpRisk = maxreal).                 */
/*                                                                       */
/*  MINOBJS is forced to 1 for the duration of the call and restored     */
/*  on every exit path; the saved value is still the threshold used      */
/*  when counting "reasonable" subsets.                                  */
/*                                                                       */
/*************************************************************************/


void EvalSubset(AttributeC45 Att, ItemNo Fp, ItemNo Lp, ItemCount Items, int criterion)
/*  ----------  */
{
    DiscrValue V1, V2, BestV1, BestV2 = 0, Barred;
    ItemCount KnownItems;
    ClassNo c;
    float BaseInfo, MinGain, ThisGain, ThisInfo, MaxRisk, ThisRisk,
          Val, BestVal, BestGain, BestInfo, BestRisk,
          PrevVal, PrevGain, PrevInfo, PrevRisk;
    float DiscrKnownBaseInfo(ItemCount, DiscrValue);
    float Worth(float, float, float);
    float ComputeGain(float, float, DiscrValue, ItemCount);
    float TotalInfo(ItemCount*, short, DiscrValue);
    short Blocks=0, MissingValues=0, ReasonableSubsets, Bytes, b;
    Boolean MergedSubsets = false;
    int SaveMINOBJS;

    /*  The gain/worth path is used for these criterion values only  */

    const bool UseGain = ( criterion == -5 || criterion == -3 ||
                           criterion == -1 || criterion == 1 );

    SaveMINOBJS = MINOBJS;
    MINOBJS = 1;

    /*  First compute Freq[][], ValFreq[], base info, and the gain
        and total info of a split on discrete attribute Att  */

    ComputeFrequencies(Att, Fp, Lp);

    KnownItems = Items - ValFreq[0];
    if ( KnownItems < Epsilon )
    {
        Verbosity(2) printf("\tAtt %s: no known values\n", AttName[Att]);

        Gain[Att] = -Epsilon;
        Info[Att] = 0;
        EmpRisk[Att] = maxreal;

        /*  Do not leak the temporary MINOBJS = 1 to the caller  */

        MINOBJS = SaveMINOBJS;
        return;
    }

    BaseInfo = DiscrKnownBaseInfo(KnownItems, MaxAttVal[Att]);

    PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], MaxAttVal[Att], KnownItems);
    PrevInfo = TotalInfo(ValFreq, 0, MaxAttVal[Att]) / Items;
    PrevRisk = ComputeRisk(MaxAttVal[Att], KnownItems, criterion);
    PrevVal  = Worth(PrevInfo, PrevGain, Epsilon);

    Verbosity(2)
    {
        printf("\tAtt %s", AttName[Att]);

        Verbosity(3) PrintDistribution(Att, MaxAttVal[Att], true);

        printf("\tinf %.3f, gain %.3f, val=%.3f\n",
                PrevInfo, PrevGain, PrevVal);
    }

    /*  Eliminate unrepresented attribute values from Freq[] and ValFreq[]
        and form a separate subset for each represented attribute value.
        Unrepresented values are collected in Subset[Att][0].  */

    Bytes = (MaxAttVal[Att]>>3) + 1;
    ClearBits(Bytes, Subset[Att][0]);

    ForEach(V1, 1, MaxAttVal[Att])
    {
        if ( ValFreq[V1] > 0.5 )
        {
            /*  Compact the figures downwards if earlier values were dropped  */

            if ( ++Blocks < V1 )
            {
                ValFreq[Blocks] = ValFreq[V1];
                ForEach(c, 0, MaxClass)
                {
                    Freq[Blocks][c] = Freq[V1][c];
                }
            }
            ClearBits(Bytes, Subset[Att][Blocks]);
            SetBit(V1, Subset[Att][Blocks]);
        }
        else
        {
            SetBit(V1, Subset[Att][0]);
            MissingValues++;
        }
    }

    /*  Merge any single-class subsets with others of the same class  */
    /*  Note: have ValFreq[V] > 0 for all V  */

    ForEach(V1, 1, Blocks-1)
    {
        /*  Find the first class with a non-negligible count; the scan is
            bounded so that a block whose weight is spread thinly over
            many classes cannot run off the end of Freq[V1][]  */

        for ( c = 0 ; c <= MaxClass && Freq[V1][c] < 0.1 ; c++ )
            ;

        if ( c > MaxClass || Freq[V1][c] < ValFreq[V1] - 0.1 ) continue;

        /*  Now have a single class -- look for others  */

        for ( V2 = V1+1 ; V2 <= Blocks ; )
        {
            if ( Freq[V2][c] < ValFreq[V2] - 0.1 )
            {
                V2++;
            }
            else
            {
                /*  Merge these subsets; block V2 is refilled from the last
                    block, so V2 is deliberately not advanced  */

                Combine(V1, V2, Blocks);

                ForEach(b, 0, Bytes-1)
                {
                    Subset[Att][V1][b] |= Subset[Att][V2][b];
                    Subset[Att][V2][b] = Subset[Att][Blocks][b];
                }

                Blocks--;
                MergedSubsets = true;
            }
        }
    }

    if ( MergedSubsets )
    {
        PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], Blocks, KnownItems);
        PrevInfo = TotalInfo(ValFreq, 0, Blocks) / Items;
        PrevRisk = ComputeRisk(Blocks, KnownItems, criterion);
        PrevVal  = Worth(PrevInfo, PrevGain, Epsilon);

        Verbosity(2)
        {
            printf("\tAfter merging single-class subsets:");

            Verbosity(3) PrintDistribution(Att, Blocks, false);

            printf("\tinf %.3f, gain %.3f, val=%.3f\n",
                    PrevInfo, PrevGain, PrevVal);
        }
    }

    /*  Examine possible pair mergers and hill-climb  */

    MinGain = PrevGain / 2;
    MaxRisk = PrevRisk*2;

    while ( Blocks > 2 )
    {
        BestVal = BestV1 = 0;
        BestV2 = 0;
        BestGain = -Epsilon;
        BestInfo = 0;
        BestRisk = maxreal;

        /*  Check reasonable subsets; if less than 3, bar mergers
            involving the largest block  */

        ReasonableSubsets = 0;
        Barred = 1;

        ForEach(V1, 1, Blocks)
        {
            if ( ValFreq[V1] >= SaveMINOBJS ) ReasonableSubsets++;

            if ( ValFreq[V1] > ValFreq[Barred] ) Barred = V1;
        }

        if ( ReasonableSubsets >= 3 ) Barred = 0;

        /*  A merger is forced when there are fewer than two reasonable
            subsets or when the plain GAIN criterion is in use  */

        const bool ForceMerge = ( ! GAINRATIO || ReasonableSubsets < 2 );

        /*  For each possible pair of values, calculate the gain and
            total info of a split in which they are treated as one.
            Keep track of the pair with the best gain.  */

        ForEach(V1, 1, Blocks-1)
        {
            ForEach(V2, V1+1, Blocks)
            {
                if ( V1 == Barred || V2 == Barred ) continue;

                Combine(V1, V2, Blocks);

                ThisGain = ComputeGain(BaseInfo, UnknownRate[Att],
                                        Blocks-1, KnownItems);
                ThisInfo = TotalInfo(ValFreq, 0, Blocks-1) / Items;
                Val      = Worth(ThisInfo, ThisGain, Epsilon);

                /*  The risk is only needed on the risk path; it must be
                    evaluated here, while the pair is still combined  */

                ThisRisk = maxreal;
                if ( ! UseGain )
                {
                    ThisRisk = ComputeRisk(Blocks-1, KnownItems, criterion);
                }

                Verbosity(4)
                {
                    printf("\tcombine %d %d info %.3f gain %.3f val %.3f",
                           V1, V2, ThisInfo, ThisGain, Val);
                    PrintDistribution(Att, Blocks-1, false);
                }

                /*  Force a split if
                        less than two reasonable subsets, or
                        using GAIN criterion
                    Prefer this split to the previous one if
                        gain >= MinGain (and previous < MinGain), or
                        val >= previous best val  */

                if ( UseGain )
                {
                    /*  The force condition only decides whether the first
                        candidate is accepted unconditionally; it must not
                        gate the preference tests, otherwise no merger is
                        ever considered under GAINRATIO with two or more
                        reasonable subsets  */

                    if ( ( ThisGain >= MinGain && BestGain < MinGain ) ||
                         Val >= BestVal ||
                         ( ! BestV1 && ForceMerge ) )
                    {
                        BestVal  = Val;
                        BestGain = ThisGain;
                        BestInfo = ThisInfo;
                        BestV1   = V1;
                        BestV2   = V2;
                    }
                }
                else
                /*  NOTE(review): the risk path keeps its original rule --
                    only under the force condition, and only the first
                    pair whose risk is within MaxRisk is taken.  Confirm
                    whether the pair of minimum risk was intended.  */
                if ( ForceMerge &&
                     ThisRisk <= MaxRisk && BestRisk > MaxRisk )
                {
                    BestRisk = ThisRisk;
                    BestVal  = Val;
                    BestGain = ThisGain;
                    BestInfo = ThisInfo;
                    BestV1   = V1;
                    BestV2   = V2;
                }

                Uncombine(V1, V2);
            }
        }

        /*  Stop hill-climbing when the best merger is not an improvement  */

        if ( UseGain )
        {
            if ( GAINRATIO &&
                 ReasonableSubsets >= 2 &&
                 ( ! BestV1 ||
                   BestVal < PrevVal + 1E-5 ||
                   ( BestVal == PrevVal && BestGain < PrevGain ) ) ) break;
        }
        else
        if ( ReasonableSubsets >= 2 &&
             ( ! BestV1 || BestRisk > PrevRisk ) ) break;

        /*  No candidate was selected at all: there is nothing to merge,
            and BestV1/BestV2 must not be used as block indices  */

        if ( ! BestV1 ) break;

        PrevGain = BestGain;
        PrevInfo = BestInfo;
        PrevRisk = BestRisk;
        PrevVal  = BestVal;

        Combine(BestV1, BestV2, Blocks);

        ForEach(b, 0, Bytes-1)
        {
            Subset[Att][BestV1][b] |= Subset[Att][BestV2][b];
            Subset[Att][BestV2][b] = Subset[Att][Blocks][b];
        }

        Blocks--;

        Verbosity(2)
        {
            printf("\t\tform subset ");
            PrintSubset(Att, Subset[Att][BestV1]);
            printf(": %d subsets, inf %.3f, gain %.3f, val %.3f\n",
                   Blocks, BestInfo, BestGain, BestVal);
            Verbosity(3)
            {
                printf("\t\tcombine %d, %d", BestV1, BestV2);
                PrintDistribution(Att, Blocks, false);
            }
        }
    }

    MINOBJS = SaveMINOBJS;

    /*  Reject the attribute if the final split is worthless under the
        criterion in use  */

    if ( ( PrevVal <= 0 && UseGain ) ||
         ( PrevRisk >= maxreal &&
           ( criterion == -4 || criterion == -2 || criterion == 0 ) ) )
    {
        Gain[Att] = -Epsilon;
        Info[Att] = 0;
        EmpRisk[Att] = maxreal;
    }
    else
    {
        Gain[Att] = ComputeGain(BaseInfo, UnknownRate[Att], Blocks, KnownItems);
        Info[Att] = PrevInfo;
        EmpRisk[Att] = ComputeRisk(Blocks, KnownItems, criterion);

        /*  Unrepresented values, if any, form one extra final subset  */

        if ( MissingValues )
        {
            Blocks++;
            CopyBits(Bytes, Subset[Att][0], Subset[Att][Blocks]);
        }

        Subsets[Att] = Blocks;

        Verbosity(2) printf("\tFinal subsets:");
        Verbosity(3) PrintDistribution(Att, Blocks, false);
        Verbosity(2)
            printf("\tinf %.3f gain %.3f val %.3f\n",
                   Info[Att], Gain[Att], Worth(Info[Att], Gain[Att], Epsilon));
    }
}
// 
// 
// 
// 
/*************************************************************************/
/*                                                                       */
/*  Turn Node into a test on subsets of the values of attribute Att.     */
/*                                                                       */
/*  Node is sprouted with Subsets[Att] branches, marked as a BrSubset    */
/*  test on Att with zero errors, and given its own copy of each of      */
/*  the subsets Subset[Att][1..Node->Forks].  Nothing is returned;       */
/*  the node is modified in place.                                       */
/*                                                                       */
/*  NOTE(review): the results of calloc/malloc are not checked.          */
/*                                                                       */
/*************************************************************************/


void SubsetTest(TreeC45 Node, AttributeC45 Att)
/*  -----------  */
{
    ItemCount CountItems();    /*  declared but not used in this function  */
    short Branch, SetBytes;

    Sprout(Node, Subsets[Att]);

    Node->Errors   = 0;
    Node->Tested   = Att;
    Node->NodeType = BrSubset;

    /*  One slot per subset plus slot 0, which stays zeroed  */

    Node->Subset = (Setc45 *) calloc(Subsets[Att] + 1, sizeof(Setc45));

    /*  Size in bytes of a bit set able to hold values 0..MaxAttVal[Att]  */

    SetBytes = (MaxAttVal[Att]>>3) + 1;

    ForEach(Branch, 1, Node->Forks)
    {
        Node->Subset[Branch] = (Setc45) malloc(SetBytes);
        CopyBits(SetBytes, Subset[Att][Branch], Node->Subset[Branch]);
    }
}
}
#endif
