##This program aims to get the most important sequence features(SelFeatureNum) in the cross validation with RF

library(randomForest);
library(e1071);
RFSeqFeatSelect <- function(ChromArray,IChrom,Normed450KArray,SelFeatureNum)
{
	FeatureData <- read.table(paste0("./SeqFeature450K/SeqFeature450KIn500bp_chr",ChromArray[IChrom],".txt"),sep='\t',header=TRUE);

###~~~~~~~~~~~~~~~~~~~~~~~~~~~Normlaize each feature into [0,1]~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~### 
	MaxColVal <- apply(FeatureData[,-1],2,max);
	MinColVal <- apply(FeatureData[,-1],2,min);
###~~~~~~~~~~~~~~~~~~~~~~~~~~~Normlaize each feature into [0,1]~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~### 

###~~~~~~~~~~~~~~~~~~~~~~~~Get the most FeatureNum common used features in each tree~~~~~~~~~~~~~~~~~~~~~~~~###
	IndexMatrix <- c();
	SampleNum <- 10000;
	CVTimes <- 2;
	for(NumTree in 1:CVTimes)
	{
# If the CpG site number is less than 10000, use all the sample for training, and only for once			
		if(nrow(FeatureData) < SampleNum)
		{
			SampleNum <- nrow(FeatureData);
			NumTree <- CVTimes;
		}
			
		print(paste("Processing on ", NumTree, "th tree in ",CVTimes," times of cross validation...",sep = ""));
		SampleIndex <- sample(seq(1,nrow(FeatureData)),SampleNum);
		TrainData <- FeatureData[SampleIndex,];
			
		Train450K <- Normed450KArray[SampleIndex];
#Transform the methylation data to log
		Train450K[which(Train450K == 0)] <- 0.00001;
		Train450K[which(Train450K == 1)] <- 0.99999;
		Train450K <- log(Train450K/(1-Train450K));
		TrainFeature <- TrainData[,-1];

		MinColValMatrix <- matrix(rep(MinColVal,nrow(TrainFeature)),nrow = nrow(TrainFeature),byrow=TRUE);
		MaxColValMatrix <- matrix(rep(MaxColVal,nrow(TrainFeature)),nrow = nrow(TrainFeature),byrow=TRUE);
		NormedTrainFeature <- (TrainFeature - MinColValMatrix)/(MaxColValMatrix - MinColValMatrix);

## Delete the CpGs with response methylation value of NA
		DataIndex <- c(1:length(Train450K));
		NonNAIndex <- DataIndex[!is.na(Train450K)]; 

		TrainResult <- randomForest(NormedTrainFeature[NonNAIndex,],Train450K[NonNAIndex],importance = TRUE, proximity = TRUE);
		FeatureImport <- importance(TrainResult);
		IndexMatrix <- c(IndexMatrix,order(FeatureImport[,1],decreasing=TRUE)[1:SelFeatureNum]);

	}
	TempFrames <- as.data.frame(table(IndexMatrix));
	MostIndexLevel <- TempFrames[order(TempFrames[,2],decreasing = TRUE)[1:SelFeatureNum],1];
	MostFeatureIndex <- as.numeric(MostIndexLevel);
	
	MostFeatureIndex	
###~~~~~~~~~~~~~~~~~~~~~~~~Get the most FeatureNum common used features in each tree~~~~~~~~~~~~~~~~~~~~~~~~###
	
}