##This program aims to get the most important sequence features(SelFeatureNum) in the cross validation with RF library(randomForest); library(e1071); RFSeqFeatSelect <- function(ChromArray,IChrom,Normed450KArray,SelFeatureNum) { FeatureData <- read.table(paste0("./SeqFeature450K/SeqFeature450KIn500bp_chr",ChromArray[IChrom],".txt"),sep='\t',header=TRUE); ###~~~~~~~~~~~~~~~~~~~~~~~~~~~Normlaize each feature into [0,1]~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~### MaxColVal <- apply(FeatureData[,-1],2,max); MinColVal <- apply(FeatureData[,-1],2,min); ###~~~~~~~~~~~~~~~~~~~~~~~~~~~Normlaize each feature into [0,1]~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ###~~~~~~~~~~~~~~~~~~~~~~~~Get the most FeatureNum common used features in each tree~~~~~~~~~~~~~~~~~~~~~~~~### IndexMatrix <- c(); SampleNum <- 10000; CVTimes <- 2; for(NumTree in 1:CVTimes) { # If the CpG site number is less than 10000, use all the sample for training, and only for once if(nrow(FeatureData) < SampleNum) { SampleNum <- nrow(FeatureData); NumTree <- CVTimes; } print(paste("Processing on ", NumTree, "th tree in ",CVTimes," times of cross validation...",sep = "")); SampleIndex <- sample(seq(1,nrow(FeatureData)),SampleNum); TrainData <- FeatureData[SampleIndex,]; Train450K <- Normed450KArray[SampleIndex]; #Transform the methylation data to log Train450K[which(Train450K == 0)] <- 0.00001; Train450K[which(Train450K == 1)] <- 0.99999; Train450K <- log(Train450K/(1-Train450K)); TrainFeature <- TrainData[,-1]; MinColValMatrix <- matrix(rep(MinColVal,nrow(TrainFeature)),nrow = nrow(TrainFeature),byrow=TRUE); MaxColValMatrix <- matrix(rep(MaxColVal,nrow(TrainFeature)),nrow = nrow(TrainFeature),byrow=TRUE); NormedTrainFeature <- (TrainFeature - MinColValMatrix)/(MaxColValMatrix - MinColValMatrix); ## Delete the CpGs with response methylation value of NA DataIndex <- c(1:length(Train450K)); NonNAIndex <- DataIndex[!is.na(Train450K)]; TrainResult <- randomForest(NormedTrainFeature[NonNAIndex,],Train450K[NonNAIndex],importance = TRUE, proximity = TRUE); FeatureImport <- importance(TrainResult); IndexMatrix <- c(IndexMatrix,order(FeatureImport[,1],decreasing=TRUE)[1:SelFeatureNum]); } TempFrames <- as.data.frame(table(IndexMatrix)); MostIndexLevel <- TempFrames[order(TempFrames[,2],decreasing = TRUE)[1:SelFeatureNum],1]; MostFeatureIndex <- as.numeric(MostIndexLevel); MostFeatureIndex ###~~~~~~~~~~~~~~~~~~~~~~~~Get the most FeatureNum common used features in each tree~~~~~~~~~~~~~~~~~~~~~~~~### }