## R should be installed before running this program. And some packages must also be installed including preprocessCore, e1071 and randomForest. ## All the downloaded data for the 14 model tissues must be in the same main directory, as well as the 450K array data provided by users. ## Input Parameters: ## [1] The "InputDataFolder" is the folder name containing the 450K data provided by user. Remember that the 450K data must be provided as one file for each chromosome(the file name must be named as like *chr1.txt), and must be sorted in increasing order in each file; and the given 450K data file has two columns separated with '\t'.The first column would be the location of C in the version of hg19, the second column is the methylation value ranging from 0 to 1. ## [2] The ChromArray is an array contains the chromosome numbers needs to be predicted, only the autosomes are included; ## [3] Choose what kind of model would be used ## if(SeqModel == 0), use a simplified model (with only features of X1 and X2) which is much faster; ## if(SeqModel == 1), use a full model (with features of X1, X2 and X3); X3 is predicted with the 50 most frequently selected sequences, which were derived from 14 model tissues. ## if(SeqModel == 2), use a full model (with features of X1, X2 and X3), Random Forest would be used to select the 50 most frequently selected features for each chromosome by cross validation strategy, and SVM would be used to get the feature X3 based on the selected 50 features; This would be comparatively time consuming and memory consuming ## Output: ## The predicted methylation values for all the expanded CpG sites of each chromosome would be output to the specified output filefolder ## In each file, there are two columns, the first column is the location, and the second column is the predicted methylation value library(preprocessCore) library(e1071) PredictWith450K <- function(InputDataFolder,OutputDataFolder,ChromArray,SeqModel) { ChromNum <- length(ChromArray); TissueArray <- c('H1','Liver','Lung','Muscle','Aorta','Pancreas','Hippocampus','H9','Adipose','Adrenal','Esophagus','Intestine','Spleen','Thymus'); WGBSAccArray <- c('GSM432685','GSM916049','GSM983647','GSM1172596','GSM1120329','GSM983651','GSM916050','GSM706059','GSM1120330','GSM1120325','GSM983649','GSM983646','GSM983652','GSM1172595'); for(IChrom in 1:ChromNum) { FullFileNameArray <- list.files(paste0("./",InputDataFolder),pattern = paste0("chr",ChromArray[IChrom],".txt"),full.names=TRUE); LocFileNameArray <- list.files(paste0("./",InputDataFolder),pattern = paste0("chr",ChromArray[IChrom],".txt"),full.names=FALSE); SampleNum <- length(FullFileNameArray); if(SampleNum == 0) { print(paste0("No data file with required format is provided for Chr",ChromArray[IChrom])); stop(); } print(paste0("Processing on Chr",ChromArray[IChrom]," ...")); ## Combine the local methylation pattern of Model tissues into one array Model450KPattern <- array(); # methylation pattern of 10 CpG sites ModelWGBSArray <- array(); # WGBS values of model tissues print(paste0("Reading the local methylation patterns of Chr",ChromArray[IChrom]," of the 14 model tissues")); for(JSample in 1:14) { ModelWGBSWith450K <- read.table(paste0("./WGBSWith450KFlank5K5CpG/",WGBSAccArray[JSample],"_",TissueArray[JSample],"_Overlap14TissAndRAWith450K_chr",ChromArray[IChrom],".txt"),sep='\t',header=FALSE); # Model450KPattern is the 450K value of flanking 5 CpGs Model450KPattern <- cbind(Model450KPattern,do.call(cbind,ModelWGBSWith450K[,c(9:13,19:23)])); # ModelWGBSArray is the real value of WGBS of each site ModelWGBSArray <- cbind(ModelWGBSArray,ModelWGBSWith450K[,3]); } Model450KPattern <- round(Model450KPattern*10000)/10000; # For reducing the memory size; ModelWGBSArray <- round(ModelWGBSArray*10000)/10000; # For reducing the memory size; Model450KPattern <- Model450KPattern[,-1]; ModelWGBSArray[,1] <- ModelWGBSWith450K[,2]; # The first column is the CpG location ## Read in 450K data of one model to get the number of CpG sites Model450KData <- read.table(paste0("./Normed450K14Tis/GSM1139417_H1_450KOverlap14AndRA_chr",ChromArray[IChrom],".txt"),sep='\t',header=FALSE); ## Read in the Ratio file of up and down CpGs RatioData <- read.table(paste0("./RatioFile/UpDownCpGWeight_chr",ChromArray[IChrom],"_5000bp.txt"),sep='\t',header=FALSE); ## Normalize the 450K array data to the 14 tissue samples Mapped450KArray <- matrix(rep(NA,nrow(Model450KData)*SampleNum),ncol=SampleNum); print(paste0("Reading the 450K array data of Chr",ChromArray[IChrom]," of the tissues to be predicted")); for(ISample in 1:SampleNum) { TissData <- read.table(FullFileNameArray[ISample],sep='\t',header=FALSE); ModelOverlapTisIndex <- which(Model450KData[,1] %in% TissData[,1]); TisOverlapModelIndex <- which(TissData[,1] %in% Model450KData[,1]); Mapped450KArray[ModelOverlapTisIndex,ISample] <- TissData[TisOverlapModelIndex,2]; } Normed450KArray <- normalize.quantiles.use.target(Mapped450KArray,Model450KData[,2]); Normed450KArray <- round(Normed450KArray*10000)/10000; # For reducing the memory size; WGBSWith450KValue <- matrix(rep(1,nrow(Model450KPattern)*10),ncol=10); # Methylation values of 10CpGs of tissues to be predicted PredictbySimTis <- rep(1,nrow(Model450KPattern)); for(ISample in 1:SampleNum) { print(paste0("Calculating the feature X1 of Chr",ChromArray[IChrom]," of Sample ",ISample)); ##~~~~~~~~~~~~~~~~ Predit x1 according to similarity of the local methylation pattern~~~~~~~~~~~~~~~~~~~########## ## Get the 450K methylation values of upstream and downstream 5 CpGs LocationCol <- c(4:8,14:18); for(ICpG in 1:10) { LocIndex <- match(ModelWGBSWith450K[,LocationCol[ICpG]],Model450KData[,1]); WGBSWith450KValue[,ICpG] <- Normed450KArray[LocIndex,ISample]; } ## Compare the local methylation pattern with 14 model samples for(ILoc in 1:nrow(ModelWGBSWith450K)) { SelModel450KPattern <- matrix(rep(1,14*10),nrow=14); for(JSample in 1:14) { SelModel450KPattern[JSample,] <- Model450KPattern[ILoc,seq((JSample-1)*10+1,JSample*10)]; } CorreVect <- apply(SelModel450KPattern,1,cor,use='na.or.complete',y=WGBSWith450KValue[ILoc,]); MaxIndex <- which(CorreVect == max(CorreVect)); PredictbySimTis[ILoc] <- ModelWGBSArray[ILoc,MaxIndex[1]+1]; } ##~~~~~~~~~~~~~~~~~~~ Predit x2 according to the upstream and downstream CpGs~~~~~~~~~~~~~~~~~~~~~~########## print(paste0("Calculating the feature X2 of Chr",ChromArray[IChrom]," of Sample ",ISample)); WeightArray <- WGBSWith450KValue[,c(5,6)]*RatioData; PredictbyUpDowCpGs <- apply(WeightArray,1,sum); ##~~~~~~~~~~~~~~~~~~~ Predit x3 according to the sequence feature model~~~~~~~~~~~~~~~~~~~~~~########## ##~~~~~~~~~~~~~~~~~~~ AND ~~~~~~~~~~~~~~~~~~~~~########## ##~~~~~~~~~~~~~~~~~~~ Predict the final methylation values based on Logistic Regression model~~~~~~~~~~~~~~~~~~~~~~########## ## if SeqModel is equal to 0, ignore the sequence feature, predict with only features x1 and x2 ### ## if SeqModel is equal to 1, use 50 most frequently selected sequence features from 14 model tissues, then predict X3 with svm model### ## if SeqModel is equal to 2, Train the sequence model with RF by CV, select 50 most frequently selected sequence features, predict with svm model### if(SeqModel == 0) { print(paste0("Predicting on Chr",ChromArray[IChrom]," of Sample ",ISample," with features X1 and X2")); load(paste0("./ModelwithoutSeq/TrainedModelWith14Tis_WithoutSeq_chr",ChromArray[IChrom],".rda")); ToPredictData <- data.frame(TisVa=PredictbySimTis,M450KVa=PredictbyUpDowCpGs); PredictbyAll <- predict(LRModel,ToPredictData,type='response'); PredictbyAll450KReplace <- PredictbyAll; WGBSIndexOverlapwith450K <- which(ModelWGBSWith450K[,2] %in% Model450KData[,1]); M450KIndexOverlapwithWGBS <- which(Model450KData[,1] %in% ModelWGBSWith450K[,2]); PredictbyAll450KReplace[WGBSIndexOverlapwith450K] <- Mapped450KArray[M450KIndexOverlapwithWGBS]; } else { if(SeqModel == 1) { print(paste0("Using the 50 most importantant sequence features in CV for deriving the feature X3 of Chr",ChromArray[IChrom]," for Sample ",ISample)); FeatData <- read.table(paste0("./Top50SeqFeatureIndex/Top50SeqFeaturein14Tis_chr",ChromArray[IChrom],".txt"),sep='\t',header=TRUE); MostSelFeat <- FeatData[,1]; } if(SeqModel == 2) { source("RFSeqFeatSelect.R"); print(paste0("Running RF to get the 50 most importantant sequence features for deriving the feature X3 of Chr",ChromArray[IChrom]," for Sample ",ISample)); MostSelFeat <- RFSeqFeatSelect(ChromArray,IChrom,Normed450KArray[,ISample],50); } FeatureData <- read.table(paste0("./SeqFeature450K/SeqFeature450KIn500bp_chr",ChromArray[IChrom],".txt"),sep='\t',header=TRUE); Train450K <- Normed450KArray[,ISample]; Train450K[which(Train450K == 0)] <- 0.00001; Train450K[which(Train450K == 1)] <- 0.99999; Train450K <- log(Train450K/(1-Train450K)); TrainFeatureSel <- FeatureData[,MostSelFeat+1]; # The sequence features start from the second col. DataIndex <- c(1:length(Train450K)); NonNAIndex <- DataIndex[!is.na(Train450K)]; SVMModelSeq <- svm(TrainFeatureSel[NonNAIndex,],Train450K[NonNAIndex],scale=TRUE,type='eps-regression',kernel='radial'); #### Predict X3 based on the SVM models above print(paste0("Calculating the feature X3 of Chr",ChromArray[IChrom]," of Sample ",ISample)); ExpandSequData <- read.table(paste0("./SeqFeatureWGBSFlank5K5CpG/SeqFeatureWGBSIn500bp_chr",ChromArray[IChrom],".txt"),sep='\t',header=TRUE); ExpandSequFeatureSel <- ExpandSequData[,MostSelFeat+1]; PredictResults <- predict(SVMModelSeq,ExpandSequFeatureSel); FitResultProb <- exp(PredictResults)/(1+exp(PredictResults)); PredictbySeq <- round(FitResultProb*10000)/10000; # For reducing the memory size; print(paste0("Predicting on Chr",ChromArray[IChrom]," of Sample ",ISample," with features X1, X2 and X3")); load(paste0("./ModelwithSeq/TrainedModelWith14Tis_WithSeq_chr",ChromArray[IChrom],".rda")) ToPredictData <- data.frame(TisVa=PredictbySimTis,M450KVa=PredictbyUpDowCpGs,SeqVa=PredictbySeq); PredictbyAll <- predict(LRModel,ToPredictData,type='response'); PredictbyAll <- round(PredictbyAll*10000)/10000; # For reducing the memory size; PredictbyAll450KReplace <- PredictbyAll; WGBSIndexOverlapwith450K <- which(ModelWGBSWith450K[,2] %in% Model450KData[,1]); M450KIndexOverlapwithWGBS <- which(Model450KData[,1] %in% ModelWGBSWith450K[,2]); PredictbyAll450KReplace[WGBSIndexOverlapwith450K] <- Mapped450KArray[M450KIndexOverlapwithWGBS]; } PredictbyAll450KReplace <- round(PredictbyAll450KReplace*10000)/10000; PredictOut <- cbind(ModelWGBSWith450K[,2],PredictbyAll450KReplace) if(file.exists(paste0('./',OutputDataFolder,'/'))) {}else { dir.create(paste0('./',OutputDataFolder,'/'),showWarnings=FALSE) } OutFile <- paste0("./",OutputDataFolder,"/ExpandWithSeqModel_",SeqModel,"_",LocFileNameArray[ISample]); write.table(PredictOut,file=OutFile,sep='\t',col.names=FALSE,row.names=FALSE,quote=FALSE); } cat('\n'); } }