Kevin2015的个人博客分享 http://blog.sciencenet.cn/u/Kevin2015

博文

生存分析案例

已有 4143 次阅读 2016-7-21 21:33 |个人分类:知识点专题|系统分类:科研笔记

#!/usr/bin/Rscript

library(survival)
library(limma)
file.create("survive.txt")
#files
tumors <- scan("tumor_list",what="")
for (tumor in tumors){
   exp_file <- paste("/home/zhanghl/resource/raw_counts/",tumor,".uncv2.mRNAseq_raw_counts.txt.rpkm",sep="")

#read in files
   rna=read.table(exp_file,header=TRUE,row.names=1,sep="t",stringsAsFactors=FALSE)

#get the index of the normal/tumor samples
   t_index <- which(substr(colnames(rna),14,15) == "01")
   n_index <- which(substr(colnames(rna),14,15) == "11")
   
#first I remove genes whose expression is == 0 in more than 50% of the samples:
   rem <- function(x){
       x <- as.matrix(x)
       x <- t(apply(x,1,as.numeric))
       r <- as.numeric(apply(x,1,function(i) sum(i == 0)))
       remove <- which(r > dim(x)[2]*0.8)
       return(remove)
   }
   remove <- rem(rna)
   rna <- rna[-remove,]
   
#calculate z-scores :(value - mean normal)/SD normal
#    scal <- function(x,y){
#        mean_n <- rowMeans(y)  # mean of normal
#        sd_n <- apply(y,1,sd)  # SD of normal
#        res <- matrix(nrow=nrow(x), ncol=ncol(x))
#        colnames(res) <- colnames(x)
#        rownames(res) <- rownames(x)
#        for(i in 1:dim(x)[1]){
#            for(j in 1:dim(x)[2]){
#                res[i,j] <- (x[i,j]-mean_n[i])/sd_n[i]
#            }
#        }
#    return(res)
#    }

   z_rna <- rna[,t_index]
#set the rownames and colnames
   colnames(z_rna) <- gsub("\.","-",substr(colnames(rna[,t_index]),1,12))
   rm(rna)
print(paste(i,"-----------------finish first part",sep=""))

#######################################################################
# read in clinical data
   clinical_file      <-       paste("/home/zhanghl/resource/raw_counts/",tumor,".merged_only_clinical_clin_format.txt",sep="")
   clinical           <-       t(read.table(clinical_file,header=T, row.names=1, sep="t",fill=TRUE,quote = "",stringsAsFactors=FALSE))
   clinical           <-       as.data.frame(clinical)
   clinical$IDs       <-       toupper(clinical$patient.bcr_patient_barcode)
   rownames(clinical) <-       clinical$IDs
   sum(clinical$IDs %in% colnames(z_rna))
   
#days_to_new_tumor_event_after_initial_treatment : isolate the max value
   ind_keep <- grep("days_to_new_tumor_event_after_initial_treatment",colnames(clinical))
   new_tum <- as.matrix(clinical[,ind_keep])
   new_tum_collapsed <- c()
   for (k in 1:nrow(new_tum)){
       if(sum(is.na(new_tum[k,])) < ncol(new_tum)){
               m <- max(new_tum[k,],na.rm=T)
               new_tum_collapsed <- c(new_tum_collapsed,m)
         } else {
               new_tum_collapsed <- c(new_tum_collapsed,"NA")
             }
   }

#do the same to death: isolate the max value
   ind_keep <- grep("days_to_death",colnames(clinical))
   death <- as.matrix(clinical[,ind_keep])
   death_collapsed <- c()
   for (k in 1:nrow(death)){
       if( sum(is.na(death[k,])) < ncol(death) ){
           m <- max(death[k,],na.rm=T)
           death_collapsed <- c(death_collapsed,m)
         } else {
               death_collapsed <- c(death_collapsed,"NA")
           }
   }
   
#days_to_last_followup: isolate the min value
   ind_keep <- grep("days_to_last_followup",colnames(clinical))
   fl <- as.matrix(clinical[,ind_keep])
   fl_collapsed <- c()
   for (k in 1:nrow(fl)){
       if( sum(is.na(fl[k,])) < ncol(fl) ){
               m <- min(fl[k,],na.rm=T)
               fl_collapsed <- c(fl_collapsed,m)
         } else {
               fl_collapsed <- c(fl_collapsed,"NA")
             }
   }

#put everything together
   all_clin <- data.frame(new_tum_collapsed,death_collapsed,fl_collapsed)
   colnames(all_clin) <- c("new_tumor_days", "death_days", "followUp_days")
#now, to do survival analysis we need three main things:
#1- time: this is the time till an event happens
#2- status: this indicates which patients have to be kept for the analysis
#3- event: this tells i.e. which patients have the gene up- or down-regulated or have no changes in expression

#since we want to do censored analysis we need to have something to censor the data with.
#for example, if a patient has no death data BUT there is a date to last followup
#it means that after that day we know nothing about the patient, therefore after that day it
#cannot be used for calculations/Kaplan Meier plot anymore, therefore we censor it.
#so now we need to create vectors for both "time to new tumor" and 'time to death" that contain
#also the data from censored individuals

# create vector with time to new tumor containing data to censor for new_tumor
all_clin$new_time <- c()
for (i in 1:length(as.numeric(as.character(all_clin$new_tumor_days)))){
 all_clin$new_time[i] <- ifelse(is.na(as.numeric(as.character(all_clin$new_tumor_days))[i]),
                   as.numeric(as.character(all_clin$followUp_days))[i],as.numeric(as.character(all_clin$new_tumor_days))[i])
}

# create vector time to death containing values to censor for death
all_clin$new_death <- c()
for (i in 1:length(as.numeric(as.character(all_clin$death_days)))){
 all_clin$new_death[i] <- ifelse(is.na(as.numeric(as.character(all_clin$death_days))[i]),
                                as.numeric(as.character(all_clin$followUp_days))[i],as.numeric(as.character(all_clin$death_days))[i])
}

#now we need to create the vector for censoring the data which means telling R which patients are dead or have new tumor. in this case
#if a patient has a “days_to_death” it will be assigned 1, and used in the corresponding analysis. the reason why we
#censor with death events even for recurrence is pretty important. a colleague made me notice that this is a competitive
#risk problem, where, although a patient can recur and then die, if a patient is dead, it will not recur, therefore is
#more accurate to censor for death events.

#create vector for death censoring
   all_clin$death_event <- ifelse(clinical$patient.vital_status == "alive", 0,1)

#finally add row.names to clinical
   rownames(all_clin) <- clinical$IDs

#create event vector for RNASeq data
   #event_rna <- t(apply(z_rna,1,function(i) ifelse(i>1.96,1,ifelse(i< -1.96,0,2)))) #up and down  1:>1.96;0: <-1.96; 2: >-1.96 and <1.96
   #event_rna <- t(apply(z_rna, 1, function(x) ifelse(abs(x) > 1.96,1,0))) # altered and not altered
   event_rna <- matrix(ncol=ncol(z_rna),nrow=nrow(z_rna))
   event_rna <- as.data.frame(event_rna)
   colnames(event_rna) <- colnames(z_rna)
   rownames(event_rna) <- rownames(z_rna)
   med <- apply(z_rna,1,median,na.rm=TRUE)
   for (m in 1:nrow(z_rna)){
       for (n in 1:ncol(z_rna)){
           event_rna[m,n] <- ifelse(z_rna[m,n] > med[m],1,0)
       }
   }

#pick your gene
   gene_list <- scan("survival_gene_list",what="")
   genes     <- intersect(rownames(event_rna),gene_list)
   print(paste("intersect genes",intersect(genes,rownames(event_rna))))
   common_samples <- intersect( colnames(event_rna),rownames(all_clin) )

#Calculate each gene
for (gene in genes){  
   factor <- factor(as.numeric(event_rna[gene,common_samples]))
   time <- all_clin[common_samples,"new_death"]
   event <- all_clin[common_samples,"death_event"]
#survdiff
   s <- survfit(Surv(time,event)~factor)
   s1 <- tryCatch(survdiff(Surv(time,event)~factor), error = function(e) return(NA))
   pv <- ifelse(is.na(s1),next,(round(1 - pchisq(s1$chisq, length(s1$n) - 1),3)))[[1]]
#HR
   HR <- (s1$obs[2]/s1$exp[2])/(s1$obs[1]/s1$exp[1])
   up95  <- exp( log(HR)+ qnorm(0.975)*sqrt(1/s1$exp[2] + 1/s1$exp[1]) )
   low95 <- exp( log(HR)- qnorm(0.975)*sqrt(1/s1$exp[2] + 1/s1$exp[1]) )
#high_num and low_num
   high_num <- table(as.matrix(event_rna[gene,]))[[2]]
   low_num <- table(as.matrix(event_rna[gene,]))[[1]]
#plot data
   png(paste(tumor,gene,".survive.png",sep=""))
   plot(s,col=c(1:3), frame=F, lwd=2, main=paste(tumor,gene,sep=":"))
# add lines for the median survival
   x1 <- ifelse(is.na(summary(s)$table[,'median'][1]),"NA", summary(s)$table[,'median'][1])
   x2 <- as.numeric(summary(s)$table[,'median'][2])
   if(x1 != "NA" & x2 != "NA"){
       lines(c(0,x1),c(0.5,0.5),col="blue")
       lines(c(x1,x1),c(0,0.5),col="black")
       lines(c(x2,x2),c(0,0.5),col="red")
   }
# add legend
   legend(1800,0.995,legend=paste("p.value = ",pv[[1]],sep=""),bty="n",cex=1.4)
   legend(0.5,1.5,legend=paste("HR = ",HR,sep=""),bty="n",cex=1.4)
   legend(max(all_clin[common_samples,"new_death"],na.rm = T)*0.7,0.94,
      legend=c(paste("high=",high_num),paste("low=",low_num )),bty="n",cex=1.3,lwd=3,col=c("black","red"))
   dev.off()    
   
   sf_content <- paste("tumor",tumor,"gene",gene,"pv",pv,"HR",HR,"Hihg",high_num,"Low",low_num,sep=" ")
   print(sf_content)
   write.table(sf_content,"sf.survival",append=TRUE,quote=FALSE,col.names=FALSE,row.names=FALSE)
}    
}






https://wap.sciencenet.cn/blog-2609994-992077.html

上一篇:正则
下一篇:多变量计算不同水平的overlap
收藏 IP: 14.204.63.*| 热度|

0

该博文允许注册用户评论 请点击登录 评论 (0 个评论)

数据加载中...
扫一扫,分享此博文

Archiver|手机版|科学网 ( 京ICP备07017567号-12 )

GMT+8, 2024-5-20 16:40

Powered by ScienceNet.cn

Copyright © 2007- 中国科学报社

返回顶部