博文

生存分析案例

已有 4143 次阅读 2016-7-21 21:33 |个人分类:知识点专题|系统分类:科研笔记

#!/usr/bin/Rscript

library(survival)
library(limma)
file.create("survive.txt")
#files
tumors <- scan("tumor_list",what="")
for (tumor in tumors){
exp_file <- paste("/home/zhanghl/resource/raw_counts/",tumor,".uncv2.mRNAseq_raw_counts.txt.rpkm",sep="")

#read in files
rna=read.table(exp_file,header=TRUE,row.names=1,sep="t",stringsAsFactors=FALSE)

#get the index of the normal/tumor samples
t_index <- which(substr(colnames(rna),14,15) == "01")
n_index <- which(substr(colnames(rna),14,15) == "11")

#first I remove genes whose expression is == 0 in more than 50% of the samples:
rem <- function(x){
x <- as.matrix(x)
x <- t(apply(x,1,as.numeric))
r <- as.numeric(apply(x,1,function(i) sum(i == 0)))
remove <- which(r > dim(x)[2]*0.8)
return(remove)
}
remove <- rem(rna)
rna <- rna[-remove,]

#calculate z-scores :(value - mean normal)/SD normal
# scal <- function(x,y){
# mean_n <- rowMeans(y) # mean of normal
# sd_n <- apply(y,1,sd) # SD of normal
# res <- matrix(nrow=nrow(x), ncol=ncol(x))
# colnames(res) <- colnames(x)
# rownames(res) <- rownames(x)
# for(i in 1:dim(x)[1]){
# for(j in 1:dim(x)[2]){
# res[i,j] <- (x[i,j]-mean_n[i])/sd_n[i]
# }
# }
# return(res)
# }

z_rna <- rna[,t_index]
#set the rownames and colnames
colnames(z_rna) <- gsub("\.","-",substr(colnames(rna[,t_index]),1,12))
rm(rna)
print(paste(i,"-----------------finish first part",sep=""))

#######################################################################
# read in clinical data
clinical_file <- paste("/home/zhanghl/resource/raw_counts/",tumor,".merged_only_clinical_clin_format.txt",sep="")
clinical <- t(read.table(clinical_file,header=T, row.names=1, sep="t",fill=TRUE,quote = "",stringsAsFactors=FALSE))
clinical <- as.data.frame(clinical)
clinical$IDs <- toupper(clinical$patient.bcr_patient_barcode)
rownames(clinical) <- clinical$IDs
sum(clinical$IDs %in% colnames(z_rna))

#days_to_new_tumor_event_after_initial_treatment : isolate the max value
ind_keep <- grep("days_to_new_tumor_event_after_initial_treatment",colnames(clinical))
new_tum <- as.matrix(clinical[,ind_keep])
new_tum_collapsed <- c()
for (k in 1:nrow(new_tum)){
if(sum(is.na(new_tum[k,])) < ncol(new_tum)){
m <- max(new_tum[k,],na.rm=T)
new_tum_collapsed <- c(new_tum_collapsed,m)
} else {
new_tum_collapsed <- c(new_tum_collapsed,"NA")
}
}

#do the same to death: isolate the max value
ind_keep <- grep("days_to_death",colnames(clinical))
death <- as.matrix(clinical[,ind_keep])
death_collapsed <- c()
for (k in 1:nrow(death)){
if( sum(is.na(death[k,])) < ncol(death) ){
m <- max(death[k,],na.rm=T)
death_collapsed <- c(death_collapsed,m)
} else {
death_collapsed <- c(death_collapsed,"NA")
}
}

#days_to_last_followup: isolate the min value
ind_keep <- grep("days_to_last_followup",colnames(clinical))
fl <- as.matrix(clinical[,ind_keep])
fl_collapsed <- c()
for (k in 1:nrow(fl)){
if( sum(is.na(fl[k,])) < ncol(fl) ){
m <- min(fl[k,],na.rm=T)
fl_collapsed <- c(fl_collapsed,m)
} else {
fl_collapsed <- c(fl_collapsed,"NA")
}
}

#put everything together
all_clin <- data.frame(new_tum_collapsed,death_collapsed,fl_collapsed)
colnames(all_clin) <- c("new_tumor_days", "death_days", "followUp_days")
#now, to do survival analysis we need three main things:
#1- time: this is the time till an event happens
#2- status: this indicates which patients have to be kept for the analysis
#3- event: this tells i.e. which patients have the gene up- or down-regulated or have no changes in expression

#since we want to do censored analysis we need to have something to censor the data with.
#for example, if a patient has no death data BUT there is a date to last followup
#it means that after that day we know nothing about the patient, therefore after that day it
#cannot be used for calculations/Kaplan Meier plot anymore, therefore we censor it.
#so now we need to create vectors for both "time to new tumor" and 'time to death" that contain
#also the data from censored individuals

# create vector with time to new tumor containing data to censor for new_tumor
all_clin$new_time <- c()
for (i in 1:length(as.numeric(as.character(all_clin$new_tumor_days)))){
all_clin$new_time[i] <- ifelse(is.na(as.numeric(as.character(all_clin$new_tumor_days))[i]),
as.numeric(as.character(all_clin$followUp_days))[i],as.numeric(as.character(all_clin$new_tumor_days))[i])
}

# create vector time to death containing values to censor for death
all_clin$new_death <- c()
for (i in 1:length(as.numeric(as.character(all_clin$death_days)))){
all_clin$new_death[i] <- ifelse(is.na(as.numeric(as.character(all_clin$death_days))[i]),
as.numeric(as.character(all_clin$followUp_days))[i],as.numeric(as.character(all_clin$death_days))[i])
}

#now we need to create the vector for censoring the data which means telling R which patients are dead or have new tumor. in this case
#if a patient has a “days_to_death” it will be assigned 1, and used in the corresponding analysis. the reason why we
#censor with death events even for recurrence is pretty important. a colleague made me notice that this is a competitive
#risk problem, where, although a patient can recur and then die, if a patient is dead, it will not recur, therefore is
#more accurate to censor for death events.

#create vector for death censoring
all_clin$death_event <- ifelse(clinical$patient.vital_status == "alive", 0,1)

#finally add row.names to clinical
rownames(all_clin) <- clinical$IDs

#create event vector for RNASeq data
#event_rna <- t(apply(z_rna,1,function(i) ifelse(i>1.96,1,ifelse(i< -1.96,0,2)))) #up and down 1:>1.96;0: <-1.96; 2: >-1.96 and <1.96
#event_rna <- t(apply(z_rna, 1, function(x) ifelse(abs(x) > 1.96,1,0))) # altered and not altered
event_rna <- matrix(ncol=ncol(z_rna),nrow=nrow(z_rna))
event_rna <- as.data.frame(event_rna)
colnames(event_rna) <- colnames(z_rna)
rownames(event_rna) <- rownames(z_rna)
med <- apply(z_rna,1,median,na.rm=TRUE)
for (m in 1:nrow(z_rna)){
for (n in 1:ncol(z_rna)){
event_rna[m,n] <- ifelse(z_rna[m,n] > med[m],1,0)
}
}

#pick your gene
gene_list <- scan("survival_gene_list",what="")
genes <- intersect(rownames(event_rna),gene_list)
print(paste("intersect genes",intersect(genes,rownames(event_rna))))
common_samples <- intersect( colnames(event_rna),rownames(all_clin) )

#Calculate each gene
for (gene in genes){
factor <- factor(as.numeric(event_rna[gene,common_samples]))
time <- all_clin[common_samples,"new_death"]
event <- all_clin[common_samples,"death_event"]
#survdiff
s <- survfit(Surv(time,event)~factor)
s1 <- tryCatch(survdiff(Surv(time,event)~factor), error = function(e) return(NA))
pv <- ifelse(is.na(s1),next,(round(1 - pchisq(s1$chisq, length(s1$n) - 1),3)))[[1]]
#HR
HR <- (s1$obs[2]/s1$exp[2])/(s1$obs[1]/s1$exp[1])
up95 <- exp( log(HR)+ qnorm(0.975)*sqrt(1/s1$exp[2] + 1/s1$exp[1]) )
low95 <- exp( log(HR)- qnorm(0.975)*sqrt(1/s1$exp[2] + 1/s1$exp[1]) )
#high_num and low_num
high_num <- table(as.matrix(event_rna[gene,]))[[2]]
low_num <- table(as.matrix(event_rna[gene,]))[[1]]
#plot data
png(paste(tumor,gene,".survive.png",sep=""))
plot(s,col=c(1:3), frame=F, lwd=2, main=paste(tumor,gene,sep=":"))
# add lines for the median survival
x1 <- ifelse(is.na(summary(s)$table[,'median'][1]),"NA", summary(s)$table[,'median'][1])
x2 <- as.numeric(summary(s)$table[,'median'][2])
if(x1 != "NA" & x2 != "NA"){
lines(c(0,x1),c(0.5,0.5),col="blue")
lines(c(x1,x1),c(0,0.5),col="black")
lines(c(x2,x2),c(0,0.5),col="red")
}
# add legend
legend(1800,0.995,legend=paste("p.value = ",pv[[1]],sep=""),bty="n",cex=1.4)
legend(0.5,1.5,legend=paste("HR = ",HR,sep=""),bty="n",cex=1.4)
legend(max(all_clin[common_samples,"new_death"],na.rm = T)*0.7,0.94,
legend=c(paste("high=",high_num),paste("low=",low_num )),bty="n",cex=1.3,lwd=3,col=c("black","red"))
dev.off()

sf_content <- paste("tumor",tumor,"gene",gene,"pv",pv,"HR",HR,"Hihg",high_num,"Low",low_num,sep=" ")
print(sf_content)
write.table(sf_content,"sf.survival",append=TRUE,quote=FALSE,col.names=FALSE,row.names=FALSE)
}
}

转载本文请联系原作者获取授权，同时请注明本文来自张洪磊科学网博客。
链接地址：https://wap.sciencenet.cn/blog-2609994-992077.html

上一篇：正则
下一篇：多变量计算不同水平的overlap

收藏 IP: 14.204.63.*| 热度|

当前推荐数：0

该博文允许注册用户评论请点击登录评论 (0 个评论)

数据加载中...

返回顶部

张洪磊

扫一扫，分享此博文

Kevin2015的个人博客分享 http://blog.sciencenet.cn/u/Kevin2015

博文

生存分析案例

当前推荐数：0

该博文允许注册用户评论请点击登录评论 (0 个评论)

张洪磊

全部作者的其他最新博文

全部精选博文导读

相关博文

Kevin2015的个人博客分享 http://blog.sciencenet.cn/u/Kevin2015

博文

生存分析案例

当前推荐数：0

该博文允许注册用户评论 请点击登录 评论 (0 个评论)

张洪磊

全部作者的其他最新博文

全部精选博文导读

相关博文

该博文允许注册用户评论请点击登录评论 (0 个评论)