||
最近从基于高通量测序的16S分析转换到 环境样品的 Metagenome 分析时,需要统计组装后contig的GC含量和N50。于是花了点时间写了个小脚本,方便今后数据处理工作。
#GC_N50.py
print 'Python and Biopython needed for running this script!'
print "Script for calculating N50 of assembly"
fasta = raw_input('Enter your fasta/fa name: ')
# N50 calculation
BaseSum,Length= 0,[]
ValueSum,N50 = 0,0
no_c,no_g,no_a,no_t,no_n = 0,0,0,0,0
from Bio import SeqIO
for record in SeqIO.parse(open(fasta), "fasta"):
BaseSum += len(record.seq)
Length.append(len(record.seq))
seq =record.seq.lower()
no_c+=seq.count('c')
no_g+=seq.count('g')
no_a+=seq.count('a')
no_t+=seq.count('t')
no_n+=seq.count('n')
#N50 calcuation
N50_pos = BaseSum / 2.0
Length.sort()
Length.reverse()
for value in Length:
ValueSum += value
if N50_pos <= ValueSum:
N50 = value
break
print 'Sequences NO.:'+'t'+str(len(Length))
print 'Sequences Min.:'+'t'+str(min(Length))
print 'Sequences Max.:'+'t'+str(max(Length))
print 'N50: ' + str(N50)
#GC calcuation
print "Total number of bases:", BaseSum
print "percentage of A in given sequences","%.1f"%((float(no_a*100))/BaseSum),'%'
print "percentage of T in given sequences","%.1f"%((float(no_t*100))/BaseSum),'%'
print "percentage of C in given sequences","%.1f"%((float(no_c*100))/BaseSum),'%'
print "percentage of G in given sequences","%.1f"%((float(no_g*100))/BaseSum),'%'
print "percentage of N in given sequences","%.1f"%((float(no_n*100))/BaseSum),'%'
# Program written by Feng Ju,HKU; tested in python and biopython 2.7.3
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-6-17 16:02
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社