||
2014年学习R语言的时候,曾统计过LPSN网站上根瘤菌相关各属的物种数。到2021年,根瘤菌领域已增加了两个新属:Neorhizobium和Pararhizobium;网站已将Sinorhizobium属内的种数计为0,但Sinorhizobium这个属名仍有许多人在使用,故而仍将它原有的物种数统计在内。
今天重新统计一下各个属内的物种数目情况,并做图:
生成的图,不知道为什么属名不在一条线上?
当时学R语言时,用的还是比较笨拙的手法,写的代码不好看,也不简洁,但容易理解(参见这里,科学网—用R语言统计一下LPSN网站上的各属内的物种数 - 陈文峰的博文 (sciencenet.cn))。今天重新运行了一下之前的代码,发现不能用了;仔细看了一下,是网站的页面细节改了,正则表达式也要相应修改,因此重新写了一遍,如下。
代码复杂了些,重复的地方可以使用循环,以简化代码,但懒得去做了。
library(stringr)
# Count the species of the genus Sinorhizobium.
# Fetch the page as a character vector, one element per line of the response.
SinorhizobiumPage <- readLines("http://www.bacterio.net/sinorhizobium.html")
# Index of the line(s) carrying the total child-taxon label. The variable is
# named after this genus; it was previously stored under an Ensifer name,
# which mislabelled the value.
line_num_Sinorhizobium <- grep('Total number of child taxa:', SinorhizobiumPage)
# Matches the first run of digits on the selected line(s).
pat1 <- "([0-9]+)"
# The outer parentheses auto-print the result when run at top level.
(SinorhizobiumSpNum <- as.numeric(str_extract(SinorhizobiumPage[line_num_Sinorhizobium], pat1)))
# Count the species of the genus Ensifer.
# Fetch the page as a character vector, one element per line of the response.
EnsiferPage <- readLines("https://www.bacterio.net/genus/ensifer")
# Index of the line(s) containing the "validly published and correct name"
# count label.
line_num_Ensifer <- grep('Number of child taxa with a validly published and correct name: ', EnsiferPage)
# Auto-print the matched line index for inspection.
line_num_Ensifer
# Matches the first run of digits on the selected line(s).
pat2 <- "([0-9]+)"
# Index with line_num_Ensifer: the bare name `line_num` is never defined in
# this script, so using it here stops the run with "object not found".
EnsiferSpNum <- as.numeric(str_extract(EnsiferPage[line_num_Ensifer], pat2))
# Count the species of the genus Rhizobium.
# Download the page; readLines() yields one string per line.
RhizobiumPage <- readLines("http://www.bacterio.net/rhizobium.html")
# Positions of the line(s) that contain the count label.
line_num_Rhizobium <- which(grepl(
  'Number of child taxa with a validly published and correct name: ',
  RhizobiumPage
))
# Pattern for the first run of digits.
pat3 <- "([0-9]+)"
# Extract the digits from the matched line(s) and convert to numeric; the
# wrapping parentheses auto-print the value at top level.
(RhizobiumSpNum <- as.numeric(
  str_extract(RhizobiumPage[line_num_Rhizobium], pat3)
))
# Count the species of the genus Bradyrhizobium.
# Download the page; readLines() yields one string per line.
BradyrhizobiumPage <- readLines("http://www.bacterio.net/bradyrhizobium.html")
# Positions of the line(s) that contain the count label (auto-printed via the
# wrapping parentheses).
(line_num_Bradyrhizobium <- which(grepl(
  'Number of child taxa with a validly published and correct name: ',
  BradyrhizobiumPage
)))
# Pattern for the first run of digits.
pat4 <- "([0-9]+)"
# Extract the digits from the matched line(s), convert to numeric, and
# auto-print the result.
(BradyrhizobiumSpNum <- as.numeric(
  str_extract(BradyrhizobiumPage[line_num_Bradyrhizobium], pat4)
))
# Count the species of the genus Pararhizobium.
# Download the page; readLines() yields one string per line.
PararhizobiumPage <- readLines("https://www.bacterio.net/genus/pararhizobium")
# Positions of the line(s) that contain the count label (auto-printed via the
# wrapping parentheses).
(line_num_Pararhizobium <- which(grepl(
  'Number of child taxa with a validly published and correct name: ',
  PararhizobiumPage
)))
# Pattern for the first run of digits.
pat5 <- "([0-9]+)"
# Extract the digits from the matched line(s), convert to numeric, and
# auto-print the result.
(PararhizobiumSpNum <- as.numeric(
  str_extract(PararhizobiumPage[line_num_Pararhizobium], pat5)
))
# Count the species of the genus Neorhizobium.
# Download the page; readLines() yields one string per line.
NeorhizobiumPage <- readLines("https://www.bacterio.net/genus/neorhizobium")
# Positions of the line(s) that contain the count label (auto-printed via the
# wrapping parentheses).
(line_num_Neorhizobium <- which(grepl(
  'Number of child taxa with a validly published and correct name: ',
  NeorhizobiumPage
)))
# Pattern for the first run of digits.
pat6 <- "([0-9]+)"
# Extract the digits from the matched line(s), convert to numeric, and
# auto-print the result.
(NeorhizobiumSpNum <- as.numeric(
  str_extract(NeorhizobiumPage[line_num_Neorhizobium], pat6)
))
# Gather the six per-genus counts into a data frame, one column per genus.
rhizobiaNumbers <- data.frame(
  S = SinorhizobiumSpNum,
  E = EnsiferSpNum,
  R = RhizobiumSpNum,
  B = BradyrhizobiumSpNum,
  P = PararhizobiumSpNum,
  N = NeorhizobiumSpNum
)

# Plotmath expressions so the genus names are drawn in italics on the axis.
sino <- expression(italic(Sinorhizobium))
ensi <- expression(italic(Ensifer))
rhi <- expression(italic(Rhizobium))
brady <- expression(italic(Bradyrhizobium))
para <- expression(italic(Pararhizobium))
neo <- expression(italic(Neorhizobium))

# Transpose so that each genus becomes a row, then convert back to a data
# frame; the counts are read from its V1 column below.
t.rhizobia <- t(rhizobiaNumbers)
t1.rhizobia <- as.data.frame(t.rhizobia)

# Shrink text for the plot. Note this changes the graphics device state and
# is not restored afterwards.
par(cex = 0.8)

# Draw one bar per genus; barp keeps the value returned by barplot().
barp <- barplot(
  t1.rhizobia[["V1"]],
  names.arg = c(sino, ensi, rhi, brady, para, neo),
  xlab = "属",
  ylab = "种数",
  col = rainbow(6),
  main = "各属内根瘤菌的种数(截至到2021年12月1日)",
  ylim = c(0, 100)
)
# Horizontal baseline at y = 0.
abline(h = 0)
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-12-22 09:09
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社