tony8310的个人博客分享 http://blog.sciencenet.cn/u/tony8310

博文

R语言randomForest包的随机森林分类模型以及对重要变量的选择(修改)

已有 8982 次阅读 2021-11-4 09:34 |系统分类:科研笔记

R语言randomForest包的随机森林分类模型以及对重要变量的选择

https://cloud.tencent.com/developer/article/1870681

  感谢作者的详细指导,以及代码贡献。R代码的百度盘链接:


https://pan.baidu.com/s/10MWBfjBnYIzf6Cx2Zd9CjA

下载,按照R代码做了,但是在部分地方会出现错误。对此,我进行了部分修改。

----------------------------------------------------------------------------------

#读取 OTUs 丰度表
otu <- read.table('otu_table.txt', sep = '\t', row.names = 1, header = TRUE, fill = TRUE)

#过滤低丰度 OTUs 类群,它们对分类贡献度低,且影响计算效率
#120 个样本,就按 OTUs 丰度的行和不小于 120 为准吧
otu <- otu[which(rowSums(otu) >= 120), ]

#合并分组,得到能够被 randomForest 识别计算的格式
group <- read.table('group.txt', sep = '\t', row.names = 1, header = TRUE, fill = TRUE)
otu <- data.frame(t(otu))
otu_group <- cbind(otu, group)

#将总数据集分为训练集(占 70%)和测试集(占 30%)
set.seed(123)
select_train <- sample(120, 120*0.7)
otu_train <- otu_group[select_train, ]
otu_test <- otu_group[-select_train, ]

####################################################
#randomForest 包的随机森林
library(randomForest)
library(ggplot2)

#随机森林计算(默认生成 500 棵决策树),详情 ?randomForest
# 如果出现 Error in y - ymean : non-numeric argument to binary operator
# 用as. factor 改变变量类型
set.seed(123)
otu_train.forest <- randomForest(as.factor(otu_train$groups) ~ ., data = otu_train,
                                 importance = TRUE)
otu_train.forest
plot(randomForest::margin(otu_train.forest), main = '观测值被判断正确的概率图')
# 原来的做图代码如下:出现Error in unit(c(t, r, b, l), unit) :  'list' object cannot be coerced to type 'double'
# plot(margin(otu_train.forest, otu_train$groups), main = '观测值被判断正确的概率图')

#训练集自身测试
train_predict <- predict(otu_train.forest, otu_train)
compare_train <- table(train_predict, otu_train$groups)
compare_train
sum(diag(compare_train)/sum(compare_train))

#使用测试集评估
test_predict <- predict(otu_train.forest, otu_test)
compare_test <- table(otu_test$groups, test_predict, dnn = c('Actual', 'Predicted'))
compare_test

###关键 OTUs 识别
#查看表示每个变量(OTUs)重要性的得分
#summary(otu_train.forest)
importance_otu <- otu_train.forest$importance
head(importance_otu)

#或者使用函数 importance()
importance_otu <- data.frame(importance(otu_train.forest))
head(importance_otu)

#可以根据某种重要性的高低排个序,例如根据“Mean Decrease Accuracy”指标
importance_otu <- importance_otu[order(importance_otu$MeanDecreaseAccuracy, decreasing = TRUE), ]
head(importance_otu)

#输出表格
#write.table(importance_otu, 'importance_otu.txt', sep = '\t', col.names = NA, quote = FALSE)

#作图展示 top30 重要的 OTUs
varImpPlot(otu_train.forest, n.var = min(30, nrow(otu_train.forest$importance)), main = 'Top 30 - variable importance')

###交叉验证帮助选择特定数量的 OTUs
#5 次重复十折交叉验证
set.seed(123)
otu_train.cv <- replicate(5, rfcv(otu_train[-ncol(otu_train)], as.factor(otu_train$groups), cv.fold = 10,step = 1.5), simplify = FALSE)
otu_train.cv

#提取验证结果绘图
otu_train.cv <- data.frame(sapply(otu_train.cv, '[[', 'error.cv'))
otu_train.cv$otus <- rownames(otu_train.cv)
otu_train.cv <- reshape2::melt(otu_train.cv, id = 'otus')
otu_train.cv$otus <- as.numeric(as.character(otu_train.cv$otus))

#拟合线图
library(ggplot2)
library(splines)  #用于在 geom_smooth() 中添加拟合线,或者使用 geom_line() 替代 geom_smooth() 绘制普通折线

p <- ggplot(otu_train.cv, aes(otus, value)) +
    geom_smooth(se = FALSE,    method = 'glm', formula = y~ns(x, 6)) +
    theme(panel.grid = element_blank(), panel.background = element_rect(color = 'black', fill = 'transparent')) +  
    labs(title = '',x = 'Number of OTUs', y = 'Cross-validation error')

p

#大约提取前 30 个重要的 OTUs
p + geom_vline(xintercept = 30)

#根据 OTUs 重要性排序后选择,例如根据“Mean Decrease Accuracy”指标
importance_otu <- importance_otu[order(importance_otu$MeanDecreaseAccuracy, decreasing = TRUE), ]
head(importance_otu)

#输出表格
#write.table(importance_otu[1:30, ], 'importance_otu_top30.txt', sep = '\t', col.names = NA, quote = FALSE)

###简约分类器
#选择 top30 重要的 OTUs,例如上述已经根据“Mean Decrease Accuracy”排名获得
otu_select <- rownames(importance_otu)[1:30]

#数据子集的训练集和测试集
otu_train_top30 <- otu_train[ ,c(otu_select, 'groups')]
otu_test_top30 <- otu_test[ ,c(otu_select, 'groups')]

#随机森林计算(默认生成 500 棵决策树),详情 ?randomForest
set.seed(123)
otu_train.forest_30 <- randomForest(as.factor(otu_train_top30$groups) ~ ., data = otu_train_top30, importance = TRUE)
otu_train.forest_30
plot(randomForest::margin(otu_train.forest_30), main = '观测值被判断正确的概率图')
# 原来代码如下
# plot(margin(otu_train.forest_30, otu_test_top30$groups), main = '观测值被判断正确的概率图')

#训练集自身测试
train_predict <- predict(otu_train.forest_30, otu_train_top30)
compare_train <- table(train_predict, otu_train_top30$groups)
compare_train

#使用测试集评估
test_predict <- predict(otu_train.forest_30, otu_test_top30)
compare_test <- table(otu_test_top30$groups, test_predict, dnn = c('Actual', 'Predicted'))
compare_test

##NMDS 排序图中展示分类
#NMDS 降维
nmds <- vegan::metaMDS(otu, distance = 'bray')
result <- nmds$points
result <- as.data.frame(cbind(result, rownames(result)))

#获得上述的分类预测结果
predict_group <- c(train_predict, test_predict)
predict_group <- as.character(predict_group[rownames(result)])

#作图
colnames(result)[1:3] <- c('NMDS1', 'NMDS2', 'samples')
result$NMDS1 <- as.numeric(as.character(result$NMDS1))
result$NMDS2 <- as.numeric(as.character(result$NMDS2))
result$samples <- as.character(result$samples)
result <- cbind(result, predict_group)
head(result)

ggplot(result, aes(NMDS1, NMDS2, color = predict_group)) +  
    geom_polygon(data = plyr::ddply(result, 'predict_group', function(df) df[chull(df[[1]], df[[2]]), ]), fill = NA) +
    geom_point()

-------------------------------

版权归原作者,转载请注明出处。





https://wap.sciencenet.cn/blog-331295-1310844.html

上一篇:RStudio或R中miniconda tensorflow 环境的设置
下一篇:生态补水对白洋淀流域地表水和地下水水化学特征的影响
收藏 IP: 60.30.105.*| 热度|

0

该博文允许注册用户评论 请点击登录 评论 (0 个评论)

数据加载中...
扫一扫,分享此博文

Archiver|手机版|科学网 ( 京ICP备07017567号-12 )

GMT+8, 2024-5-5 23:27

Powered by ScienceNet.cn

Copyright © 2007- 中国科学报社

返回顶部