R语言复杂网络 r语言wgcna

转载

AI智行者 2023-09-18 15:58:04

文章标签 R语言复杂网络 r语言 html 数据 Soft 文章分类 R语言后端开发

R包WGCNA---转录组WGCNA共表达网络构建（无表型信息）

1. 下载R包WGCNA
2. 运行步骤

2.1参数筛选和模块计算
2.2 全部基因所属模块信息输出
2.3 计算KME值并输出筛选基因结果
2.4 导出Cytoscape格式网络数据
2.5 从TOM矩阵中提取固定基因集的Cytoscape数据

参考

最近有一个需求，需要使用多个分组的RNA-seq数据（包含CK在内共30个处理）进行共表达网络构建。将网上搜索的资料整理如下，作为记录。

1. 下载R包WGCNA

目前，共表达网络构建通常使用WGCNA这个R包，下载的时候按照官方文档即可。
下载：
当然也可以选择当前版本的R下继续安装，这就需要手动安装很多R包，最终完成安装，官网中也有相关的方法介绍。如果使用的Mac OS X系统，有时安装失败，也要重新安装才行【1】。

为了方便，我选择conda新建R4.1环境，然后下载最新的R版本R4.1.2，再完成安装。

## R version 4.1.2 (2021-11-01) -- "Bird Hippie"
install.packages("BiocManager")
BiocManager::install("WGCNA")

2. 运行步骤

2.1参数筛选和模块计算

参看【2】的链接，我们可以找到其中相关的教程方法，并且详细描述了相关的代码和图片结果。
使用下面的代码得到样本聚类图、软阈值筛选图、模块聚类图和模块相关性图。并保存网络计算结果net.wgcna.RData以及TOM矩阵结果。

rm(list = ls()) ## clear variable space
load("04.stages_mean_30column_exp_stat.RData")

## load packages
require(WGCNA) ## co-expression package
require(reshape2)
require(stringr)
#require(data.table) ## fast reading files
#options(datatable.fread.datatable=FALSE)
options(stringsAsFactors = FALSE)

## open multiple threads
enableWGCNAThreads()


## set some params
type = "signed" # "unsigned"
corType = "pearson"
corFnc = ifelse(corType=="pearson", cor, bicor)
maxPOutliers = ifelse(corType=="pearson",1,0.05)
robustY = ifelse(corType=="pearson",T,F)

load("04.stages_mean_30column_exp_stat.RData")
#total_stages_exp


dataE=t(total_stages_exp)
print(dim(dataE))
print(paste0("Expression data is OK"))

## plot tree 
sampleTree = hclust(dist(dataE), method = "ward.D") # average
pdf(file="Fig01.hclust.tree.pdf")
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")
dev.off()
nGenes = ncol(dataE)
nSamples = nrow(dataE)

# cal soft value and plot tree
powers = c(c(1:10), seq(from = 12, to=30, by=2))
sft = pickSoftThreshold(dataE, powerVector=powers, networkType=type, verbose=5)

pdf(file="Fig02.picksoft_threshold.pdf")
par(mfrow = c(1,2))
cex1 = 0.9
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
               xlab="Soft Threshold (power)",
		           ylab="Scale Free Topology Model Fit,signed R^2",type="n",
		           main = paste("Scale independence"))
               text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
               labels=powers,cex=cex1,col="red")
               ## select standards: R-square=0.85
               abline(h=0.65,col="red")

## Soft threshold与平均连通性
plot(sft$fitIndices[,1], sft$fitIndices[,5],
               xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
 		           main = paste("Mean connectivity"))
               text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, 
               cex=cex1, col="red")
dev.off()
print(paste0("softthreshold is ",sft$powerEstimate))

### calculate net 
power = 14 # sft$powerEstimate estimation value
net = blockwiseModules(dataE, power = power,
		                maxBlockSize = 100000, TOMType = type, 
				            minModuleSize = 30,reassignThreshold = 0, 
				            mergeCutHeight = 0.27,numericLabels = TRUE, 
					          pamRespectsDendro = FALSE,saveTOMs=TRUE, 
					          corType = corType, maxPOutliers=maxPOutliers,
					          loadTOM =TRUE, saveTOMFileBase = "dataExpTOM", 
						        verbose = 3)

table(net$colors) # module number 


pdf(file="Fig03.cluster_module.color.pdf")
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
unique(moduleColors)

# plot clustered modules and tree
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
		                  "Module colors",
				                 dendroLabels = FALSE, hang = 0.03,
				                 addGuide = TRUE, guideHang = 0.05)
dev.off()

# plot module correlation heatmap
pdf(file="Fig04.cluster_heatmap.color.pdf")
MEs = net$MEs
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)
plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap", 
		                  marDendro = c(3,3,2,4),
				              marHeatmap = c(3,4,2,2), plotDendrograms = T, 
				              xLabelsAngle = 90)
dev.off()

save(net,file="net.wgcna.RData")

TOM矩阵热图绘制，有时矩阵太大，容易运行时间长，或者不出结果。

TOM = TOMsimilarityFromExpr(dataE, power=14,corType=corType, networkType=type)
## 使用load方法也可以
#load("net.wgcna.RData")
#load(net$TOMFiles[1], verbose=T)

## Loading objects:
##   TOM

TOM <- as.matrix(TOM)
dissTOM = 1-TOM

# Transform dissTOM with a power to make moderately strong
# connections more visible in the heatmap
plotTOM = dissTOM^7
# Set diagonal to NA for a nicer plot
diag(plotTOM) = NA
# Call the plot function
pdf(file="TOM.pdf")
TOMplot(plotTOM, net$dendrograms, moduleColors,
        main = "Network heatmap plot, all genes")

dev.off()

2.2 全部基因所属模块信息输出

load("net.wgcna.RData")
test=data.frame(color=net$colors)
test$gene=rownames(test)
write.table(test,file="f01.total_module_gene.record.txt",sep="\t",row.names=F,col.names=T,quote=F)

## 文件内容如下，第一列为基因名，第二列为所属模块的数字编号
gene	color
Nitab4.5_0000471g0100	0
Nitab4.5_0005432g0020	2
Nitab4.5_0000002g0180	2
Nitab4.5_0002510g0040	5

2.3 计算KME值并输出筛选基因结果

## 计算KME结果
require(WGCNA)
load("04.stages_mean_30column_exp_stat.RData")
datKME=signedKME(dataE, MEs, outputColumnName="kME_MM.")
write.table(datKME, file="kME_MM.txt",quote=F,sep="\t")

## 筛选模块内基因
FilterGenes = abs(datKME$kME_MM.3) > 0.8
table(FilterGenes)
hubgene_lightyellow <- dimnames(data.frame(dataE))[[2]][FilterGenes]
hubset1=data.frame(gene=hubgene_lightyellow)
write.table(hubset1,file="f06.hubset1.gene.txt",quote=F,sep="\t",row.names=F,col.names=T)

## 也可以每个模块只筛选一个Hub gene
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
HubGenes <- chooseTopHubInEachModule(dataE, moduleColors)

2.4 导出Cytoscape格式网络数据

## 将上面KME筛选的f06.hubset1.gene.txt基因与我们RNAseq分析得到的候选基因取交集，得到f09.brown.module.hub.gene.txt
## 使用这部分数据绘制网络
modules = c("brown")
inModule = is.finite(match(moduleColors, modules))
modProbes = probes[inModule]
dataE=as.data.frame(dataE)
probes = names(dataE)
modProbes = probes[inModule]
modGenes=read.table("f09.brown.module.hub.gene.txt")$V1
modTOM = TOM[inModule, inModule]
dimnames(modTOM) = list(modProbes, modProbes)

## 导出固定模块内的网络数据
cyt = exportNetworkToCytoscape(modTOM,edgeFile = paste("CytoscapeInput-edges-", 
paste(modules, collapse="-"), ".txt", sep=""), 
nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),
 weighted = TRUE,threshold = 0.1,
 nodeNames = modProbes,altNodeNames = modGenes,
 nodeAttr = moduleColors[inModule])

2.5 从TOM矩阵中提取固定基因集的Cytoscape数据

后面和人交流发现，可以依据TOM矩阵，提取给定基因的weight值，没必要拘束在一个模块内，以此构建网络。

load("dataExpTOM-block.1.RData")
load("04.stages_mean_30column_exp_stat.RData")
require(WGCNA)

target=read.table("f14.extract_68_known_181gene.txt",header=F,stringsAsFactors=F)
total=rownames(total_stages_exp)
idx=which(total %in% target$V1)
tom_res <- TOM[idx,idx]

rownames(TOM)=total
colnames(TOM)=total
TOM[1:5,1:5]
tom_res <- TOM[total[idx],total[idx]]

## 输出TOM矩阵中选定基因集合的网络weight信息
cyt = exportNetworkToCytoscape(tom_res,
edgeFile = paste("CytoscapeInput-edges-249gene",  ".txt", sep=""),
nodeFile = paste("CytoscapeInput-nodes-249gene", ".txt", sep=""),
weighted = TRUE,threshold = 0.1)

## 输出TOM矩阵中全部的网络weight信息
modTOM2=TOM
cyt = exportNetworkToCytoscape(modTOM2,edgeFile = paste("CytoscapeInput-edges-total0.1",  ".txt", sep=""), nodeFile = paste("CytoscapeInput-nodes-tom_total0.1", ".txt", sep=""),weighted = TRUE,threshold = 0.1)
cyt = exportNetworkToCytoscape(modTOM2,edgeFile = paste("CytoscapeInput-edges-total0.3",  ".txt", sep=""), nodeFile = paste("CytoscapeInput-nodes-tom_total0.3", ".txt", sep=""),weighted = TRUE,threshold = 0.3)
cyt = exportNetworkToCytoscape(modTOM2,edgeFile = paste("CytoscapeInput-edges-tom_total0.4",  ".txt", sep=""), nodeFile = paste("CytoscapeInput-nodes-tom_total0.4", ".txt", sep=""),weighted = TRUE,threshold = 0.4)