clusterProfiler富集分析

2023-05-14

clusterProfiler 做 GO 和 KEGG 的富集分析

软件版本

R版本：4.1.0
R包版本：clusterProfiler 4.0.5

R包安装

如果你的R版本比较新，可以先试试直接安装clusterProfiler。
当前在用的服务器上的R版本是3.6.0，算比较旧的，装R包经常报错。
然后虽然Bioconductor的clusterProfiler写着依赖R >= 3.5.0，但是很多依赖包都会要求更高的R版本，我都懒得逐个包查哪个版本适配了，所以直接搭个最新版本的R的docker容器算了。

从docker registry获取r-base作为基础镜像
docker pull r-base
如需指定4.1.0版本则用docker pull r-base:4.1.0

搭建clusterProfiler的容器

容器搭建

# Shell

#!/bin/sh
local_path=/xxx/clusterProfiler
# 需要--privileged参数，否则后面安装clusterProfiler包会报错
docker run -tid --privileged --name clusterProfiler_Test --restart=always -v $local_path:/Test r-base:4.1.0 /bin/bash

进入容器

1
2
3

# 命令行

docker exec -ti clusterProfiler_Test /bin/bash

容器内安装clusterProfiler

# 命令行，安装依赖
apt-get update
apt install curl libcurl4-openssl-dev libssl-dev libxml2-dev

# 命令行，进入R
R

# 进入R后，安装包
# org.Hs.eg.db是人的基因注释数据库；如果是别的物种，这里下载和后面调用的数据库名称，以及物种名称是hsa的，都要换成对应物种的
# OrgDb的19个物种数据库：http://bioconductor.org/packages/release/BiocViews.html#___OrgDb
# pathview、enrichplot、ggplot2都是用来画图的
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("clusterProfiler")
BiocManager::install("org.Hs.eg.db")
BiocManager::install("pathview")
BiocManager::install("enrichplot")
BiocManager::install("ggplot2")

GO、KEGG富集分析
1
见后面的[分析脚本]部分
退出容器
1
2
# 键盘操作
# Ctrl + p + q
容器的后续处理

搭建容器时用了–restart=always，容器会保持启动状态，随时可以进入容器分析

如果暂时不需要用了，可以先将容器导出成镜像文件，再删除容器

# 命令行，导出镜像文件
docker export clusterProfiler_Test -o clusterProfiler.tar

# 命令行，停止和删除容器
docker stop clusterProfiler_Test
docker rm clusterProfiler_Test

分析脚本

脚本参考clusterProfiler官方文档中Part II: Enrichment analysis的6 GO enrichment analysis和7 KEGG enrichment analysis。
Over Representation Analysis (ORA, 过表达分析)和Gene Set Enrichment Analysis (GSEA, 基因富集分析)两种分析方法，分别写了两个脚本。
GO和KEGG都在同一个脚本里，有哪部分不需要的话，直接注释就行。

Shell调用Rscript

# ORA
Rscript clusterProfiler_ORA.R ORA.list ORA ORA_Result

# GSEA
Rscript clusterProfiler_GSEA.R GSEA.list GSEA GSEA_Result

ORA脚本

clusterProfiler_ORA.R

library("clusterProfiler")
library("org.Hs.eg.db")
library("pathview")
library("ggplot2")

args <- commandArgs(T)
# 输入文件，每行1个基因名称（Gene Symbol）
gene_list = args[1]
# 样本名称，会作为输出文件前缀
sample = args[2]
# 输出目录路径
out_dir = args[3]

GO_output_dir = paste(out_dir, "/GO", sep="", collapse="")
KEGG_output_dir = paste(out_dir, "/KEGG", sep="", collapse="")
if (! file.exists(out_dir)){dir.create(out_dir)}
if (! file.exists(GO_output_dir)){dir.create(GO_output_dir)}
if (! file.exists(KEGG_output_dir)){dir.create(KEGG_output_dir)}

data = read.table(gene_list, head=F, sep="\t", comment.char="#", colClasses="character")
data = as.vector(data[,1])

p_cutoff = 0.05
q_cutoff = 0.05

# GO 
ontology <- list("MF","CC","BP")
for (item in ontology){
	output_file = paste(GO_output_dir, "/", sample, ".", item, ".xls", sep="", collapse="")
	output_pic = paste(GO_output_dir, "/", sample, ".", item, ".png", sep="", collapse="")
	# GO enrichment result
	# pvalueCutoff is adjusted pvalue cutoff
	ego <- enrichGO(gene = data, keyType = "SYMBOL", OrgDb = org.Hs.eg.db, ont = item, pAdjustMethod = "BH", pvalueCutoff = p_cutoff, qvalueCutoff = q_cutoff)
	if ( (nrow(subset(ego@result, p.adjust<=p_cutoff))==0) | (nrow(subset(ego@result, qvalue<=q_cutoff))==0) ){
		warnning_msg = paste("Warning: ", item, "'s enrichGO() has NO result after pvalue.adjust and qvalue cutoff! Won't output result of ", item, "!", sep="", collapse="")
		write(warnning_msg, stderr())
		next
	}
	write.table(ego, file=output_file, quote=FALSE, col.names=TRUE, row.names=FALSE, sep="\t")
	# GO directed acyclic graph
	picture <- goplot(ego, showCategory=10, color="p.adjust", layout="sugiyama", geom="text")
	ggsave(file=output_pic, width=250, height=500, unit="mm", dpi=300)
	# Bar plot
	output_pic = paste(GO_output_dir, "/", sample, ".", item, ".bar.png", sep="", collapse="")
	picture <- barplot(ego, showCategory=10)
	ggsave(file=output_pic, width=150, height=250, unit="mm", dpi=300)
}

# KEGG
output_file = paste(KEGG_output_dir, "/", sample, ".KEGG.xls", sep="", collapse="")
# Translate Gene Symbol to ENTREZ ID
SYMBOL_ENTREZID <- bitr(data, fromType="SYMBOL", toType="ENTREZID", OrgDb=org.Hs.eg.db)
if (nrow(SYMBOL_ENTREZID) == 0){
	error_msg = "Error: All gene symbol in input file can't map to Entrezid ID! Stop analyysis!"
	write(error_msg, stderr())
	q()
}
data_ENTREZID = as.vector(SYMBOL_ENTREZID[,2])
# KEGG enrichment result
# pvalueCutoff is adjusted pvalue cutoff
kk <- enrichKEGG(gene=data_ENTREZID, organism="hsa", keyType="kegg", pAdjustMethod="BH", pvalueCutoff=p_cutoff, qvalueCutoff=q_cutoff)
if ( (nrow(subset(kk@result, p.adjust<=p_cutoff))==0) | (nrow(subset(kk@result, qvalue<=q_cutoff))==0) ){
	warnning_msg = "Warning: KEGG's enrichKEGG() has NO result after pvalue.adjust and qvalue cutoff! Won't output result of KEGG!"
	write(warnning_msg, stderr())
	q()
}
pass_path_id = subset(kk@result, p.adjust<=p_cutoff & qvalue<=q_cutoff, select=ID)
for (path_id in pass_path_id$ID){
	ENTREZID_Gene <- kk@result$geneID[which(kk@result==path_id)]
	ENTREZID_Gene_List <- strsplit(ENTREZID_Gene, split="/")
	ENTREZID_Gene_List <- unlist(ENTREZID_Gene_List)
	Output_Symbol_List <- c()
	n=1
	for (id in ENTREZID_Gene_List){
		Output_Symbol_List[n] <- SYMBOL_ENTREZID$SYMBOL[which(SYMBOL_ENTREZID$ENTREZID==id)]
		n <- n+1
	}
	Output_Symbol <- paste(Output_Symbol_List, sep="", collapse="/")
	kk@result$geneID[which(kk@result==path_id)] <- Output_Symbol
}
write.table(kk, file=output_file, quote=FALSE, col.names=TRUE, row.names=FALSE, sep="\t")
# Dot plot
output_pic = paste(KEGG_output_dir, "/", sample, ".KEGG.png", sep="", collapse="")
picture <- dotplot(kk, showCategory=20)
ggsave(file=output_pic, width=150, height=250, unit="mm", dpi=300)
# KEGG pathway view
for (path_id in pass_path_id$ID){
	output_pic = paste(KEGG_output_dir, "/", sample, ".", path_id, ".png", sep="", collapse="")
	pathview(gene.data=data_ENTREZID, pathway.id=path_id, species="hsa", min.nnodes=3)
	rm_command = paste("rm -f ./", path_id, ".png ", "./", path_id, ".xml", sep="", collapse="")
	mv_command = paste("mv ./", path_id, ".pathview.png ", output_pic, sep="", collapse="")
	system(rm_command)
	system(mv_command)
}

ORA.list

AP5Z1
ZNF592
FOXRED1
NUBPL
HFE
WDR35
ABHD12
ZNF513

ORA_Result

ORA_Result
├── GO
│   ├── ORA.BP.bar.png
│   ├── ORA.BP.png
│   ├── ORA.BP.xls
│   ├── ORA.CC.bar.png
│   ├── ORA.CC.png
│   ├── ORA.CC.xls
│   ├── ORA.MF.bar.png
│   ├── ORA.MF.png
│   └── ORA.MF.xls
└── KEGG
    ├── ORA.hsa00010.png
    ├── ORA.hsa00531.png
    ├── ORA.hsa01230.png
    ├── ORA.hsa04142.png
    ├── ORA.hsa04610.png
    ├── ORA.hsa04977.png
    ├── ORA.KEGG.png
    └── ORA.KEGG.xls

GO
- ORA.BP.bar.png
- ORA.BP.png
- ORA.BP.xls
KEGG
- ORA.hsa00531.png
- ORA.KEGG.png
- ORA.KEGG.xls

GSEA脚本

clusterProfiler_GSEA.R

library("clusterProfiler")
library("org.Hs.eg.db")
library("enrichplot")
library("ggplot2")

args <- commandArgs(T)
# 输入文件，每行两列，第一列是基因名称（Gene Symbol），第二列是Fold Change
gene_list = args[1]
# 样本名称，会作为输出文件前缀
sample = args[2]
# 输出目录路径
out_dir = args[3]

GO_output_dir = paste(out_dir, "/GO", sep="", collapse="")
KEGG_output_dir = paste(out_dir, "/KEGG", sep="", collapse="")
if (! file.exists(out_dir)){dir.create(out_dir)}
if (! file.exists(GO_output_dir)){dir.create(GO_output_dir)}
if (! file.exists(KEGG_output_dir)){dir.create(KEGG_output_dir)}

# 1st column is Symmbol, 2nd column is FC
data = read.table(gene_list, head=F, sep="\t", comment.char="#", colClasses="character")
geneList = as.numeric(data[,2])
names(geneList) = as.character(data[,1])
geneList = sort(geneList,decreasing=TRUE)

# Check if any duplicate gene
if (length(names(geneList[duplicated(names(geneList))])) > 0){
	error_msg = "Error: There are duplicate gene in input file! Stop analyysis!"
	write(error_msg, stderr())
	q()
}

p_cutoff = 0.05
q_cutoff = 0.05

# GO
ontology <- list("MF","CC","BP")
for (item in ontology){
	output_file = paste(GO_output_dir, "/", sample, ".", item, ".xls", sep="", collapse="")
	# GO GSEA result
	# pvalueCutoff is adjusted pvalue cutoff
	ego <- gseGO(gene=geneList, keyType="SYMBOL", OrgDb=org.Hs.eg.db, ont=item, pAdjustMethod="BH", pvalueCutoff=p_cutoff)
	if (nrow(ego)==0){
		wornning_msg = paste("Warning: ", item, "'s gseGO() has NO result after pvalue.adjust cutoff! Won't output result of ", item, "!", sep="", collapse="")
		write(wornning_msg, stderr())
		next
	}
	write.table(ego, file=output_file, quote=FALSE, col.names=TRUE, row.names=FALSE, sep="\t")
	# GO GSEA Plot
	for ( n in (1:length(ego$ID)) ){
		GO_id <- gsub(":", "_", ego$ID[n])
		output_pic = paste(GO_output_dir, "/", sample, ".", item, ".", GO_id, ".png", sep="", collapse="")
		picture <- gseaplot2(ego, geneSetID=n, pvalue_table=TRUE, title=ego$Description[n])
		#picture <- gseaplot2(ego, geneSetID=n, title=ego$Description[n])
		ggsave(file=output_pic, width=260, height=180, unit="mm", dpi=300)
	}
}

# KEGG
output_file = paste(KEGG_output_dir, "/", sample, ".KEGG.xls", sep="", collapse="")
# Translate Gene Symbol to ENTREZ ID
SYMBOL_ENTREZID <- bitr(names(geneList), fromType="SYMBOL", toType="ENTREZID", OrgDb=org.Hs.eg.db)
if (nrow(SYMBOL_ENTREZID)==0){
        error_msg = "Error: All gene symbol in input file can't map to Entrezid ID! Stop analyysis!"
        write(error_msg, stderr())
        q()
}
names(geneList) = as.character(SYMBOL_ENTREZID[,2])
# Some gene without ENTREZID will become duplicated name <NA>, remove them
geneList <- geneList[!duplicated(names(geneList))]
# KEGG GSEA result
# pvalueCutoff is adjusted pvalue cutoff
kk <- gseKEGG(gene=geneList, organism="hsa", keyType="kegg", pAdjustMethod="BH", pvalueCutoff=p_cutoff)
if (nrow(kk)==0){
	wornning_msg = "Warning: KEGG's gseKEGG() has NO result after pvalue.adjust cutoff! Won't output result of KEGG!"
	write(wornning_msg, stderr())
	q()
}
for ( n in (1:length(kk$ID)) ){
	# KEGG GSEA Plot
	output_pic = paste(KEGG_output_dir, "/", sample, ".", kk$ID[n], ".png", sep="", collapse="")
	picture <- gseaplot2(kk, geneSetID=n, pvalue_table=TRUE, title=kk$Description[n])
	#picture <- gseaplot2(kk, geneSetID=n, title=kk$Description[n])
	ggsave(file=output_pic, width=260, height=180, unit="mm", dpi=300)
	# ENTREZ ID to Gene Symbol
	ENTREZID_Gene <- kk$core_enrichment[n]
	ENTREZID_Gene_List <- strsplit(ENTREZID_Gene, split="/")
	ENTREZID_Gene_List <- unlist(ENTREZID_Gene_List)
	Output_Symbol_List <- c()
	i=1
	for (id in ENTREZID_Gene_List){
		Output_Symbol_List[i] <- SYMBOL_ENTREZID$SYMBOL[which(SYMBOL_ENTREZID$ENTREZID==id)]
		i <- i + 1
	}
	Output_Symbol <- paste(Output_Symbol_List, sep="", collapse="/")
	kk@result$core_enrichment[n] <- Output_Symbol
}
write.table(kk, file=output_file, quote=FALSE, col.names=TRUE, row.names=FALSE, sep="\t")

GSEA.list

S100A11	0.695631395
RBPMS2	0.540877695
KRTCAP3	0.558604609
SELE	0.521754428
RNF39	2.770884868
C15orf39	1.372097355

GSEA_Result

GSEA_Result
├── GO
│   ├── GSEA.BP.GO_0000003.png
│   ├── GSEA.BP.GO_1901990.png
│   ├── GSEA.BP.xls
│   ├── GSEA.CC.GO_1903046.png
│   ├── GSEA.CC.GO_1903047.png
│   ├── GSEA.CC.GO_2000026.png
│   ├── GSEA.CC.xls
│   ├── GSEA.MF.GO_0000775.png
│   ├── GSEA.MF.GO_0005694.png
│   ├── GSEA.MF.GO_1902494.png
│   └── GSEA.MF.xls
└── KEGG
    ├── GSEA.hsa04068.png
    ├── GSEA.hsa04141.png
    ├── GSEA.hsa05170.png
    └── GSEA.KEGG.xls