About the dataset: The dataset analyzed here is a subset of data from a previous study done by Srivastava et al. (cited below) which examines the expression profile of genes in kidneys from 12 American black bears (Ursus americanus) in Maine using RNA-Seq (6 Fall bears, 6 Spring bears). This was to aid in identifying genes for targeted therapies in humans who suffer from Chronic Kidney Disease (CKD). The purpose of this particular analysis was to detect if genes that were differentially expressed in bear kidneys after hibernation (spring) were related to cardiac pathways, particularly those affecting blood pressure. A total of 10 bear samples were used, 5 from fall and 5 from spring. Raw FastQ files were downloaded from NCBI’s Sequence Read Archive under the accession number SRP075217 (specific sample names in supplement labeled bear.txt). These files were used as input for HISAT2 alignment, from which the resulting .bam files were run through HTSeq to create counts files. These counts files are the starting input for this document and the resulting counts matrix will be utilized to create visualizations.
Please read this through first, and run different sections separately. Editing of files externally is required.
Citation: Srivastava, A., Kumar Sarsani, V., Fiddes, I., Sheehan, S. M., Seger, R. L., Barter, M. E., Neptune-Bear, S., Lindqvist, C., & Korstanje, R. (2019). Genome assembly and gene expression in the American black bear provides new insights into the renal response to hibernation. DNA research : an international journal for rapid publication of reports on genes and genomes, 26(1), 37–44. https://doi.org/10.1093/dnares/dsy036
Supplemental Files in Folder: Kmiecik_Bear_Supplemental
Note: some packages require BiocManager to be installed.
library(knitr)
library(DESeq2)
library(RColorBrewer)
library(pheatmap)
library(ggplot2)
library(tidyverse)
library(EnhancedVolcano)
library(biomaRt)
library(dplyr)
library(tidyr)
library(cowplot)
library(ggnewscale)
library(ggupset)
library(clusterProfiler)
library(org.Mm.eg.db)
library(DOSE)
library(pathview)
library(enrichplot)
library(limma)
library(ggplotify)
library(hypeR)
# read in annotation file
sampleTable <- read.csv("bear_anno.csv", header = T)
head(sampleTable)
## sample filename sex season identity
## 1 SRR7811754 SRR7811754.ENSG.count female fall JAX118_Fall_RNA_Bear
## 2 SRR7811767 SRR7811767.ENSG.count female fall JAX134_Fall_RNA_Bear
## 3 SRR7811766 SRR7811766.ENSG.count female fall JAX_160_Fall_RNA_Bear
## 4 SRR7811762 SRR7811762.ENSG.count male fall JAX141_Fall_RNA_Bear
## 5 SRR7811764 SRR7811764.ENSG.count male fall JAX165_Fall_RNA_Bear
## 6 SRR7811759 SRR7811759.ENSG.count female spring JAX106_Spring_RNA_Bear
## replicate
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 1
# convert file to data frame
sampleTable <- data.frame(sampleTable)
head(sampleTable)
## sample filename sex season identity
## 1 SRR7811754 SRR7811754.ENSG.count female fall JAX118_Fall_RNA_Bear
## 2 SRR7811767 SRR7811767.ENSG.count female fall JAX134_Fall_RNA_Bear
## 3 SRR7811766 SRR7811766.ENSG.count female fall JAX_160_Fall_RNA_Bear
## 4 SRR7811762 SRR7811762.ENSG.count male fall JAX141_Fall_RNA_Bear
## 5 SRR7811764 SRR7811764.ENSG.count male fall JAX165_Fall_RNA_Bear
## 6 SRR7811759 SRR7811759.ENSG.count female spring JAX106_Spring_RNA_Bear
## replicate
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 1
str(sampleTable)
## 'data.frame': 10 obs. of 6 variables:
## $ sample : chr "SRR7811754" "SRR7811767" "SRR7811766" "SRR7811762" ...
## $ filename : chr "SRR7811754.ENSG.count" "SRR7811767.ENSG.count" "SRR7811766.ENSG.count" "SRR7811762.ENSG.count" ...
## $ sex : chr "female" "female" "female" "male" ...
## $ season : chr "fall" "fall" "fall" "fall" ...
## $ identity : chr "JAX118_Fall_RNA_Bear" "JAX134_Fall_RNA_Bear" "JAX_160_Fall_RNA_Bear" "JAX141_Fall_RNA_Bear" ...
## $ replicate: int 1 2 3 4 5 1 2 3 4 5
# specify 'season' and 'sex' as factors so they can be stored as levels
season <- factor(sampleTable$season)
sex <- factor(sampleTable$sex)
# make dds object out of count matrix
dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
directory = ".",
design = ~ sex + season)
dds
## class: DESeqDataSet
## dim: 24647 10
## metadata(1): version
## assays(1): counts
## rownames(24647): ENSUAMG00000000001 ENSUAMG00000000002 ...
## ENSUAMG00000028450 ENSUAMG00000028451
## rowData names(0):
## colnames(10): SRR7811754 SRR7811767 ... SRR7811757 SRR7811750
## colData names(4): sex season identity replicate
# filter out rows with less than 10 reads total
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds
## class: DESeqDataSet
## dim: 18442 10
## metadata(1): version
## assays(1): counts
## rownames(18442): ENSUAMG00000000001 ENSUAMG00000000002 ...
## ENSUAMG00000028448 ENSUAMG00000028450
## rowData names(0):
## colnames(10): SRR7811754 SRR7811767 ... SRR7811757 SRR7811750
## colData names(4): sex season identity replicate
# relevel data to specify reference level
dds$season <- relevel(dds$season, ref = "fall")
dds
## class: DESeqDataSet
## dim: 18442 10
## metadata(1): version
## assays(1): counts
## rownames(18442): ENSUAMG00000000001 ENSUAMG00000000002 ...
## ENSUAMG00000028448 ENSUAMG00000028450
## rowData names(0):
## colnames(10): SRR7811754 SRR7811767 ... SRR7811757 SRR7811750
## colData names(4): sex season identity replicate
# generate results table--------------------------------------------------------
dds <- DESeq(dds)
resdata <- results(dds, contrast=c('season', 'spring', 'fall'))
head(resdata)
## log2 fold change (MLE): season spring vs fall
## Wald test p-value: season spring vs fall
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSUAMG00000000001 975.79943 -0.1175525 0.118821 -0.989324 0.32250465
## ENSUAMG00000000002 413.65877 -0.3175965 0.115158 -2.757929 0.00581688
## ENSUAMG00000000003 9.97792 0.1805817 0.603795 0.299078 0.76488066
## ENSUAMG00000000004 453.12851 0.0378547 0.105520 0.358743 0.71978760
## ENSUAMG00000000005 257.06768 -0.1494567 0.124226 -1.203107 0.22893479
## ENSUAMG00000000006 206.70608 -0.0389975 0.190519 -0.204691 0.83781334
## padj
## <numeric>
## ENSUAMG00000000001 0.818765
## ENSUAMG00000000002 0.186141
## ENSUAMG00000000003 0.968891
## ENSUAMG00000000004 0.958208
## ENSUAMG00000000005 0.757165
## ENSUAMG00000000006 0.982058
# order our results table by the smallest *padj* value
result <- resdata[order(resdata$padj), ]
head(result)
## log2 fold change (MLE): season spring vs fall
## Wald test p-value: season spring vs fall
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSUAMG00000013871 1418.9993 2.112438 0.254439 8.30234 1.02080e-16
## ENSUAMG00000025404 750.3986 0.950526 0.120870 7.86405 3.71921e-15
## ENSUAMG00000016889 220.7612 2.585296 0.389799 6.63239 3.30296e-11
## ENSUAMG00000004830 1807.5093 1.237404 0.196620 6.29338 3.10623e-10
## ENSUAMG00000011450 188.9931 -1.401083 0.223633 -6.26509 3.72606e-10
## ENSUAMG00000016642 20.0035 3.431601 0.561403 6.11254 9.80572e-10
## padj
## <numeric>
## ENSUAMG00000013871 1.62358e-12
## ENSUAMG00000025404 2.95770e-11
## ENSUAMG00000016889 1.75112e-07
## ENSUAMG00000004830 1.18526e-06
## ENSUAMG00000011450 1.18526e-06
## ENSUAMG00000016642 2.59933e-06
# identify number of DEG based on padj < 0.05
table(result$padj<0.05)
##
## FALSE TRUE
## 15739 166
DEGs based on p-adjusted value can be merged with the dds object in the prefiltering step to return an unfiltered matrix. This matrix is important for further analysis!
resdata <- merge(as.data.frame(result), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene_id"
head(resdata)
## Gene_id baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSUAMG00000013871 1418.9993 2.1124384 0.2544389 8.302341 1.020800e-16
## 2 ENSUAMG00000025404 750.3986 0.9505259 0.1208698 7.864046 3.719209e-15
## 3 ENSUAMG00000016889 220.7612 2.5852960 0.3897986 6.632389 3.302964e-11
## 4 ENSUAMG00000004830 1807.5093 1.2374040 0.1966199 6.293382 3.106228e-10
## 5 ENSUAMG00000011450 188.9931 -1.4010826 0.2236332 -6.265092 3.726059e-10
## 6 ENSUAMG00000016642 20.0035 3.4316008 0.5614033 6.112541 9.805715e-10
## padj SRR7811754 SRR7811767 SRR7811766 SRR7811762 SRR7811764
## 1 1.623582e-12 858.003269 329.633995 632.099887 385.40127 483.835954
## 2 2.957701e-11 574.166512 419.447315 549.457852 473.08006 543.333183
## 3 1.751121e-07 64.002406 20.064678 93.809877 45.28465 83.071602
## 4 1.185259e-06 1216.973285 955.460854 1260.849421 1157.16731 762.238081
## 5 1.185259e-06 166.035227 358.297820 217.772929 321.81006 301.976500
## 6 2.599332e-06 4.637856 5.732765 2.233569 2.89051 1.122589
## SRR7811759 SRR7811758 SRR7811765 SRR7811757 SRR7811750
## 1 2818.29622 1761.1367 2029.35569 1871.78676 3020.44305
## 2 1004.18904 866.0177 1065.29564 1053.59299 955.40539
## 3 347.49866 298.0385 740.22722 317.32230 198.29169
## 4 2008.37808 2125.4060 2557.82406 3739.42552 2291.37059
## 5 74.78974 108.3776 78.94519 132.73613 129.19004
## 6 45.60350 62.2168 33.43561 32.14703 10.01473
#write results
write.csv(resdata, file="FallvsSpring_unfiltered_matrix.csv")
The below chunks are to get a list of differentially expressed genes (DEGs) based on the definition of a DEG which in the case of the paper was padj < 0.05. Here I will make two files, one accounting for both lof2FC > 1 & padj < 0.05 as well as just padj < 0.05.
#Get DEG up/down lists for lof2FC and padj
resdata <- as.data.frame(result) %>% dplyr::filter(abs(log2FoldChange) > 1 & padj < 0.05) %>% tibble::rownames_to_column("Gene_id")
#write results
write.csv(resdata, "FallvsSpring_log2fc1_fdr05_DEGlist.csv", row.names = TRUE)
head(resdata)
## Gene_id baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSUAMG00000013871 1418.9993 2.112438 0.2544389 8.302341 1.020800e-16
## 2 ENSUAMG00000016889 220.7612 2.585296 0.3897986 6.632389 3.302964e-11
## 3 ENSUAMG00000004830 1807.5093 1.237404 0.1966199 6.293382 3.106228e-10
## 4 ENSUAMG00000011450 188.9931 -1.401083 0.2236332 -6.265092 3.726059e-10
## 5 ENSUAMG00000016642 20.0035 3.431601 0.5614033 6.112541 9.805715e-10
## 6 ENSUAMG00000005847 232.9705 -1.151563 0.1921459 -5.993172 2.057872e-09
## padj
## 1 1.623582e-12
## 2 1.751121e-07
## 3 1.185259e-06
## 4 1.185259e-06
## 5 2.599332e-06
## 6 4.091306e-06
#Get DEG up/down lists for just padj
resdata2 <- as.data.frame(result) %>% dplyr::filter(padj < 0.05) %>% tibble::rownames_to_column("Gene_id")
#write results
write.csv(resdata2, "FallvsSpring_padj05_DEGlist.csv", row.names = TRUE)
head(resdata2)
## Gene_id baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSUAMG00000013871 1418.9993 2.1124384 0.2544389 8.302341 1.020800e-16
## 2 ENSUAMG00000025404 750.3986 0.9505259 0.1208698 7.864046 3.719209e-15
## 3 ENSUAMG00000016889 220.7612 2.5852960 0.3897986 6.632389 3.302964e-11
## 4 ENSUAMG00000004830 1807.5093 1.2374040 0.1966199 6.293382 3.106228e-10
## 5 ENSUAMG00000011450 188.9931 -1.4010826 0.2236332 -6.265092 3.726059e-10
## 6 ENSUAMG00000016642 20.0035 3.4316008 0.5614033 6.112541 9.805715e-10
## padj
## 1 1.623582e-12
## 2 2.957701e-11
## 3 1.751121e-07
## 4 1.185259e-06
## 5 1.185259e-06
## 6 2.599332e-06
Determining number of upregulated and downregulated genes
upreg <-
resdata2 %>%
filter(log2FoldChange > 0 & padj < 0.05)
nrow(upreg)
## [1] 79
downreg <-
resdata2 %>%
filter(log2FoldChange < 0 & padj < 0.05)
nrow(downreg)
## [1] 87
PCA plots are an important part in visualizing the variation in your samples. Based on season and sex we would expect some sort of clustering to indicate that the samples are representative of differences between each other.
# perform variance stabilizing transformation
vstcounts <- vst(dds, blind = FALSE)
# make PCA plot using ggplot
pcaData <- plotPCA(vstcounts, intgroup="season", "sex", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=season, shape=sex)) +
scale_color_manual(values = c('darkgreen', 'darkorchid')) +
geom_point(size=2) +
ylim(-30,30) +
xlim(-30,30) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
theme_light() +
coord_fixed()
However, a batch correction is needed since sex could be affecting the comparison between seasons.
assay(vstcounts) <- limma::removeBatchEffect(assay(vstcounts), vstcounts$sex)
pcaData <- plotPCA(vstcounts, intgroup=c("season", "sex"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=season, shape=sex)) +
scale_color_manual(values = c('darkgreen', 'darkorchid')) +
geom_point(size=2) +
xlim(-20, 20) +
ylim(-20, 20) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
A correlation heatmap represents the relationship between the variable season in a color-coordinated matrix.
vst_mat <- assay(vstcounts)
vst_cor <- cor(vst_mat)
annotation_heatmap <- data.frame(row.names=colnames(vst_mat), season)
pheatmap(vst_cor,
annotation_col = annotation_heatmap, main = "sample correlation", show_rownames = F, show_colnames = F)
# change aesthetics for readability
pheatmap(vst_cor, clustering_distance_rows = "euclidean",
fontsize = 8, cellwidth = 40, show_rownames = F, color =
colorRampPalette(brewer.pal(9,"BrBG"))(400))
Volcano plots are a form of scatterplot that depict changes between a variable in the dataset. Here it is representing DEGs in spring bears compared to fall bears.
resdata <- read.csv("FallvsSpring_unfiltered_matrix.csv", header = T)
resdata <- resdata[, c(2:14)]
head(resdata)
## Gene_id baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSUAMG00000013871 1418.9993 2.1124384 0.2544389 8.302341 1.020800e-16
## 2 ENSUAMG00000025404 750.3986 0.9505259 0.1208698 7.864046 3.719209e-15
## 3 ENSUAMG00000016889 220.7612 2.5852960 0.3897986 6.632389 3.302964e-11
## 4 ENSUAMG00000004830 1807.5093 1.2374040 0.1966199 6.293382 3.106228e-10
## 5 ENSUAMG00000011450 188.9931 -1.4010826 0.2236332 -6.265092 3.726059e-10
## 6 ENSUAMG00000016642 20.0035 3.4316008 0.5614033 6.112541 9.805715e-10
## padj SRR7811754 SRR7811767 SRR7811766 SRR7811762 SRR7811764
## 1 1.623582e-12 858.003269 329.633995 632.099887 385.40127 483.835954
## 2 2.957701e-11 574.166512 419.447315 549.457852 473.08006 543.333183
## 3 1.751121e-07 64.002406 20.064678 93.809877 45.28465 83.071602
## 4 1.185259e-06 1216.973285 955.460854 1260.849421 1157.16731 762.238081
## 5 1.185259e-06 166.035227 358.297820 217.772929 321.81006 301.976500
## 6 2.599332e-06 4.637856 5.732765 2.233569 2.89051 1.122589
## SRR7811759
## 1 2818.29622
## 2 1004.18904
## 3 347.49866
## 4 2008.37808
## 5 74.78974
## 6 45.60350
str(resdata)
## 'data.frame': 18442 obs. of 13 variables:
## $ Gene_id : chr "ENSUAMG00000013871" "ENSUAMG00000025404" "ENSUAMG00000016889" "ENSUAMG00000004830" ...
## $ baseMean : num 1419 750 221 1808 189 ...
## $ log2FoldChange: num 2.112 0.951 2.585 1.237 -1.401 ...
## $ lfcSE : num 0.254 0.121 0.39 0.197 0.224 ...
## $ stat : num 8.3 7.86 6.63 6.29 -6.27 ...
## $ pvalue : num 1.02e-16 3.72e-15 3.30e-11 3.11e-10 3.73e-10 ...
## $ padj : num 1.62e-12 2.96e-11 1.75e-07 1.19e-06 1.19e-06 ...
## $ SRR7811754 : num 858 574 64 1217 166 ...
## $ SRR7811767 : num 329.6 419.4 20.1 955.5 358.3 ...
## $ SRR7811766 : num 632.1 549.5 93.8 1260.8 217.8 ...
## $ SRR7811762 : num 385.4 473.1 45.3 1157.2 321.8 ...
## $ SRR7811764 : num 483.8 543.3 83.1 762.2 302 ...
## $ SRR7811759 : num 2818.3 1004.2 347.5 2008.4 74.8 ...
A column can be added to the dataset that shows whether the gene is
differentially expressed based on padj < 0.05. This creates a logical
vector where TRUE
represents genes with padj < 0.05 and
FALSE
means it does not meet this criteria (padj >
0.05).
threshold_padj <- resdata$padj < 0.05
length(which(threshold_padj)) ## Determine the number of TRUE values
## [1] 166
We can add the logical vector to the dataset
resdata$threshold <- threshold_padj ## Add vector as a column (threshold) to the resdata
str(resdata)
## 'data.frame': 18442 obs. of 14 variables:
## $ Gene_id : chr "ENSUAMG00000013871" "ENSUAMG00000025404" "ENSUAMG00000016889" "ENSUAMG00000004830" ...
## $ baseMean : num 1419 750 221 1808 189 ...
## $ log2FoldChange: num 2.112 0.951 2.585 1.237 -1.401 ...
## $ lfcSE : num 0.254 0.121 0.39 0.197 0.224 ...
## $ stat : num 8.3 7.86 6.63 6.29 -6.27 ...
## $ pvalue : num 1.02e-16 3.72e-15 3.30e-11 3.11e-10 3.73e-10 ...
## $ padj : num 1.62e-12 2.96e-11 1.75e-07 1.19e-06 1.19e-06 ...
## $ SRR7811754 : num 858 574 64 1217 166 ...
## $ SRR7811767 : num 329.6 419.4 20.1 955.5 358.3 ...
## $ SRR7811766 : num 632.1 549.5 93.8 1260.8 217.8 ...
## $ SRR7811762 : num 385.4 473.1 45.3 1157.2 321.8 ...
## $ SRR7811764 : num 483.8 543.3 83.1 762.2 302 ...
## $ SRR7811759 : num 2818.3 1004.2 347.5 2008.4 74.8 ...
## $ threshold : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
head(resdata)
## Gene_id baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSUAMG00000013871 1418.9993 2.1124384 0.2544389 8.302341 1.020800e-16
## 2 ENSUAMG00000025404 750.3986 0.9505259 0.1208698 7.864046 3.719209e-15
## 3 ENSUAMG00000016889 220.7612 2.5852960 0.3897986 6.632389 3.302964e-11
## 4 ENSUAMG00000004830 1807.5093 1.2374040 0.1966199 6.293382 3.106228e-10
## 5 ENSUAMG00000011450 188.9931 -1.4010826 0.2236332 -6.265092 3.726059e-10
## 6 ENSUAMG00000016642 20.0035 3.4316008 0.5614033 6.112541 9.805715e-10
## padj SRR7811754 SRR7811767 SRR7811766 SRR7811762 SRR7811764
## 1 1.623582e-12 858.003269 329.633995 632.099887 385.40127 483.835954
## 2 2.957701e-11 574.166512 419.447315 549.457852 473.08006 543.333183
## 3 1.751121e-07 64.002406 20.064678 93.809877 45.28465 83.071602
## 4 1.185259e-06 1216.973285 955.460854 1260.849421 1157.16731 762.238081
## 5 1.185259e-06 166.035227 358.297820 217.772929 321.81006 301.976500
## 6 2.599332e-06 4.637856 5.732765 2.233569 2.89051 1.122589
## SRR7811759 threshold
## 1 2818.29622 TRUE
## 2 1004.18904 TRUE
## 3 347.49866 TRUE
## 4 2008.37808 TRUE
## 5 74.78974 TRUE
## 6 45.60350 TRUE
Create a volcano plot using ggplot() function
ggplot(resdata) +
geom_point(aes(x=log2FoldChange,
y=-log10(padj), colour=threshold)) +
theme_classic() +
ggtitle("Spring vs Fall DEG") +
xlab("Log2 fold change") +
ylab("-Log10(padj)") +
theme(legend.position = "none",
plot.title = element_text(size = rel(1.25), hjust = 0.5),
axis.title = element_text(size = rel(1.25))) +
scale_color_manual(values = c("black", "red")) +
scale_x_continuous(limits = c(-10, 10)) +
coord_cartesian(ylim=c(0, 15), clip = "off")
#ggsave("SpringvsFall_DEG-volano.png")
Enhanced volcano takes volcano plots to the next level by being able to label genes of interest within the plot.
resdata <- read.csv("FallvsSpring_unfiltered_matrix.csv", header = T)
resdata <- resdata[, c(2:18)]
head(resdata)
## Gene_id baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSUAMG00000013871 1418.9993 2.1124384 0.2544389 8.302341 1.020800e-16
## 2 ENSUAMG00000025404 750.3986 0.9505259 0.1208698 7.864046 3.719209e-15
## 3 ENSUAMG00000016889 220.7612 2.5852960 0.3897986 6.632389 3.302964e-11
## 4 ENSUAMG00000004830 1807.5093 1.2374040 0.1966199 6.293382 3.106228e-10
## 5 ENSUAMG00000011450 188.9931 -1.4010826 0.2236332 -6.265092 3.726059e-10
## 6 ENSUAMG00000016642 20.0035 3.4316008 0.5614033 6.112541 9.805715e-10
## padj SRR7811754 SRR7811767 SRR7811766 SRR7811762 SRR7811764
## 1 1.623582e-12 858.003269 329.633995 632.099887 385.40127 483.835954
## 2 2.957701e-11 574.166512 419.447315 549.457852 473.08006 543.333183
## 3 1.751121e-07 64.002406 20.064678 93.809877 45.28465 83.071602
## 4 1.185259e-06 1216.973285 955.460854 1260.849421 1157.16731 762.238081
## 5 1.185259e-06 166.035227 358.297820 217.772929 321.81006 301.976500
## 6 2.599332e-06 4.637856 5.732765 2.233569 2.89051 1.122589
## SRR7811759 SRR7811758 SRR7811765 SRR7811757 SRR7811750
## 1 2818.29622 1761.1367 2029.35569 1871.78676 3020.44305
## 2 1004.18904 866.0177 1065.29564 1053.59299 955.40539
## 3 347.49866 298.0385 740.22722 317.32230 198.29169
## 4 2008.37808 2125.4060 2557.82406 3739.42552 2291.37059
## 5 74.78974 108.3776 78.94519 132.73613 129.19004
## 6 45.60350 62.2168 33.43561 32.14703 10.01473
BioMart will be used to gather gene annotation data. Using
listEnsembl()
, genes
was selected as the
database
listEnsembl()
## biomart version
## 1 genes Ensembl Genes 109
## 2 mouse_strains Mouse strains 109
## 3 snps Ensembl Variation 109
## 4 regulation Ensembl Regulation 109
useEnsembl()
was used to identify dataset and mirror
(based on location, useast
was used).
ensembl <- useEnsembl(biomart = "genes", mirror="useast")
Species specific datasets canbe found by running
View(listDatasets(ensembl))
. American black bear was found
to be uamericanus_gene_ensembl
.
ensembl <- useEnsembl(biomart = "genes", dataset = "uamericanus_gene_ensembl")
Given the matrix, ensembl gene ids were used as the identifier for bear. The outputs desired are external gene names. Run these codes to figure out filters, identifier already in the matrix, and attributes, identifiers that want to be added to the matrix.
#View(listFilters(ensembl)) #ensembl_gene_id_version
#View(listAttributes(ensembl)) #external_gene_name
Using desired filter and attributes, gene identifiers can be produced.
biomart.output <- getBM(mart = ensembl, values = resdata$Gene_id,
filters = "ensembl_gene_id",
attributes = c("external_gene_name",
"ensembl_gene_id"))
head(biomart.output)
## external_gene_name ensembl_gene_id
## 1 TMEM161A ENSUAMG00000000002
## 2 MAU2 ENSUAMG00000000016
## 3 ENSUAMG00000000031
## 4 GIGYF1 ENSUAMG00000000048
## 5 ENSUAMG00000000049
## 6 SNORA8 ENSUAMG00000000062
Look at structure of resdata object
str(resdata)
## 'data.frame': 18442 obs. of 17 variables:
## $ Gene_id : chr "ENSUAMG00000013871" "ENSUAMG00000025404" "ENSUAMG00000016889" "ENSUAMG00000004830" ...
## $ baseMean : num 1419 750 221 1808 189 ...
## $ log2FoldChange: num 2.112 0.951 2.585 1.237 -1.401 ...
## $ lfcSE : num 0.254 0.121 0.39 0.197 0.224 ...
## $ stat : num 8.3 7.86 6.63 6.29 -6.27 ...
## $ pvalue : num 1.02e-16 3.72e-15 3.30e-11 3.11e-10 3.73e-10 ...
## $ padj : num 1.62e-12 2.96e-11 1.75e-07 1.19e-06 1.19e-06 ...
## $ SRR7811754 : num 858 574 64 1217 166 ...
## $ SRR7811767 : num 329.6 419.4 20.1 955.5 358.3 ...
## $ SRR7811766 : num 632.1 549.5 93.8 1260.8 217.8 ...
## $ SRR7811762 : num 385.4 473.1 45.3 1157.2 321.8 ...
## $ SRR7811764 : num 483.8 543.3 83.1 762.2 302 ...
## $ SRR7811759 : num 2818.3 1004.2 347.5 2008.4 74.8 ...
## $ SRR7811758 : num 1761 866 298 2125 108 ...
## $ SRR7811765 : num 2029.4 1065.3 740.2 2557.8 78.9 ...
## $ SRR7811757 : num 1872 1054 317 3739 133 ...
## $ SRR7811750 : num 3020 955 198 2291 129 ...
Output gene names want to be added to this data frame
resdata_gene <- dplyr::left_join(resdata, biomart.output,
by=c('Gene_id'='ensembl_gene_id'))
resdata_gene <- resdata_gene[, c(18, 1:17)]
str(resdata_gene)
## 'data.frame': 18442 obs. of 18 variables:
## $ external_gene_name: chr "CISH" "SOCS2" "SLCO1C1" "" ...
## $ Gene_id : chr "ENSUAMG00000013871" "ENSUAMG00000025404" "ENSUAMG00000016889" "ENSUAMG00000004830" ...
## $ baseMean : num 1419 750 221 1808 189 ...
## $ log2FoldChange : num 2.112 0.951 2.585 1.237 -1.401 ...
## $ lfcSE : num 0.254 0.121 0.39 0.197 0.224 ...
## $ stat : num 8.3 7.86 6.63 6.29 -6.27 ...
## $ pvalue : num 1.02e-16 3.72e-15 3.30e-11 3.11e-10 3.73e-10 ...
## $ padj : num 1.62e-12 2.96e-11 1.75e-07 1.19e-06 1.19e-06 ...
## $ SRR7811754 : num 858 574 64 1217 166 ...
## $ SRR7811767 : num 329.6 419.4 20.1 955.5 358.3 ...
## $ SRR7811766 : num 632.1 549.5 93.8 1260.8 217.8 ...
## $ SRR7811762 : num 385.4 473.1 45.3 1157.2 321.8 ...
## $ SRR7811764 : num 483.8 543.3 83.1 762.2 302 ...
## $ SRR7811759 : num 2818.3 1004.2 347.5 2008.4 74.8 ...
## $ SRR7811758 : num 1761 866 298 2125 108 ...
## $ SRR7811765 : num 2029.4 1065.3 740.2 2557.8 78.9 ...
## $ SRR7811757 : num 1872 1054 317 3739 133 ...
## $ SRR7811750 : num 3020 955 198 2291 129 ...
To make the Enhanced volcano plot, a data frame was created selecting only relevant DEGs using padj < 0.05 as the parameter as specified in the original paper.
resdata_sig <- as.data.frame(resdata_gene) %>% dplyr::filter(padj < 0.05)
head(resdata_sig)
## external_gene_name Gene_id baseMean log2FoldChange lfcSE
## 1 CISH ENSUAMG00000013871 1418.9993 2.1124384 0.2544389
## 2 SOCS2 ENSUAMG00000025404 750.3986 0.9505259 0.1208698
## 3 SLCO1C1 ENSUAMG00000016889 220.7612 2.5852960 0.3897986
## 4 ENSUAMG00000004830 1807.5093 1.2374040 0.1966199
## 5 RTN4RL2 ENSUAMG00000011450 188.9931 -1.4010826 0.2236332
## 6 ENSUAMG00000016642 20.0035 3.4316008 0.5614033
## stat pvalue padj SRR7811754 SRR7811767 SRR7811766
## 1 8.302341 1.020800e-16 1.623582e-12 858.003269 329.633995 632.099887
## 2 7.864046 3.719209e-15 2.957701e-11 574.166512 419.447315 549.457852
## 3 6.632389 3.302964e-11 1.751121e-07 64.002406 20.064678 93.809877
## 4 6.293382 3.106228e-10 1.185259e-06 1216.973285 955.460854 1260.849421
## 5 -6.265092 3.726059e-10 1.185259e-06 166.035227 358.297820 217.772929
## 6 6.112541 9.805715e-10 2.599332e-06 4.637856 5.732765 2.233569
## SRR7811762 SRR7811764 SRR7811759 SRR7811758 SRR7811765 SRR7811757 SRR7811750
## 1 385.40127 483.835954 2818.29622 1761.1367 2029.35569 1871.78676 3020.44305
## 2 473.08006 543.333183 1004.18904 866.0177 1065.29564 1053.59299 955.40539
## 3 45.28465 83.071602 347.49866 298.0385 740.22722 317.32230 198.29169
## 4 1157.16731 762.238081 2008.37808 2125.4060 2557.82406 3739.42552 2291.37059
## 5 321.81006 301.976500 74.78974 108.3776 78.94519 132.73613 129.19004
## 6 2.89051 1.122589 45.60350 62.2168 33.43561 32.14703 10.01473
write.csv(resdata_sig, file="FallvsSpring_DEG.csv")
Looking at the supplementary data, the authors identified some genes such as ‘CISH’, ‘SLCO1C1’, and ‘NDST3’ as biologically significant to their findings. In addition, this article focusing on two cytokine supression genes ‘SOCS2’ and ‘SERPINC1’. ‘RTN4RL2’ was the most signficantly downregulated gene in the original paper.
# Genes from paper
keyvals <- ifelse(
resdata_gene$log2FoldChange < 0, 'royalblue',
ifelse(resdata_gene$log2FoldChange > 0, 'red',
'black'))
keyvals[is.na(keyvals)] <- 'black'
names(keyvals)[keyvals == 'red'] <- 'Upregulated'
names(keyvals)[keyvals == 'black'] <- 'NA'
names(keyvals)[keyvals == 'royalblue'] <- 'Downregulated'
EnhancedVolcano(resdata_gene,
lab = as.character(resdata_gene$external_gene_name),
x = 'log2FoldChange',
y = 'padj',
selectLab = c('CISH', 'SLCO1C1', 'NDST3', 'SOCS2', 'SERPINC1', 'RTN4RL2'),
xlab = bquote(~Log[1]~ 'fold change'),
colCustom = keyvals,
title = "DE genes from Original Paper",
subtitle = "Differential Expression",
FCcutoff = 1,
pCutoff = 0.05,
labSize = 3,
axisLabSize = 10,
labFace = 'bold',
boxedLabels = TRUE,
legendLabels=c('Not sig.','Log2FC','padj', 'padj & Log2FC'),
legendPosition = 'top',
legendLabSize = 10,
legendIconSize = 3.0,
drawConnectors = TRUE,
widthConnectors = 1.0,
colConnectors = 'black',
gridlines.major = FALSE,
gridlines.minor = FALSE)
Another volcano plot was made using the greatest DEG from this analysis. 3 genes possesed log2FC > 2 and the most down regulated gene was also included.
# Genes from new analysis
keyvals <- ifelse(
resdata_gene$log2FoldChange < 0, 'royalblue',
ifelse(resdata_gene$log2FoldChange > 0, 'red',
'black'))
keyvals[is.na(keyvals)] <- 'black'
names(keyvals)[keyvals == 'red'] <- 'Upregulated'
names(keyvals)[keyvals == 'black'] <- 'NA'
names(keyvals)[keyvals == 'royalblue'] <- 'Downregulated'
EnhancedVolcano(resdata_gene,
lab = as.character(resdata_gene$external_gene_name),
x = 'log2FoldChange',
y = 'padj',
selectLab = c('SLCO1C1', 'SERPINC1', 'CISH', 'PLCXD2', 'RTN4RL2'),
xlab = bquote(~Log[1]~ 'fold change'),
colCustom = keyvals,
title = "DE genes from new analysis",
subtitle = "Including most downregulated from original paper",
FCcutoff = 1,
pCutoff = 0.05,
labSize = 3,
axisLabSize = 10,
labFace = 'bold',
boxedLabels = TRUE,
legendLabels=c('Not sig.','Log2FC','padj', 'padj & Log2FC'),
legendPosition = 'top',
legendLabSize = 10,
legendIconSize = 3.0,
drawConnectors = TRUE,
widthConnectors = 1.0,
colConnectors = 'black',
gridlines.major = FALSE,
gridlines.minor = FALSE)
Volcano plot was made from genes examined from pathway analysis
# Genes from pathway analysis
keyvals <- ifelse(
resdata_gene$log2FoldChange < 0, 'royalblue',
ifelse(resdata_gene$log2FoldChange > 0, 'red',
'black'))
keyvals[is.na(keyvals)] <- 'black'
names(keyvals)[keyvals == 'red'] <- 'Upregulated'
names(keyvals)[keyvals == 'black'] <- 'NA'
names(keyvals)[keyvals == 'royalblue'] <- 'Downregulated'
EnhancedVolcano(resdata_gene,
lab = as.character(resdata_gene$external_gene_name),
x = 'log2FoldChange',
y = 'padj',
selectLab = c('AGXT', 'CTH', 'NECTIN1', 'ATRNL1', 'PLOD3', 'NPR2'),
xlab = bquote(~Log[1]~ 'fold change'),
colCustom = keyvals,
title = "DE genes examined in pathway analysis",
subtitle = "Differential expression",
FCcutoff = 1,
pCutoff = 0.05,
labSize = 3,
axisLabSize = 10,
labFace = 'bold',
boxedLabels = TRUE,
legendLabels=c('Not sig.','Log2FC','padj', 'padj & Log2FC'),
legendPosition = 'top',
legendLabSize = 10,
legendIconSize = 3.0,
drawConnectors = TRUE,
widthConnectors = 1.0,
colConnectors = 'black',
gridlines.major = FALSE,
gridlines.minor = FALSE)
Heatmaps are useful by using color to depict changes between samples. In this case by season.
A csv file was made externally just containing the counts values for
each sample, called heatmap_input.csv
heatmap_normDEG <- read.csv("heatmap_input.csv", header = T, row.names = NULL)
head(heatmap_normDEG)
## SRR7811754 SRR7811767 SRR7811766 SRR7811762 SRR7811764 SRR7811759
## 1 858.003269 329.633995 632.099887 385.40127 483.835954 2818.29622
## 2 574.166512 419.447315 549.457852 473.08006 543.333183 1004.18904
## 3 64.002406 20.064678 93.809877 45.28465 83.071602 347.49866
## 4 1216.973285 955.460854 1260.849421 1157.16730 762.238081 2008.37808
## 5 166.035227 358.297820 217.772929 321.81006 301.976500 74.78974
## 6 4.637856 5.732765 2.233569 2.89051 1.122589 45.60350
## SRR7811758 SRR7811765 SRR7811757 SRR7811750
## 1 1761.1367 2029.35569 1871.78676 3020.44305
## 2 866.0177 1065.29564 1053.59299 955.40539
## 3 298.0385 740.22722 317.32230 198.29169
## 4 2125.4060 2557.82406 3739.42552 2291.37059
## 5 108.3776 78.94519 132.73613 129.19004
## 6 62.2168 33.43561 32.14703 10.01473
An annotation file was also made externally containing two columns, sample names and season designation.
heatmap_meta <- read.csv("heatmap_anno.csv", header = T)
heatmap_meta
## sample season
## 1 SRR7811754 fall
## 2 SRR7811767 fall
## 3 SRR7811766 fall
## 4 SRR7811762 fall
## 5 SRR7811764 fall
## 6 SRR7811759 spring
## 7 SRR7811758 spring
## 8 SRR7811765 spring
## 9 SRR7811757 spring
## 10 SRR7811750 spring
pheatmap
# Assign the sample names to be the row names
rownames(heatmap_meta) <- heatmap_meta$sample
# Re-level factors
heatmap_meta$season <- factor(heatmap_meta$season, levels = c("fall", "spring"))
# Customized colors for heatmap
heatmap_colors <- colorRampPalette(c("blue", "white", "red"))(6)
# Colors for denoting annotations
ann_colors <- list(genotype = c(spring = "#D11313", fall = "#6DB0F8"))
# Plot heatmap
pheatmap(heatmap_normDEG,
color = heatmap_colors,
annotation_col = heatmap_meta,
annotation_colors = ann_colors,
show_colnames = F,
show_rownames = F)
Z scores are measures of standard deviation used to plot the heatmap. How variable are samples from the mean
# plot pheatmap
heatmap <- as.ggplot(pheatmap(heatmap_normDEG,
color = heatmap_colors,
cluster_rows = T,
annotation_col = heatmap_meta,
annotation_colors = ann_colors,
border_color="white",
cluster_cols = T,
show_colnames = F,
show_rownames = F,
scale="row",
treeheight_col = 10,
treeheight_row = 25,
cellwidth = 10,
cellheight = 2,
fontsize = 9,
annotation_names_col = F))
# Adding white space to add annotations to the top, right, bottom, and left margins
heatmap <- heatmap +
theme(plot.margin = unit(c(1,0.5,0.5,0.5), "cm"))
# Adding in annotations with cowplot
heatmap_figure <- ggdraw(heatmap) +
draw_label("Bear Kidney Samples",
x = 0.28,
y = 0.97,
size = 14,
hjust = 0,
vjust = 0) +
draw_label("Fall",
x = 0.25,
y = 0.91,
size = 12,
hjust = -0.7,
vjust = 0,
fontface = "italic") +
draw_label("Spring",
x = 0.36,
y = 0.91,
size = 12,
hjust = -0.5,
vjust = 0,
fontface = "italic") +
draw_label("Up-regulated (79)",
angle = 90,
size = 10,
x = 0.1,
y = 0.5,
hjust = -0.6,
vjust = 0) +
draw_label("Down-regulated (87)",
angle = 90,
size = 10,
x = 0.1,
y = 0.09,
hjust = -0.7,
vjust = 0)
heatmap_figure
Need to use unfiltered matrix to convert to entrez id
resdata <- read.csv("FallvsSpring_unfiltered_matrix.csv", header = T)
resdata <- resdata[, c(2:18)]
head(resdata)
## Gene_id baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSUAMG00000013871 1418.9993 2.1124384 0.2544389 8.302341 1.020800e-16
## 2 ENSUAMG00000025404 750.3986 0.9505259 0.1208698 7.864046 3.719209e-15
## 3 ENSUAMG00000016889 220.7612 2.5852960 0.3897986 6.632389 3.302964e-11
## 4 ENSUAMG00000004830 1807.5093 1.2374040 0.1966199 6.293382 3.106228e-10
## 5 ENSUAMG00000011450 188.9931 -1.4010826 0.2236332 -6.265092 3.726059e-10
## 6 ENSUAMG00000016642 20.0035 3.4316008 0.5614033 6.112541 9.805715e-10
## padj SRR7811754 SRR7811767 SRR7811766 SRR7811762 SRR7811764
## 1 1.623582e-12 858.003269 329.633995 632.099887 385.40127 483.835954
## 2 2.957701e-11 574.166512 419.447315 549.457852 473.08006 543.333183
## 3 1.751121e-07 64.002406 20.064678 93.809877 45.28465 83.071602
## 4 1.185259e-06 1216.973285 955.460854 1260.849421 1157.16731 762.238081
## 5 1.185259e-06 166.035227 358.297820 217.772929 321.81006 301.976500
## 6 2.599332e-06 4.637856 5.732765 2.233569 2.89051 1.122589
## SRR7811759 SRR7811758 SRR7811765 SRR7811757 SRR7811750
## 1 2818.29622 1761.1367 2029.35569 1871.78676 3020.44305
## 2 1004.18904 866.0177 1065.29564 1053.59299 955.40539
## 3 347.49866 298.0385 740.22722 317.32230 198.29169
## 4 2008.37808 2125.4060 2557.82406 3739.42552 2291.37059
## 5 74.78974 108.3776 78.94519 132.73613 129.19004
## 6 45.60350 62.2168 33.43561 32.14703 10.01473
For this pathway analysis, mouse homologs will have to be used. Therefore, gene ids from bear must be obtained and then converted to mouse in order to be input for later packages.
American black bear selected first
ensembl <- useEnsembl(biomart = "genes",
dataset = "uamericanus_gene_ensembl",
mirror="useast")
Make sure to obtain mouse orthologs for pathway analysis.
either:
- mmusculus_homolog_ensembl_gene
biomart.output <- getBM(mart = ensembl, values = resdata$Gene_id,
filters = "ensembl_gene_id",
attributes = c("ensembl_gene_id",
"mmusculus_homolog_ensembl_gene",
"external_gene_name"))
Merge mouse annotations to resdata
names(biomart.output)[1] <- "Gene_id"
#biomart.output
resdata_gene <- dplyr::left_join(resdata, biomart.output,
by=c('Gene_id'='Gene_id'))
#resdata_gene
Save resdata_gene and edit the file externally so that only mmusculus_homolog_associated_gene_names are there. This will be the new input to get entrez id
write.csv(resdata_gene, "FallvsSpring_unfiltered_matrix_mouse.csv")
Input newly made file and remove rows without annotations.
resdata_mouse <- read.csv("FallvsSpring_unfiltered_matrix_mouse2.csv", header = T)
resdata_mouse <- resdata_mouse[, c(2:18)]
head(resdata_mouse)
## mmusculus_homolog_ensembl_gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000032578 1418.9993 2.1124384 0.2544389 8.302341
## 2 ENSMUSG00000020027 750.3986 0.9505259 0.1208698 7.864046
## 3 ENSMUSG00000030235 220.7612 2.5852960 0.3897986 6.632389
## 4 1807.5093 1.2374040 0.1966199 6.293382
## 5 ENSMUSG00000050896 188.9931 -1.4010826 0.2236332 -6.265092
## 6 ENSMUSG00000060924 20.0035 3.4316008 0.5614033 6.112541
## pvalue padj SRR7811754 SRR7811767 SRR7811766 SRR7811762 SRR7811764
## 1 1.02e-16 1.62e-12 858.003269 329.633995 632.099887 385.40127 483.835954
## 2 3.72e-15 2.96e-11 574.166512 419.447315 549.457852 473.08006 543.333183
## 3 3.30e-11 1.75e-07 64.002406 20.064678 93.809877 45.28465 83.071602
## 4 3.11e-10 1.19e-06 1216.973285 955.460854 1260.849421 1157.16730 762.238081
## 5 3.73e-10 1.19e-06 166.035227 358.297820 217.772929 321.81006 301.976500
## 6 9.81e-10 2.60e-06 4.637856 5.732765 2.233569 2.89051 1.122589
## SRR7811759 SRR7811758 SRR7811765 SRR7811757 SRR7811750
## 1 2818.29622 1761.1367 2029.35569 1871.78676 3020.44305
## 2 1004.18904 866.0177 1065.29564 1053.59299 955.40539
## 3 347.49866 298.0385 740.22722 317.32230 198.29169
## 4 2008.37808 2125.4060 2557.82406 3739.42552 2291.37059
## 5 74.78974 108.3776 78.94519 132.73613 129.19004
## 6 45.60350 62.2168 33.43561 32.14703 10.01473
# remove rows without ensembl gene ids
resdata_mouse <-
resdata_mouse[!(is.na(resdata_mouse$mmusculus_homolog_ensembl_gene) | resdata_mouse$mmusculus_homolog_ensembl_gene==""), ]
head(resdata_mouse)
## mmusculus_homolog_ensembl_gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000032578 1418.9993 2.1124384 0.2544389 8.302341
## 2 ENSMUSG00000020027 750.3986 0.9505259 0.1208698 7.864046
## 3 ENSMUSG00000030235 220.7612 2.5852960 0.3897986 6.632389
## 5 ENSMUSG00000050896 188.9931 -1.4010826 0.2236332 -6.265092
## 6 ENSMUSG00000060924 20.0035 3.4316008 0.5614033 6.112541
## 7 ENSMUSG00000044715 941.7348 0.7836418 0.1297604 6.039146
## pvalue padj SRR7811754 SRR7811767 SRR7811766 SRR7811762 SRR7811764
## 1 1.02e-16 1.62e-12 858.003269 329.633995 632.099887 385.40127 483.835954
## 2 3.72e-15 2.96e-11 574.166512 419.447315 549.457852 473.08006 543.333183
## 3 3.30e-11 1.75e-07 64.002406 20.064678 93.809877 45.28465 83.071602
## 5 3.73e-10 1.19e-06 166.035227 358.297820 217.772929 321.81006 301.976500
## 6 9.81e-10 2.60e-06 4.637856 5.732765 2.233569 2.89051 1.122589
## 7 1.55e-09 3.52e-06 790.290578 683.154511 623.165613 753.45948 601.707822
## SRR7811759 SRR7811758 SRR7811765 SRR7811757 SRR7811750
## 1 2818.29622 1761.1367 2029.35569 1871.78676 3020.44305
## 2 1004.18904 866.0177 1065.29564 1053.59299 955.40539
## 3 347.49866 298.0385 740.22722 317.32230 198.29169
## 5 74.78974 108.3776 78.94519 132.73613 129.19004
## 6 45.60350 62.2168 33.43561 32.14703 10.01473
## 7 1219.43755 932.2484 1178.60520 1356.39728 1278.88122
Use BioMart for the mouse dataset
ensembl <- useEnsembl(biomart = "genes",
dataset = "mmusculus_gene_ensembl",
mirror="useast")
Obtain entrezgene ids from mouse ensembl ids
biomart.output <- getBM(mart = ensembl, values = resdata_mouse$mmusculus_homolog_ensembl_gene,
filters = "ensembl_gene_id",
attributes = c("ensembl_gene_id",
"entrezgene_id",
"external_gene_name"))
head(biomart.output)
## ensembl_gene_id entrezgene_id external_gene_name
## 1 ENSMUSG00000000056 67608 Narf
## 2 ENSMUSG00000000058 12390 Cav2
## 3 ENSMUSG00000000088 12858 Cox5a
## 4 ENSMUSG00000000148 231841 Brat1
## 5 ENSMUSG00000000154 18400 Slc22a18
## 6 ENSMUSG00000000159 72058 Igsf5
#write.csv(biomart.output, "mouse_gene_names.csv)
# will need this output to identify mouse genes from pathway analysis to compare back to bear
Merge entrezgene_id with resdata_mouse
names(biomart.output)[1] <- "mmusculus_homolog_ensembl_gene"
#biomart.output
resdata_mouse <- dplyr::left_join(resdata_mouse, biomart.output,
by=c('mmusculus_homolog_ensembl_gene'= 'mmusculus_homolog_ensembl_gene'))
#resdata_mouse
write.csv(resdata_mouse, "resdata_mouse.csv")
A list of background and significant genes are needed to perform over-representation analysis.
Can make two data frames that show up and down-regulated genes based on log2FC and padj value.
UpGenes <-
resdata_mouse %>%
filter(log2FoldChange > 0 & padj < 0.05) %>%
as.data.frame()
DownGenes <-
resdata_mouse %>%
filter(log2FoldChange < 0 & padj < 0.05) %>%
as.data.frame()
# we want the log2 fold change
original_gene_list <- resdata_mouse$log2FoldChange
head(original_gene_list)
## [1] 2.1124384 0.9505259 2.5852960 -1.4010826 3.4316008 0.7836418
# name the vector
names(original_gene_list) <- resdata_mouse$mmusculus_homolog_ensembl_gene
head(original_gene_list)
## ENSMUSG00000032578 ENSMUSG00000020027 ENSMUSG00000030235 ENSMUSG00000050896
## 2.1124384 0.9505259 2.5852960 -1.4010826
## ENSMUSG00000060924 ENSMUSG00000044715
## 3.4316008 0.7836418
# omit any NA values
gene_list <- na.omit(original_gene_list)
head(gene_list)
## ENSMUSG00000032578 ENSMUSG00000020027 ENSMUSG00000030235 ENSMUSG00000050896
## 2.1124384 0.9505259 2.5852960 -1.4010826
## ENSMUSG00000060924 ENSMUSG00000044715
## 3.4316008 0.7836418
# sort the list in decreasing order (will be required so good to do this now)
gene_list = sort(gene_list, decreasing = TRUE)
head(gene_list)
## ENSMUSG00000105906 ENSMUSG00000105906 ENSMUSG00000105547 ENSMUSG00000105547
## 6.880774 6.880774 6.880774 6.880774
## ENSMUSG00000106039 ENSMUSG00000106039
## 6.880774 6.880774
# Extract significant results (padj < 0.05) using subset()
sig_genes_df = subset(resdata_mouse, padj < 0.05)
# From significant results, we want to filter on padj value
genes <- sig_genes_df$padj
# Name the vector i.e. add mouse ensembl id
names(genes) <- sig_genes_df$mmusculus_homolog_ensembl_gene
# omit NA values
genes <- na.omit(genes)
# set genes as new object
genes <- names(genes)
head(genes)
## [1] "ENSMUSG00000032578" "ENSMUSG00000020027" "ENSMUSG00000030235"
## [4] "ENSMUSG00000050896" "ENSMUSG00000060924" "ENSMUSG00000044715"
These analyses aid in identifying which pathways the genes in the dataset are involved in.
Use keytypes()
to select correct input, this is
ENSEMBL
keytypes(org.Mm.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GENETYPE" "GO" "GOALL" "IPI" "MGI"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UNIPROT"
go_enrich <- enrichGO(gene = genes,
universe = names(gene_list),
OrgDb = org.Mm.eg.db,
keyType = 'ENSEMBL',
readable = T,
ont = "CC",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05)
## Output results from GO analysis to a table
cluster_summary <- data.frame(go_enrich)
write.csv(cluster_summary, "clusterProfiler_bear_CC.csv")
head(cluster_summary)
## ID Description GeneRatio BgRatio
## GO:0042571 GO:0042571 immunoglobulin complex, circulating 4/122 32/14381
## GO:0019814 GO:0019814 immunoglobulin complex 4/122 38/14381
## GO:0043083 GO:0043083 synaptic cleft 3/122 17/14381
## pvalue p.adjust qvalue geneID Count
## GO:0042571 0.0001475198 0.0295846 0.02801449 Iglc1/Iglc3/Iglc4/Iglc2 4
## GO:0019814 0.0002911570 0.0295846 0.02801449 Iglc1/Iglc3/Iglc4/Iglc2 4
## GO:0043083 0.0003713548 0.0295846 0.02801449 C1ql1/Gria3/Lama5 3
barplot <- barplot(go_enrich, showCategory = 10,
title = "GO CC",
font.size = 8)
barplot
dot <- dotplot(go_enrich)
dot
Use keytypes()
to select correct input, this is
ENSEMBL
keytypes(org.Mm.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GENETYPE" "GO" "GOALL" "IPI" "MGI"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UNIPROT"
go_enrich <- enrichGO(gene = genes,
universe = names(gene_list),
OrgDb = org.Mm.eg.db,
keyType = 'ENSEMBL',
readable = T,
ont = "BP",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05)
## Output results from GO analysis to a table
cluster_summary <- data.frame(go_enrich)
write.csv(cluster_summary, "clusterProfiler_bear_BP.csv")
head(cluster_summary)
## ID Description GeneRatio BgRatio
## GO:1901605 GO:1901605 alpha-amino acid metabolic process 9/123 172/14279
## pvalue p.adjust qvalue
## GO:1901605 1.758679e-05 0.03774124 0.03661754
## geneID Count
## GO:1901605 Agxt/Cth/Aspg/Tst/Acmsd/Plod3/Glud1/Aldh6a1/Nr1h4 9
barplot <- barplot(go_enrich, showCategory = 10,
title = "GO BP",
font.size = 8)
barplot
dot <- dotplot(go_enrich)
dot
Use keytypes()
to select correct input, this is
ENSEMBL
keytypes(org.Mm.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GENETYPE" "GO" "GOALL" "IPI" "MGI"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UNIPROT"
go_enrich <- enrichGO(gene = genes,
universe = names(gene_list),
OrgDb = org.Mm.eg.db,
keyType = 'ENSEMBL',
readable = T,
ont = "MF",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05)
## Output results from GO analysis to a table
cluster_summary <- data.frame(go_enrich)
write.csv(cluster_summary, "clusterProfiler_bear_MF.csv")
head(cluster_summary)
## ID Description GeneRatio BgRatio pvalue
## GO:0043177 GO:0043177 organic acid binding 7/125 127/14127 0.0001331837
## p.adjust qvalue geneID
## GO:0043177 0.04927798 0.04640401 Slc46a1/Agxt/Gchfr/Plod3/Sesn2/Glud1/Nr1h4
## Count
## GO:0043177 7
barplot <- barplot(go_enrich, showCategory = 10,
title = "GO MF",
font.size = 8)
barplot
dot <- dotplot(go_enrich)
dot
Human Genes were used to compare further pathway analysis as input
for KEGG. The genelist.csv
file was created externally
matching mouse ensembl to human gene names.
# using the biomart.outputs create the genelists from up
UpGenes <- UpGenes %>%
dplyr::select(external_gene_name) %>%
dplyr::rename("gene" = "external_gene_name")
UpGenes <- mutate_all(UpGenes, .funs=toupper) #makes it easy to apply the same transformation on multiple variables
# using the biomart.outputs create the genelists from down
DownGenes <- DownGenes %>%
dplyr::select(external_gene_name) %>%
dplyr::rename("gene" = "external_gene_name")
DownGenes <- mutate_all(DownGenes, .funs=toupper) #makes it easy to apply the same transformation on multiple variables
#load geneset
KEGG <- msigdb_gsets(species="Homo sapiens", category="C2", subcategory="CP:KEGG")
PID <- msigdb_gsets(species="Homo sapiens", category="C2", subcategory="CP:PID")
GOBP<- msigdb_gsets(species = "Homo sapiens", "C5", "GO:BP")
react<- msigdb_gsets("Homo sapiens", "C2", "CP:REACTOME")
HALL<- msigdb_gsets("Homo sapiens", "H")
#load gene list - this gene list is ALL DE genes - not separated by UP or DOWN
x_up <- UpGenes[which(UpGenes$gene !=""),1]
x_down <- DownGenes[which(DownGenes$gene !=""),1]
#KEGG
hyp_1 <- hypeR(x_up, KEGG, test="hypergeometric", background=50000, pval = 0.05, plotting=TRUE)
print(hyp_1)
## (hyp)
##
## data:
## label pval fdr signature
## KEGG_NITROGEN_METABOLISM 0.00036 0.044 61
## KEGG_GLYCINE_SERINE_AND_THREONINE_METABOLISM 0.00067 0.044 61
## KEGG_ALANINE_ASPARTATE_AND_GLUTAMATE_METABOLISM 0.00071 0.044 61
## KEGG_PEROXISOME 0.00410 0.190 61
## KEGG_JAK_STAT_SIGNALING_PATHWAY 0.01600 0.580 61
## KEGG_PROXIMAL_TUBULE_BICARBONATE_RECLAMATION 0.02800 0.630 61
## geneset overlap background hits
## 23 2 50000 CTH,GLUD1
## 31 2 50000 AGXT,CTH
## 32 2 50000 AGXT,GLUD1
## 78 2 50000 ACSL5,AGXT
## 155 2 50000 CISH,SOCS2
## 23 1 50000 GLUD1
##
## plots: 14 Figures
##
## args: signature
## genesets
## test
## background
## power
## absolute
## pval
## fdr
## plotting
## quiet
hyp_dots(hyp_1, title = "KEGG", abrv = 30, val = "pval")
#PID
hyp_1_pid <- hypeR(x_up, PID, test="hypergeometric", background=50000, pval=0.05, plotting=TRUE)
print(hyp_1_pid)
## (hyp)
##
## data:
## label pval fdr signature geneset overlap
## PID_IL2_1PATHWAY 0.0021 0.41 61 55 2
## PID_IL5_PATHWAY 0.0170 0.62 61 14 1
## PID_S1P_S1P4_PATHWAY 0.0170 0.62 61 14 1
## PID_THROMBIN_PAR4_PATHWAY 0.0180 0.62 61 15 1
## PID_S1P_META_PATHWAY 0.0250 0.62 61 21 1
## PID_PRL_SIGNALING_EVENTS_PATHWAY 0.0280 0.62 61 23 1
## background hits
## 50000 CISH,SOCS2
## 50000 CISH
## 50000 GNA13
## 50000 GNA13
## 50000 GNA13
## 50000 PTP4A1
##
## plots: 13 Figures
##
## args: signature
## genesets
## test
## background
## power
## absolute
## pval
## fdr
## plotting
## quiet
hyp_dots(hyp_1_pid, title = "PID", abrv = 30, val = "pval")
# GOBP
hyp_1_G <- hypeR(x_up, GOBP, test="hypergeometric", background=50000, fdr=0.05, plotting=TRUE)
print(hyp_1_G)
## (hyp)
##
## data:
## label pval fdr signature
## GOBP_ALPHA_AMINO_ACID_METABOLIC_PROCESS 6.1e-09 4.7e-05 61
## GOBP_CELLULAR_AMINO_ACID_METABOLIC_PROCESS 6.2e-08 2.4e-04 61
## GOBP_ORGANONITROGEN_COMPOUND_BIOSYNTHETIC_PROCESS 1.1e-07 2.9e-04 61
## GOBP_ORGANIC_ACID_METABOLIC_PROCESS 2.5e-07 4.8e-04 61
## GOBP_CYSTEINE_METABOLIC_PROCESS 3.8e-07 5.8e-04 61
## GOBP_REGULATION_OF_TRANSPORT 1.1e-06 1.5e-03 61
## geneset overlap background
## 204 7 50000
## 286 7 50000
## 1703 13 50000
## 964 10 50000
## 12 3 50000
## 1749 12 50000
## hits
## ACMSD,AGXT,ALDH6A1,CTH,GLUD1,NR1H4,TST
## ACMSD,AGXT,ALDH6A1,CTH,GLUD1,NR1H4,TST
## ACMSD,ACSL5,AGXT,CTH,EIF3L,EIF4A1,GALNT5,GLUD1,NDST3,NUS1,RAB38,SELENOK,SLC25A38
## ACMSD,ACSL5,AGXT,ALDH6A1,CTH,GLUD1,LYPLA1,NR1H4,NUPR1,TST
## AGXT,CTH,TST
## ACSL5,CLIC5,DNAJC6,GLUD1,LYPLA1,MCU,NKAIN1,NR1H4,NUS1,SELENOK,SRI,STXBP1
##
## plots: 87 Figures
##
## args: signature
## genesets
## test
## background
## power
## absolute
## pval
## fdr
## plotting
## quiet
hyp_dots(hyp_1_G, title="GOBP", abrv=30, val = "fdr")
#reactome
hyp_1 <- hypeR(x_down, react, test="hypergeometric", background=50000, pval=0.5, plotting=TRUE)
print(hyp_1)
## (hyp)
##
## data:
## label pval fdr signature
## REACTOME_COLLAGEN_BIOSYNTHESIS_AND_MODIFYING_ENZYMES 9.5e-05 0.12 65
## REACTOME_COLLAGEN_FORMATION 2.3e-04 0.12 65
## REACTOME_SPHINGOLIPID_METABOLISM 2.3e-04 0.12 65
## REACTOME_TRANSPORT_OF_SMALL_MOLECULES 3.7e-04 0.13 65
## REACTOME_METABOLISM_OF_LIPIDS 4.1e-04 0.13 65
## REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION 6.5e-04 0.16 65
## geneset overlap background hits
## 67 3 50000 COL11A2,COL6A5,PLOD3
## 90 3 50000 COL11A2,COL6A5,PLOD3
## 90 3 50000 CERS2,PRKD2,SMPD4
## 728 6 50000 ABCA1,ABCC3,ANO8,SLC22A6,SLC46A1,SLC5A10
## 743 6 50000 ABCA1,ABCC3,CERS2,PRKD2,SMPD4,TSPOAP1
## 301 4 50000 COL11A2,COL6A5,LAMA5,PLOD3
##
## plots: 171 Figures
##
## args: signature
## genesets
## test
## background
## power
## absolute
## pval
## fdr
## plotting
## quiet
hyp_dots(hyp_1, title = "react", abrv = 30, val = "pval")
#HPO
hyp_1_hall <- hypeR(x_up, HALL, test="hypergeometric", background=50000, pval=0.5, plotting=TRUE)
print(hyp_1_hall)
## (hyp)
##
## data:
## label pval fdr signature geneset overlap
## HALLMARK_BILE_ACID_METABOLISM 0.00036 0.018 61 112 3
## HALLMARK_GLYCOLYSIS 0.00190 0.032 61 200 3
## HALLMARK_IL2_STAT5_SIGNALING 0.00190 0.032 61 199 3
## HALLMARK_APOPTOSIS 0.01700 0.120 61 161 2
## HALLMARK_ESTROGEN_RESPONSE_LATE 0.02500 0.120 61 200 2
## HALLMARK_HEME_METABOLISM 0.02500 0.120 61 200 2
## background hits
## 50000 ACSL5,AGXT,NR1H4
## 50000 CTH,IER3,NDST3
## 50000 CISH,SERPINC1,SOCS2
## 50000 CTH,IER3
## 50000 CISH,TST
## 50000 ALDH6A1,SLC25A38
##
## plots: 26 Figures
##
## args: signature
## genesets
## test
## background
## power
## absolute
## pval
## fdr
## plotting
## quiet
hyp_dots(hyp_1_hall, title = "HALL", abrv = 30, val = "pval")
#KEGG
hyp_1 <- hypeR(x_down, KEGG, test="hypergeometric", background=50000, pval=0.5, plotting=TRUE)
print(hyp_1)
## (hyp)
##
## data:
## label pval fdr signature geneset overlap
## KEGG_ABC_TRANSPORTERS 0.0015 0.25 65 44 2
## KEGG_ADHERENS_JUNCTION 0.0041 0.25 65 73 2
## KEGG_PEROXISOME 0.0047 0.25 65 78 2
## KEGG_ECM_RECEPTOR_INTERACTION 0.0054 0.25 65 84 2
## KEGG_CELL_ADHESION_MOLECULES_CAMS 0.0130 0.49 65 133 2
## KEGG_FOCAL_ADHESION 0.0280 0.79 65 199 2
## background hits
## 50000 ABCA1,ABCC3
## 50000 NECTIN1,WASF3
## 50000 PAOX,PEX6
## 50000 COL11A2,LAMA5
## 50000 GLG1,NECTIN1
## 50000 COL11A2,LAMA5
##
## plots: 27 Figures
##
## args: signature
## genesets
## test
## background
## power
## absolute
## pval
## fdr
## plotting
## quiet
hyp_dots(hyp_1, title = "KEGG", abrv = 30, val = "pval")
#PID
hyp_1_pid <- hypeR(x_down, PID, test="hypergeometric", background=50000, pval=0.5, plotting=TRUE)
print(hyp_1_pid)
## (hyp)
##
## data:
## label pval fdr signature geneset overlap background
## PID_SYNDECAN_1_PATHWAY 0.0017 0.33 65 46 2 50000
## PID_INTEGRIN1_PATHWAY 0.0034 0.33 65 66 2 50000
## PID_INTEGRIN4_PATHWAY 0.0140 0.93 65 11 1 50000
## PID_INTEGRIN_A9B1_PATHWAY 0.0320 0.94 65 25 1 50000
## PID_RXR_VDR_PATHWAY 0.0330 0.94 65 26 1 50000
## PID_NECTIN_PATHWAY 0.0380 0.94 65 30 1 50000
## hits
## COL11A2,LAMA5
## COL11A2,LAMA5
## LAMA5
## PAOX
## ABCA1
## NECTIN1
##
## plots: 16 Figures
##
## args: signature
## genesets
## test
## background
## power
## absolute
## pval
## fdr
## plotting
## quiet
hyp_dots(hyp_1_pid, title = "PID", abrv = 30, val = "pval")
# GOBP
hyp_1_G <- hypeR(x_down, GOBP, test="hypergeometric", background=50000, fdr=0.05, plotting=TRUE)
print(hyp_1_G)
## (hyp)
##
## data:
## label pval fdr
## GOBP_TRANSMEMBRANE_TRANSPORT 8.4e-09 6.4e-05
## GOBP_ION_TRANSPORT 8.9e-08 3.4e-04
## GOBP_CELL_MORPHOGENESIS 6.7e-07 1.4e-03
## GOBP_ANIMAL_ORGAN_MORPHOGENESIS 9.0e-07 1.4e-03
## GOBP_TISSUE_DEVELOPMENT 9.4e-07 1.4e-03
## GOBP_REGULATION_OF_ANATOMICAL_STRUCTURE_MORPHOGENESIS 4.8e-06 6.1e-03
## signature geneset overlap background
## 65 1529 14 50000
## 65 1556 13 50000
## 65 1004 10 50000
## 65 1037 10 50000
## 65 1916 13 50000
## 65 976 9 50000
## hits
## ABCA1,ABCC3,ANO8,GRIA3,PEX6,SESN2,SHANK3,SLC16A13,SLC22A23,SLC22A6,SLC46A1,SLC5A10,STIM1,TMEM175
## ABCC3,ANO8,GRIA3,MELTF,NECTIN1,SHANK3,SLC16A13,SLC22A23,SLC22A6,SLC46A1,SLC5A10,STIM1,TMEM175
## ATRNL1,COCH,DAB2IP,LAMA5,MEGF8,MELTF,NECTIN1,PLOD3,SHANK3,WASF3
## ATRNL1,GAA,GATA5,GLG1,LAMA5,MEGF8,NECTIN1,PLOD3,SHANK3,STIM1
## ATRNL1,COL11A2,DAB2IP,FAM83H,GAA,GATA5,GLG1,LAMA5,MEGF8,NECTIN1,PLOD3,PRKD2,STIM1
## COCH,DAB2IP,GATA5,MEGF8,MELTF,PRKD2,SHANK3,STIM1,WASF3
##
## plots: 21 Figures
##
## args: signature
## genesets
## test
## background
## power
## absolute
## pval
## fdr
## plotting
## quiet
hyp_dots(hyp_1_G, title="GOBP", abrv=30, val = "fdr")
#reactome
hyp_1 <- hypeR(x_down, react, test="hypergeometric", background=50000, pval=0.5, plotting=TRUE)
print(hyp_1)
## (hyp)
##
## data:
## label pval fdr signature
## REACTOME_COLLAGEN_BIOSYNTHESIS_AND_MODIFYING_ENZYMES 9.5e-05 0.12 65
## REACTOME_COLLAGEN_FORMATION 2.3e-04 0.12 65
## REACTOME_SPHINGOLIPID_METABOLISM 2.3e-04 0.12 65
## REACTOME_TRANSPORT_OF_SMALL_MOLECULES 3.7e-04 0.13 65
## REACTOME_METABOLISM_OF_LIPIDS 4.1e-04 0.13 65
## REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION 6.5e-04 0.16 65
## geneset overlap background hits
## 67 3 50000 COL11A2,COL6A5,PLOD3
## 90 3 50000 COL11A2,COL6A5,PLOD3
## 90 3 50000 CERS2,PRKD2,SMPD4
## 728 6 50000 ABCA1,ABCC3,ANO8,SLC22A6,SLC46A1,SLC5A10
## 743 6 50000 ABCA1,ABCC3,CERS2,PRKD2,SMPD4,TSPOAP1
## 301 4 50000 COL11A2,COL6A5,LAMA5,PLOD3
##
## plots: 171 Figures
##
## args: signature
## genesets
## test
## background
## power
## absolute
## pval
## fdr
## plotting
## quiet
hyp_dots(hyp_1, title = "react", abrv = 30, val = "pval")
#HPO
hyp_1_hall <- hypeR(x_down, HALL, test="hypergeometric", background=50000, pval=0.5, plotting=TRUE)
print(hyp_1_hall)
## (hyp)
##
## data:
## label pval fdr signature geneset overlap
## HALLMARK_BILE_ACID_METABOLISM 0.00043 0.022 65 112 3
## HALLMARK_XENOBIOTIC_METABOLISM 0.02800 0.600 65 200 2
## HALLMARK_PROTEIN_SECRETION 0.12000 0.600 65 96 1
## HALLMARK_PEROXISOME 0.13000 0.600 65 104 1
## HALLMARK_DNA_REPAIR 0.18000 0.600 65 150 1
## HALLMARK_UV_RESPONSE_UP 0.19000 0.600 65 158 1
## background hits
## 50000 ABCA1,PAOX,PEX6
## 50000 ABCC3,PMM1
## 50000 ABCA1
## 50000 PEX6
## 50000 NPR2
## 50000 SELENOW
##
## plots: 19 Figures
##
## args: signature
## genesets
## test
## background
## power
## absolute
## pval
## fdr
## plotting
## quiet
hyp_dots(hyp_1_hall, title = "HALL", abrv = 30, val = "pval")
Box plots are figures that display the distribution of normalized counts. Genes identified with the cardiac pathway from the previous pathway analysis will be displayed since they are the most relevent for this analysis. Boxplots show the distribution of counts across samples with a line indicating the average.
Load in externally made csv file with genes of interest to be plotted.
top_deg <- read.csv("boxplot_DEG.csv", header = T)
head(top_deg)
## gene_id SRR7811754 SRR7811767 SRR7811766 SRR7811762 SRR7811764 SRR7811759
## 1 CISH 858.00327 329.63399 632.09989 385.40127 483.83595 2818.29622
## 2 SLCO1C1 64.00241 20.06468 93.80988 45.28465 83.07160 347.49866
## 3 SERPINC1 163.25251 88.85786 221.12328 96.35032 245.84704 1374.48945
## 4 NDST3 129.85995 40.12936 214.42258 72.26274 42.65839 477.92467
## 5 PLCXD2 249.51663 119.43261 148.53231 146.45248 108.89115 50.16385
## 6 RTN4RL2 166.03523 358.29782 217.77293 321.81006 301.97650 74.78974
## SRR7811758 SRR7811765 SRR7811757 SRR7811750
## 1 1761.13672 2029.35569 1871.78676 3020.44305
## 2 298.03852 740.22722 317.32230 198.29169
## 3 636.21691 1294.70107 688.56865 222.32704
## 4 308.07349 463.45469 219.84421 257.37860
## 5 81.28323 15.78904 64.29406 71.10459
## 6 108.37764 78.94519 132.73613 129.19004
Load in annotation file
meta <- read.csv("boxplot_meta.csv", header = T)
head(meta)
## sample condition replicate
## 1 SRR7811754 fall 1
## 2 SRR7811767 fall 2
## 3 SRR7811766 fall 3
## 4 SRR7811762 fall 4
## 5 SRR7811764 fall 5
## 6 SRR7811759 spring 1
Gather normalized counts of each sample.
expression_plot <- pivot_longer(top_deg,
cols = 2:11,
names_to = "sample",
values_to = "normalized_counts")
#View(expression_plot)
Merge ‘expression_plot’ with ‘metadata’ dataframe.
# Join metadata for visualizing groups or features
expression_plot <- left_join(x = expression_plot,
y = meta,
by = "sample")
expression_plot <-
expression_plot %>%
dplyr::select(gene_id, sample, normalized_counts, condition)
#View(expression_plot)
Genes to be visualized: - Common strongly upregulated genes: CISH, SLCO1C1, NDST3 - Additional upregulated gene from this analysis: SERPINC1 - Most upregulated gene: SLCO1C1 : CISH (orig.) - Most downregulated gene: PLCXD2 : RTN4RL2 (orig.) - Genes involved in several pathways - up: AGXT, CTH - down: NECTIN1, ATRNL1, PLOD3, NPR2
First, objects created based on all of the genes to be visualized. Then, from objects condtion was selected and seasons were releveled. Boxplots were created by plotting condition against normalized counts with added aesthetic features.
# CISH
CISH_exp <- expression_plot %>%
filter(gene_id == "CISH")
#View(CISH_exp)
CISH_exp$condition <- factor(CISH_exp$condition, levels = c("fall", "spring"))
boxplot_CISH <- ggplot(CISH_exp) +
geom_boxplot(aes(x=condition,
y=normalized_counts,
fill=condition)) +
theme_bw() + scale_fill_manual(values = c("darkgreen", "darkorchid")) +
ggtitle("CISH") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
# SLCO1C1
SLCO1C1_exp <- expression_plot %>%
filter(gene_id == "SLCO1C1")
#View(SLCO1C1_exp)
SLCO1C1_exp$condition <- factor(SLCO1C1_exp$condition, levels = c("fall", "spring"))
boxplot_SLCO1C1 <- ggplot(SLCO1C1_exp) +
geom_boxplot(aes(x=condition,
y=normalized_counts,
fill=condition)) +
theme_bw() + scale_fill_manual(values = c("darkgreen", "darkorchid")) +
ggtitle("SLCO1C1") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
# SERPINC1
SERPINC1_exp <- expression_plot %>%
filter(gene_id == "SERPINC1")
#View(SERPINC1_exp)
SERPINC1_exp$condition <- factor(SERPINC1_exp$condition, levels = c("fall", "spring"))
boxplot_SERPINC1 <- ggplot(SERPINC1_exp) +
geom_boxplot(aes(x=condition,
y=normalized_counts,
fill=condition)) +
theme_bw() + scale_fill_manual(values = c("darkgreen", "darkorchid")) +
ggtitle("SERPINC1") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
# NDST3
NDST3_exp <- expression_plot %>%
filter(gene_id == "NDST3")
#View(NDST3_exp)
NDST3_exp$condition <- factor(NDST3_exp$condition, levels = c("fall", "spring"))
boxplot_NDST3 <- ggplot(NDST3_exp) +
geom_boxplot(aes(x=condition,
y=normalized_counts,
fill=condition)) +
theme_bw() + scale_fill_manual(values = c("darkgreen", "darkorchid")) +
ggtitle("NDST3") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
# PLCXD2
PLCXD2_exp <- expression_plot %>%
filter(gene_id == "PLCXD2")
#View(PLCXD2_exp)
PLCXD2_exp$condition <- factor(PLCXD2_exp$condition, levels = c("fall", "spring"))
boxplot_PLCXD2 <- ggplot(PLCXD2_exp) +
geom_boxplot(aes(x=condition,
y=normalized_counts,
fill=condition)) +
theme_bw() + scale_fill_manual(values = c("darkgreen", "darkorchid")) +
ggtitle("PLCXD2") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
# RTN4RL2
RTN4RL2_exp <- expression_plot %>%
filter(gene_id == "RTN4RL2")
#View(RTN4RL2_exp)
RTN4RL2_exp$condition <- factor(RTN4RL2_exp$condition, levels = c("fall", "spring"))
boxplot_RTN4RL2 <- ggplot(RTN4RL2_exp) +
geom_boxplot(aes(x=condition,
y=normalized_counts,
fill=condition)) +
theme_bw() + scale_fill_manual(values = c("darkgreen", "darkorchid")) +
ggtitle("RTN4RL2") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
# AGXT
AGXT_exp <- expression_plot %>%
filter(gene_id == "AGXT")
#View(AGXT_exp)
AGXT_exp$condition <- factor(AGXT_exp$condition, levels = c("fall", "spring"))
boxplot_AGXT <- ggplot(AGXT_exp) +
geom_boxplot(aes(x=condition,
y=normalized_counts,
fill=condition)) +
theme_bw() + scale_fill_manual(values = c("darkgreen", "darkorchid")) +
ggtitle("AGXT") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
# CTH
CTH_exp <- expression_plot %>%
filter(gene_id == "CTH")
#View(CTH_exp)
CTH_exp$condition <- factor(CTH_exp$condition, levels = c("fall", "spring"))
boxplot_CTH <- ggplot(CTH_exp) +
geom_boxplot(aes(x=condition,
y=normalized_counts,
fill=condition)) +
theme_bw() + scale_fill_manual(values = c("darkgreen", "darkorchid")) +
ggtitle("CTH") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
# NECTIN1
NECTIN1_exp <- expression_plot %>%
filter(gene_id == "NECTIN1")
#View(NECTIN1_exp)
NECTIN1_exp$condition <- factor(NECTIN1_exp$condition, levels = c("fall", "spring"))
boxplot_NECTIN1 <- ggplot(NECTIN1_exp) +
geom_boxplot(aes(x=condition,
y=normalized_counts,
fill=condition)) +
theme_bw() + scale_fill_manual(values = c("darkgreen", "darkorchid")) +
ggtitle("NECTIN1") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
# ATRNL1
ATRNL1_exp <- expression_plot %>%
filter(gene_id == "ATRNL1")
#View(ATRNL1_exp)
ATRNL1_exp$condition <- factor(ATRNL1_exp$condition, levels = c("fall", "spring"))
boxplot_ATRNL1 <- ggplot(ATRNL1_exp) +
geom_boxplot(aes(x=condition,
y=normalized_counts,
fill=condition)) +
theme_bw() + scale_fill_manual(values = c("darkgreen", "darkorchid")) +
ggtitle("ATRNL1") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
# PLOD3
PLOD3_exp <- expression_plot %>%
filter(gene_id == "PLOD3")
#View(PLOD3_exp)
PLOD3_exp$condition <- factor(PLOD3_exp$condition, levels = c("fall", "spring"))
boxplot_PLOD3 <- ggplot(PLOD3_exp) +
geom_boxplot(aes(x=condition,
y=normalized_counts,
fill=condition)) +
theme_bw() + scale_fill_manual(values = c("darkgreen", "darkorchid")) +
ggtitle("PLOD3") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
# NPR2
NPR2_exp <- expression_plot %>%
filter(gene_id == "NPR2")
#View(NPR2_exp)
NPR2_exp$condition <- factor(NPR2_exp$condition, levels = c("fall", "spring"))
boxplot_NPR2 <- ggplot(NPR2_exp) +
geom_boxplot(aes(x=condition,
y=normalized_counts,
fill=condition)) +
theme_bw() + scale_fill_manual(values = c("darkgreen", "darkorchid")) +
ggtitle("NPR2") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
Genes that want to be discussed together we grouped in a boxplot grid.
# Arranging upregulated plots in a figure
boxplot_grid_upreg <- plot_grid(boxplot_CISH,
boxplot_SLCO1C1,
boxplot_NDST3,
boxplot_SERPINC1,
labels = c("A", "B", "C", "D"),
align = 'v',
ncol = 2)
boxplot_grid_upreg
# Arranging downregulated plots in a figure
boxplot_grid_downreg <- plot_grid(boxplot_PLCXD2,
boxplot_RTN4RL2,
labels = c("A", "B"),
align = 'v',
ncol = 2)
boxplot_grid_downreg
# Arranging upregulated pathway gene plots in a figure
boxplot_grid_uppath <- plot_grid(boxplot_AGXT,
boxplot_CTH,
labels = c("A", "B"),
align = 'v',
ncol = 2)
boxplot_grid_uppath
# Arranging downregulated pathway gene plots in a figure
boxplot_grid_downpath <- plot_grid(boxplot_NECTIN1,
boxplot_ATRNL1,
boxplot_PLOD3,
boxplot_NPR2,
labels = c("A", "B", "C", "D"),
align = 'v',
ncol = 2)
boxplot_grid_downpath
Save boxplot grids to pdf
# Save upregulated plots to pdf
ggsave(plot = boxplot_grid_upreg,
filename = "boxplot_figure_upreg.pdf",
device = "pdf",
width = 6,
height = 4,
units = "in",
dpi = 300)
# Save downregulated plots to pdf
ggsave(plot = boxplot_grid_downreg,
filename = "boxplot_figure_downreg.pdf",
device = "pdf",
width = 6,
height = 4,
units = "in",
dpi = 300)
# Save upregulated pathway plots to pdf
ggsave(plot = boxplot_grid_uppath,
filename = "boxplot_figure_uppath.pdf",
device = "pdf",
width = 6,
height = 4,
units = "in",
dpi = 300)
# Save downregulated pathway plots to pdf
ggsave(plot = boxplot_grid_downpath,
filename = "boxplot_figure_downpath.pdf",
device = "pdf",
width = 6,
height = 4,
units = "in",
dpi = 300)
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] hypeR_1.14.0 ggplotify_0.1.0
## [3] limma_3.54.2 enrichplot_1.18.4
## [5] pathview_1.38.0 DOSE_3.24.2
## [7] org.Mm.eg.db_3.16.0 AnnotationDbi_1.60.2
## [9] clusterProfiler_4.6.2 ggupset_0.3.0
## [11] ggnewscale_0.4.8 cowplot_1.1.1
## [13] biomaRt_2.54.1 EnhancedVolcano_1.16.0
## [15] ggrepel_0.9.3 lubridate_1.9.2
## [17] forcats_1.0.0 stringr_1.5.0
## [19] dplyr_1.1.2 purrr_1.0.1
## [21] readr_2.1.4 tidyr_1.3.0
## [23] tibble_3.2.1 tidyverse_2.0.0
## [25] ggplot2_3.4.2 pheatmap_1.0.12
## [27] RColorBrewer_1.1-3 DESeq2_1.38.3
## [29] SummarizedExperiment_1.28.0 Biobase_2.58.0
## [31] MatrixGenerics_1.10.0 matrixStats_0.63.0
## [33] GenomicRanges_1.50.2 GenomeInfoDb_1.34.9
## [35] IRanges_2.32.0 S4Vectors_0.36.2
## [37] BiocGenerics_0.44.0 knitr_1.42
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 tidyselect_1.2.0 RSQLite_2.3.1
## [4] htmlwidgets_1.6.2 grid_4.2.2 BiocParallel_1.32.6
## [7] scatterpie_0.1.9 munsell_0.5.0 ragg_1.2.5
## [10] codetools_0.2-18 withr_2.5.0 colorspace_2.1-0
## [13] GOSemSim_2.24.0 filelock_1.0.2 highr_0.10
## [16] rstudioapi_0.14 labeling_0.4.2 KEGGgraph_1.58.3
## [19] GenomeInfoDbData_1.2.9 polyclip_1.10-4 bit64_4.0.5
## [22] farver_2.1.1 downloader_0.4 vctrs_0.6.2
## [25] treeio_1.22.0 generics_0.1.3 gson_0.1.0
## [28] xfun_0.39 timechange_0.2.0 BiocFileCache_2.6.1
## [31] R6_2.5.1 graphlayouts_0.8.4 locfit_1.5-9.7
## [34] msigdbr_7.5.1 bitops_1.0-7 cachem_1.0.7
## [37] fgsea_1.24.0 gridGraphics_0.5-1 DelayedArray_0.24.0
## [40] promises_1.2.0.1 scales_1.2.1 ggraph_2.1.0
## [43] gtable_0.3.3 tidygraph_1.2.3 rlang_1.1.0
## [46] systemfonts_1.0.4 splines_4.2.2 lazyeval_0.2.2
## [49] yaml_2.3.7 reshape2_1.4.4 httpuv_1.6.9
## [52] qvalue_2.30.0 tools_4.2.2 ellipsis_0.3.2
## [55] kableExtra_1.3.4 jquerylib_0.1.4 Rcpp_1.0.10
## [58] plyr_1.8.8 visNetwork_2.1.2 progress_1.2.2
## [61] zlibbioc_1.44.0 RCurl_1.98-1.12 prettyunits_1.1.1
## [64] viridis_0.6.2 magrittr_2.0.3 data.table_1.14.8
## [67] openxlsx_4.2.5.2 hms_1.1.3 patchwork_1.1.2
## [70] reactable_0.4.4 mime_0.12 evaluate_0.20
## [73] xtable_1.8-4 HDO.db_0.99.1 XML_3.99-0.14
## [76] gridExtra_2.3 compiler_4.2.2 crayon_1.5.2
## [79] shadowtext_0.1.2 htmltools_0.5.5 ggfun_0.0.9
## [82] later_1.3.0 tzdb_0.3.0 geneplotter_1.76.0
## [85] aplot_0.1.10 DBI_1.1.3 tweenr_2.0.2
## [88] dbplyr_2.3.2 MASS_7.3-58.1 rappdirs_0.3.3
## [91] babelgene_22.9 Matrix_1.5-4 cli_3.6.1
## [94] parallel_4.2.2 igraph_1.4.2 pkgconfig_2.0.3
## [97] xml2_1.3.3 ggtree_3.6.2 svglite_2.1.1
## [100] annotate_1.76.0 bslib_0.4.2 webshot_0.5.4
## [103] XVector_0.38.0 rvest_1.0.3 yulab.utils_0.0.6
## [106] digest_0.6.31 graph_1.76.0 Biostrings_2.66.0
## [109] rmarkdown_2.21 fastmatch_1.1-3 tidytree_0.4.2
## [112] curl_5.0.0 shiny_1.7.4 lifecycle_1.0.3
## [115] nlme_3.1-160 jsonlite_1.8.4 viridisLite_0.4.1
## [118] fansi_1.0.4 pillar_1.9.0 lattice_0.20-45
## [121] KEGGREST_1.38.0 fastmap_1.1.1 httr_1.4.5
## [124] GO.db_3.16.0 glue_1.6.2 zip_2.3.0
## [127] png_0.1-8 bit_4.0.5 Rgraphviz_2.42.0
## [130] ggforce_0.4.1 stringi_1.7.12 sass_0.4.5
## [133] blob_1.2.4 textshaping_0.3.6 org.Hs.eg.db_3.16.0
## [136] memoise_2.0.1 ape_5.7-1