Our data comes from an investigation of CD4+ T-cell differentiation. This study examined differences in the differentiation of Tfh and Th1 T-helper cells in the presence and absence of Aiolos. The data set originally had 12 samples composed of 4 treatments with 3 replications per treatment. There were 6 samples grown in Tfh polarizing conditions and 6 in Th1 polarizing conditions. The Th1-Ikzf3-02 sample was removed prior to running the nf-core/rnaseq pipeline due to unresolvable issues with files related to that sample. Subsequent analyses comparing differential expression between wild-type and aiolos deficient cells were performed only on the 6 samples in Tfh-polarizing conditions.
GSE203065
mm10
Read, K. A., Jones, D. M., Pokhrel, S., Hales, E. D. S., Varkey, A., Tuazon, J. A., Eisele, C. D., Abdouni, O., Saadey, A., Leonard, M. R., Warren, R. T., Powell, M. D., Boss, J. M., Hemann, E. A., Yount, J. S., Xin, G., Ghoneim, H. E., Lio, C. J., Freud, A. G., Collins, P. L., … Oestreich, K. J. (2023). Aiolos represses CD4+ T cell cytotoxic programming via reciprocal regulation of TFH transcription factors and IL-2 sensitivity. Nature communications, 14(1), 1652. https://doi.org/10.1038/s41467-023-37420-0
Loading the packages required for subsequent analyses.
# loading required packages
pacman::p_load(DESeq2, dplyr, tidyr, ggplot2, genekitr, pheatmap, RColorBrewer, EnhancedVolcano, clusterProfiler, DOSE, ggnewscale, enrichplot, knitr, cowplot, msigdbr, pathview, patchwork, ggplotify)
The input file for our DESeq analysis was the un-normalised count matrix.
# load count matrix
counts <- read.table("salmon/salmon.merged.gene_counts.tsv", header = T)
# keep gene_id col as names of rows
row.names(counts) <- counts$gene_id
# preview counts matrix with all samples
head(counts)
# create separate counts matrix for Tfh samples only
counts_tfh <- counts |>
dplyr::select(3:8) |>
round()
# preview to make sure as expected
head(counts_tfh)
We now need to create a coldata object with treatment and replicate information in order to make the dds object.
# treatment vector for coldata df
treatment <-
rep(c("WT","KO"), 3) |>
factor() |>
sort()
# replicate vector for coldata df
replicate <- rep(c(1,2,3), 2) |>
factor()
# coldata to create dds object
coldata <- data.frame(treatment, replicate)
# create the dds object
dds <- DESeqDataSetFromMatrix(countData = counts_tfh,
colData = coldata,
design = ~ treatment)
# filtering dds, keep rows with at least 10 reads total
keep <- dds |>
counts() |>
rowSums() >= 10
dds <- dds[keep,]
# re-level the treatment variable to make wild type the reference
dds$treatment <-
dds$treatment |>
relevel(ref = "WT")
# DESeq analysis
dds <- dds |>
DESeq()
# extract results from DESeq
res <- dds |>
results(contrast = c("treatment",
"KO",
"WT"))
res |>
dim() # 20531 x 6
[1] 20531 6
# order results from smallest to largest p-adjusted value
result <- res[res$padj |>
order(), ]
head(result)
log2 fold change (MLE): treatment KO vs WT
Wald test p-value: treatment KO vs WT
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000020143.16 5858.095 -2.72621 0.0677903 -40.2154 0.00000e+00 0.00000e+00
ENSMUSG00000004105.9 8816.018 -1.02557 0.0397834 -25.7788 1.53385e-146 1.08558e-142
ENSMUSG00000049109.16 2812.993 1.84731 0.0754677 24.4782 2.52451e-132 1.19115e-128
ENSMUSG00000032053.9 4909.934 -1.22529 0.0505739 -24.2277 1.13541e-129 4.01792e-126
ENSMUSG00000031765.9 1539.002 2.03377 0.0917571 22.1648 7.51775e-109 2.12828e-105
ENSMUSG00000023132.9 348.358 5.10500 0.2385279 21.4021 1.27692e-101 3.01247e-98
# number significant/non-significant padj, alpha = .05
(result$padj < .05) |>
table()
FALSE TRUE
11096 3059
# merge the result object and counts data frame
resdata <- merge(as.data.frame(result),
as.data.frame(counts(dds, normalized = T)),
by = "row.names",
sort = F)
names(resdata)[1] <- "Gene_id"
head(resdata)
# write resdata to file called unfiltered results
resdata |>
write.csv(file = "Tfh_KOvsWT_unfiltered_matrix.csv")
# get up/down DEG lists
resdata <- as.data.frame(result) |>
dplyr::filter(abs(log2FoldChange) > 1 & padj < 0.05) |>
tibble::rownames_to_column("Gene_id")
head(resdata)
# save to file DEG list file
write.csv(resdata, "Tfh_KOvsWT_log2fc1_fdr05_DEGlist.csv", row.names = FALSE)
# variance stabilizing transformation on counts for PCA
vst_counts <- dds |>
vst(blind = F)
# pca data for plot
vst_counts |>
plotPCA(intgroup = "treatment",
returnData = T) -> pca_dat
# calc proportion of variance each PC accounts for
pc_var_percent <- pca_dat |>
attr("percentVar") |>
round(digits = 2) * 100
# customize plot
pca_dat |>
ggplot(mapping = aes(PC1, PC2, color = treatment)) +
geom_point(size = 2) +
labs(title = "PCA Plot for Aiolos-deficient and Naive WT CD4+ T cells in Tfh-polarizing Conditions",
x = paste("PC1: ",pc_var_percent[1],"% variance"),
y = paste("PC2: ",pc_var_percent[2],"% variance")) +
ylim(-8,8) +
scale_x_continuous(breaks = seq(-8, 8, 2)) +
scale_color_discrete(labels = c("Aiolos-deficient", "WT")) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, size = 10))
Figure 1. Principal component analysis for Aiolos-deficient and Naive wild-type samples in Tfh polarized conditions. A majority of the variance, 79%, in the samples was accounted for by the first principal component. The samples are well separated in variance by treatment. A small amount of variance separates the three samples within each treatment.
# correlation heatmap
dds |>
rlog(blind = F) |> # transform dds data to log2 scale
assay() |>
cor() |>
pheatmap(clustering_distance_rows = "euclidean",
fontsize = 8,
cellwidth = 40,
show_rownames = F,
color = colorRampPalette(brewer.pal(n=6, "PuBuGn"))(400))
Figure 2: Correlation heatmap for Tfh polarized samples, demonstrating the strength of correlations between each pair of samples. The correlations between the wild type samples are very strong and the correlations between the Aiolos-deficient samples are very strong, while the correlation between the wild type and Aiolos-deficient samples are weaker.
# prepping data for heatmap visualization
# read in unfiltered matrix that was created earlier
unfilter_matrix <- read.csv("Tfh_KOvsWT_unfiltered_matrix.csv", header = T)
head(unfilter_matrix)
str(unfilter_matrix)
'data.frame': 20531 obs. of 14 variables:
$ X : int 1 2 3 4 5 6 7 8 9 10 ...
$ Gene_id : chr "ENSMUSG00000020143.16" "ENSMUSG00000004105.9" "ENSMUSG00000049109.16" "ENSMUSG00000032053.9" ...
$ baseMean : num 5858 8816 2813 4910 1539 ...
$ log2FoldChange : num -2.73 -1.03 1.85 -1.23 2.03 ...
$ lfcSE : num 0.0678 0.0398 0.0755 0.0506 0.0918 ...
$ stat : num -40.2 -25.8 24.5 -24.2 22.2 ...
$ pvalue : num 0.00 1.53e-146 2.52e-132 1.14e-129 7.52e-109 ...
$ padj : num 0.00 1.09e-142 1.19e-128 4.02e-126 2.13e-105 ...
$ Tfh_._Ikzf3_KO1: num 1631 5933 4407 2877 2487 ...
$ Tfh_._Ikzf3_KO2: num 1408 5723 4615 2972 2466 ...
$ Tfh_._Ikzf3_KO3: num 1577 5765 4181 2979 2467 ...
$ Tfh_._WT1 : num 9448 11813 1367 6594 653 ...
$ Tfh_._WT2 : num 10296 11803 1151 6939 510 ...
$ Tfh_._WT3 : num 10789 11857 1156 7099 650 ...
# add column of determining if gene is diff expressed
threshold_padj <- unfilter_matrix$padj < 0.05
# number of diff expressed genes = 3059
threshold_padj |>
which() |>
length()
[1] 3059
# add col determining diff expression to unfilter_matrix
unfilter_matrix$threshold <- threshold_padj
str(unfilter_matrix)
'data.frame': 20531 obs. of 15 variables:
$ X : int 1 2 3 4 5 6 7 8 9 10 ...
$ Gene_id : chr "ENSMUSG00000020143.16" "ENSMUSG00000004105.9" "ENSMUSG00000049109.16" "ENSMUSG00000032053.9" ...
$ baseMean : num 5858 8816 2813 4910 1539 ...
$ log2FoldChange : num -2.73 -1.03 1.85 -1.23 2.03 ...
$ lfcSE : num 0.0678 0.0398 0.0755 0.0506 0.0918 ...
$ stat : num -40.2 -25.8 24.5 -24.2 22.2 ...
$ pvalue : num 0.00 1.53e-146 2.52e-132 1.14e-129 7.52e-109 ...
$ padj : num 0.00 1.09e-142 1.19e-128 4.02e-126 2.13e-105 ...
$ Tfh_._Ikzf3_KO1: num 1631 5933 4407 2877 2487 ...
$ Tfh_._Ikzf3_KO2: num 1408 5723 4615 2972 2466 ...
$ Tfh_._Ikzf3_KO3: num 1577 5765 4181 2979 2467 ...
$ Tfh_._WT1 : num 9448 11813 1367 6594 653 ...
$ Tfh_._WT2 : num 10296 11803 1151 6939 510 ...
$ Tfh_._WT3 : num 10789 11857 1156 7099 650 ...
$ threshold : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
# add col with gene symbol
unfilter_matrix$Gene_id <- sub("\\.\\d+", "", unfilter_matrix$Gene_id)
ids <- unfilter_matrix$Gene_id
gene_symbol_map <- ids |>
transId(transTo = "symbol",
org = "mouse")
names(unfilter_matrix)[2] = "input_id"
resdata_gene <- merge(unfilter_matrix, gene_symbol_map, by.x = "input_id", by.y = "input_id")
str(resdata_gene)
'data.frame': 18970 obs. of 16 variables:
$ input_id : chr "ENSMUSG00000000001" "ENSMUSG00000000028" "ENSMUSG00000000031" "ENSMUSG00000000037" ...
$ X : int 12096 2028 9766 4222 1756 4845 362 3799 3337 10583 ...
$ baseMean : num 6756.6 2666 91.5 22.8 1369.9 ...
$ log2FoldChange : num 0.0141 0.1829 0.6925 -0.9898 -0.2512 ...
$ lfcSE : num 0.0469 0.0544 1.0746 0.5032 0.0687 ...
$ stat : num 0.3 3.36 0.644 -1.967 -3.657 ...
$ pvalue : num 0.763828 0.00078 0.519269 0.049161 0.000255 ...
$ padj : num 0.89377 0.00544 0.75256 0.16478 0.00206 ...
$ Tfh_._Ikzf3_KO1: num 6977.6 2893 52.6 11.5 1280.5 ...
$ Tfh_._Ikzf3_KO2: num 6863.9 2850.2 200.5 12.5 1219.9 ...
$ Tfh_._Ikzf3_KO3: num 6523 2759.9 86.2 22.2 1252.6 ...
$ Tfh_._WT1 : num 6822.1 2457 59.9 27.4 1466.9 ...
$ Tfh_._WT2 : num 6529.5 2531.2 8.7 33.1 1507.1 ...
$ Tfh_._WT3 : num 6823.4 2504.8 141.4 29.9 1492.8 ...
$ threshold : logi FALSE TRUE FALSE FALSE TRUE FALSE ...
$ symbol : chr "Gnai3" "Cdc45" "H19" "Scml2" ...
head(resdata_gene)
# add a column to specify up/down-reg
resdata_gene$diffexpressed <- "Unchanged"
# if log2FC > 1 and padj < 0.05 set as upregulated
resdata_gene$diffexpressed[resdata_gene$padj < 0.05 & resdata_gene$log2FoldChange > 1] <- "Upregulated"
# if log2FC > -1 and padj < 0.05 set as downregulated
resdata_gene$diffexpressed[resdata_gene$padj < 0.05 & resdata_gene$log2FoldChange > -1] <- "Downregulated"
resdata_sig <- as.data.frame(resdata_gene) |>
dplyr::filter(abs(log2FoldChange) > 1 & padj < 0.05)
head(resdata_sig)
# save DEGS to file
write.csv(resdata_sig, file="Tfh_KO_vs_WT_DEG.csv")
# load the DEG.csv file
all_matrix <- read.csv("Tfh_KO_vs_WT_DEG.csv", header = T) |>
data.frame()
head(all_matrix)
heatmap_matrix <- all_matrix |>
select(symbol, Tfh_._Ikzf3_KO1:Tfh_._WT3) |>
data.frame()
# remove non-unique rows based on symbol so that I can use symbol var as name of rows, I had to remove the duplicated rows (only 2)
heatmap_matrix <- heatmap_matrix |> distinct(symbol, .keep_all = T)
head(heatmap_matrix)
# keep symbol col as names of rows
row.names(heatmap_matrix) <- heatmap_matrix$symbol # having issues bc I had duplicate row names
heatmap_matrix <- heatmap_matrix |>
select(2:7)
head(heatmap_matrix)
names(heatmap_matrix)
[1] "Tfh_._Ikzf3_KO1" "Tfh_._Ikzf3_KO2" "Tfh_._Ikzf3_KO3" "Tfh_._WT1" "Tfh_._WT2" "Tfh_._WT3"
# add col to coldata with sample names that match col names of count data
coldata$sample_id <- c("Tfh_Ikzf3.KO1", "Tfh_Ikzf3.KO2", "Tfh_Ikzf3.KO3", "Tfh_WT1", "Tfh_WT2", "Tfh_WT3")
rownames(coldata) <- coldata$sample_id
heatmap_meta <- coldata |>
select(treatment)
# re-level factors
heatmap_meta$genotype <- factor(heatmap_meta$treatment,
levels = c("WT", "KO"))
heatmap_meta <- heatmap_meta |>
select(2)
# colors for heatmap
heatmap_colors <- brewer.pal(10, "RdBu") |>
rev()
# rename meta genotypes to be more descriptive
heatmap_meta_rename <- heatmap_meta
# add col with correct labels
heatmap_meta_rename$Genotype <- ifelse(heatmap_meta_rename$genotype == "KO", "Aiolos_deficient", "WT")
heatmap_meta_rename <- heatmap_meta_rename |>
select("Genotype")
# colors for annotations
ann_colors <- list(Genotype = c(Aiolos_deficient = "#B0F6F9", WT = "#A66EF4"))
# make plot
heatmap_matrix |>
pheatmap(color = heatmap_colors,
annotation_col = heatmap_meta_rename,
annotation_colors = ann_colors,
show_rownames = F,
show_colnames = F,
scale = "row",
annotation_names_col = F,
cellwidth = 20,
cellheight = 0.4,
treeheight_row = 10,
treeheight_col = 20) |>
as.ggplot() -> heatmap
#heatmap
# # save plot as image
# ggsave(filename = "heatmap_figure.png",
# plot = heatmap,
# units = "in",
# width = 5,
# height = 6,
# dpi = 300)
Figure 3. Heatmap demonstrating significant differentially expressed genes in wild-type vs. Aiolos-deficient mice.
# load count matrix
counts <- read.table("salmon/salmon.merged.gene_counts.tsv", header = T)
# keep gene_id col as names of rows
row.names(counts) <- counts$gene_id
head(counts)
# create separate matrix for Tfh counts
counts_tfh <- counts |>
dplyr::select(3:8) |>
round()
# preview to make sure as expected
head(counts_tfh)
treatment <-
rep(c("WT","KO"), 3) |>
factor() |>
sort()
# replicate vector for coldata df
replicate <- rep(c(1,2,3), 2) |>
factor()
# coldata to create dds object
coldata <- data.frame(treatment, replicate)
coldata
# create dds object
dds <- DESeqDataSetFromMatrix(countData = counts_tfh,
colData = coldata,
design = ~ treatment)
dds
class: DESeqDataSet
dim: 55941 6
metadata(1): version
assays(1): counts
rownames(55941): ENSMUSG00000000001.5 ENSMUSG00000000003.16 ...
ENSMUSG00002076991.1 ENSMUSG00002076992.1
rowData names(0):
colnames(6): Tfh_._Ikzf3_KO1 Tfh_._Ikzf3_KO2 ... Tfh_._WT2 Tfh_._WT3
colData names(2): treatment replicate
# filtering dds
# keep rows with at least 10 reads total
keep <- dds |>
counts() |>
rowSums() >= 10
dds <- dds[keep,]
dds
class: DESeqDataSet
dim: 20531 6
metadata(1): version
assays(1): counts
rownames(20531): ENSMUSG00000000001.5 ENSMUSG00000000028.16 ...
ENSMUSG00002076304.1 ENSMUSG00002076896.1
rowData names(0):
colnames(6): Tfh_._Ikzf3_KO1 Tfh_._Ikzf3_KO2 ... Tfh_._WT2 Tfh_._WT3
colData names(2): treatment replicate
# re-level the treatment variable
dds$treatment <-
dds$treatment |>
relevel(ref = "WT")
dds <- dds |>
DESeq()
# extract results from DESeq
res <- dds |>
results(contrast = c("treatment",
"KO",
"WT"))
res |>
head()
log2 fold change (MLE): treatment KO vs WT
Wald test p-value: treatment KO vs WT
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000000001.5 6756.5893 0.0141043 0.0469427 0.300458 0.763828202 0.89377434
ENSMUSG00000000028.16 2666.0388 0.1829078 0.0544402 3.359795 0.000780003 0.00544157
ENSMUSG00000000031.18 91.5497 0.6925470 1.0745956 0.644472 0.519269299 0.75256035
ENSMUSG00000000037.18 22.7692 -0.9898316 0.5031701 -1.967191 0.049161194 0.16478255
ENSMUSG00000000056.8 1369.9485 -0.2512007 0.0686897 -3.657038 0.000255147 0.00205672
ENSMUSG00000000058.7 31.5859 -0.8224517 0.4740733 -1.734862 0.082765275 0.24175453
res |>
dim() # 20531 x 6
[1] 20531 6
# order results from smallest to largest p-adjusted value
result <- res[res$padj |>
order(), ]
head(result)
log2 fold change (MLE): treatment KO vs WT
Wald test p-value: treatment KO vs WT
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000020143.16 5858.095 -2.72621 0.0677903 -40.2154 0.00000e+00 0.00000e+00
ENSMUSG00000004105.9 8816.018 -1.02557 0.0397834 -25.7788 1.53385e-146 1.08558e-142
ENSMUSG00000049109.16 2812.993 1.84731 0.0754677 24.4782 2.52451e-132 1.19115e-128
ENSMUSG00000032053.9 4909.934 -1.22529 0.0505739 -24.2277 1.13541e-129 4.01792e-126
ENSMUSG00000031765.9 1539.002 2.03377 0.0917571 22.1648 7.51775e-109 2.12828e-105
ENSMUSG00000023132.9 348.358 5.10500 0.2385279 21.4021 1.27692e-101 3.01247e-98
# merge the result object and counts data frame
resdata <- merge(as.data.frame(result),
as.data.frame(counts(dds, normalized = T)),
by = "row.names",
sort = F)
names(resdata)[1] <- "Gene_id"
head(resdata)
# write resdata to file called unfiltered results
resdata |>
write.csv(file = "Tfh_KOvsWT_unfiltered_matrix.csv")
threshold_padj <- resdata$padj < 0.05
length(which(threshold_padj))
[1] 3059
resdata$threshold <- threshold_padj ## Add vector as a column (threshold) to the resdata
str(resdata)
'data.frame': 20531 obs. of 14 variables:
$ Gene_id : 'AsIs' chr "ENSMUSG00000020143.16" "ENSMUSG00000004105.9" "ENSMUSG00000049109.16" "ENSMUSG00000032053.9" ...
$ baseMean : num 5858 8816 2813 4910 1539 ...
$ log2FoldChange : num -2.73 -1.03 1.85 -1.23 2.03 ...
$ lfcSE : num 0.0678 0.0398 0.0755 0.0506 0.0918 ...
$ stat : num -40.2 -25.8 24.5 -24.2 22.2 ...
$ pvalue : num 0.00 1.53e-146 2.52e-132 1.14e-129 7.52e-109 ...
$ padj : num 0.00 1.09e-142 1.19e-128 4.02e-126 2.13e-105 ...
$ Tfh_._Ikzf3_KO1: num 1631 5933 4407 2877 2487 ...
$ Tfh_._Ikzf3_KO2: num 1408 5723 4615 2972 2466 ...
$ Tfh_._Ikzf3_KO3: num 1577 5765 4181 2979 2467 ...
$ Tfh_._WT1 : num 9448 11813 1367 6594 653 ...
$ Tfh_._WT2 : num 10296 11803 1151 6939 510 ...
$ Tfh_._WT3 : num 10789 11857 1156 7099 650 ...
$ threshold : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
resdata$Gene_id <- sub("\\.\\d+", "", resdata$Gene_id)
ids <- resdata$Gene_id
gene_symbol_map <- transId(ids, transTo = "symbol", org = "mouse")
names(resdata)[1] = "input_id"
resdata_gene <- merge(resdata, gene_symbol_map, by.x = "input_id", by.y = "input_id")
head(resdata_gene)
#Add a column to dataframe to specify up or down regulated
resdata_gene$diffexpressed <- "Unchanged"
#if log2FC > 1 and padj < 0.05 set as upregulated
resdata_gene$diffexpressed[resdata_gene$padj < 0.05 & resdata_gene$log2FoldChange > 1] <- "Upregulated"
#if log2FC > -1 and padj < 0.05 set as downregulated
resdata_gene$diffexpressed[resdata_gene$padj < 0.05 & resdata_gene$log2FoldChange < -1] <- "Downregulated"
# MAke data det to use for ggplots later
resdata_sig <- as.data.frame(resdata_gene) %>% dplyr::filter(abs(log2FoldChange) > 1 & padj < 0.05)
head(resdata_sig)
write.csv(resdata_sig, file="Tfh_ko_versus_wt_DEG.csv")
keyvals <- ifelse(
resdata_gene$log2FoldChange < -1, 'blue',
ifelse(resdata_gene$log2FoldChange > 1, 'red',
'grey'))
keyvals[is.na(keyvals)] <- 'black'
names(keyvals)[keyvals == 'red'] <- '> 2-fold up'
names(keyvals)[keyvals == 'grey'] <- 'n.s.'
names(keyvals)[keyvals == 'blue'] <- '> 2-fold down'
EnhancedVolcano(resdata_gene,
lab = as.character(resdata_gene$symbol),
x = 'log2FoldChange',
y = 'padj',
selectLab = c('Tbc1d4', 'Tox', 'Tox2', 'Spp1', 'Pou2af1', 'Il2ra', 'Pdcd1', 'Il2rb', "Bcl6", "Il6st", "Prdm1", "Cd40lg", "Zfp831"),
#selectLab = rownames(resdata_gene)[which(names(keyvals) %in%
# c('Upregulated','Downregulated'))],
xlab = bquote(~Log[2]~ 'fold change'),
colCustom = keyvals,
title = "Ikzf3+/- vs. WT",
subtitle = "Differential expression",
FCcutoff = 1,
pCutoff = 0.05,
labSize = 5,
axisLabSize = 12,
legendLabels=c('Not sig.','Log2FC','padj', 'padj & Log2FC'),
legendPosition = 'left',
legendLabSize = 8,
legendIconSize = 4.0,
gridlines.major = FALSE,
gridlines.minor = FALSE) +
scale_x_continuous(limits = c(-8, 8)) +
coord_cartesian(ylim=c(0, 150), clip = "off") +
annotate("text", x = 6, y = 75, label = "270", colour = "red", size = 5, hjust = 0, fontface = "bold") +
annotate("text", x = -6, y = 75, label = "181", colour = "blue", size = 5, hjust = 1, fontface = "bold") +
annotate("text", x = 6, y = 65, label = "up", colour = "black", size = 5, hjust = 0, fontface = "bold") +
annotate("text", x = -6, y = 65, label = "down", colour = "black", size = 5, hjust = 1, fontface = "bold")
Figure 4. Volcano plot of up and down regulated genes comparing the wild-type and aiolos-deficient samples. There were a total of 270 significantly up-regulated genes and 181 significantly down-regulated genes in the Aiolos-deficient samples compared to the wild-type samples. Significant differences in expression were determined on a greater than 2 log-fold change and a p-value less than 0.05.
# we want the log2 fold change
original_gene_list <- resdata$log2FoldChange
head(original_gene_list)
[1] -2.726210 -1.025566 1.847310 -1.225292 2.033774 5.105001
# name the vector
names(original_gene_list) <- resdata$input_id
head(original_gene_list)
ENSMUSG00000020143 ENSMUSG00000004105 ENSMUSG00000049109 ENSMUSG00000032053 ENSMUSG00000031765 ENSMUSG00000023132
-2.726210 -1.025566 1.847310 -1.225292 2.033774 5.105001
# omit any NA values
background_gene_list<-na.omit(original_gene_list)
# sort the list in decreasing order (required for clusterProfiler)
background_gene_list = sort(background_gene_list, decreasing = TRUE)
# Extract significant results (padj < 0.05) using subset()
sig_genes_df = subset(resdata, padj < 0.05)
# From significant results, we want to filter on log2fold change
sig_genes <- sig_genes_df$log2FoldChange
# Name the vector
names(sig_genes) <- sig_genes_df$input_id
# omit NA values
sig_genes <- na.omit(sig_genes)
# filter on min log2fold change (log2FoldChange > 1)
sig_genes <- names(sig_genes)[abs(sig_genes) > 1]
### Uncomment to run since takes so long to run
# go_enrich <- enrichGO(gene = sig_genes,
# universe = background_gene_list, # not sure if i need to do names(background_gene_list)
# OrgDb = "org.Mm.eg.db",
# keyType = 'ENSEMBL',
# readable = T,
# ont = "BP",
# pvalueCutoff = 0.05,
# qvalueCutoff = 0.05)
#
# saveRDS(go_enrich, file = "enrichGO_cd8.rds")
go_enrich <- readRDS("enrichGO_cd8.rds")
## Output results from GO analysis to a table
cluster_summary <- data.frame(go_enrich)
write.csv(cluster_summary, "clusterProfiler_TCF7ko.csv")
head(cluster_summary)
barplot <- barplot(go_enrich,
showCategory = 10,
title = "GO Biological Pathways",
font.size = 8)
barplot
Figure 5:The bar plot highlights the GO Enrich database by counts, as well as organizing by p-values. The positive regulation of leukocyte activation has both a high count and is highly statistically significant to the rest.
dot <- dotplot(go_enrich,
showCategory = 10,
font.size = 8)
dot
Figure 6: The dot plot uses the GO Enrich database to show the top 10 pathways in Aiolos that are the most enriched by gene ratio. Positive regulation of cell activation is the most enriched pathway by gene ratio, a significant topic disussed by authors.
plot_color <- cnetplot(go_enrich,
showCategory = 5,
categorySize="pvalue",
foldChange=background_gene_list,
vertex.label.font=6,
cex_label_gene = 0.5,
cex_label_category= 0.8) +
scale_color_gradient2(name='fold change', low='darkgreen', high='firebrick')
plot_color
Figure 7: The figure highlights significant genes and biological concepts from GO Enrich as the network. The 5 main categories were: interferon-gamma production, T cell differentiation, lymphocyte activation involved in immune response, cell activation involved in immune response, and leukocyte activation involved in immune response. Aiolos has a known relationship to all of these pathways that were found, significant genes liked include Pou2af1, Bcl6, and Tox; which were discussed at length by the authors.
# Convert gene IDs for enrichKEGG function
# We will lose some genes here because not all IDs will be converted
ids<-bitr(names(original_gene_list),
fromType = "ENSEMBL",
toType = "ENTREZID",
OrgDb="org.Mm.eg.db")
# remove duplicate IDS (here I use "ENSEMBL", but it should be whatever was selected as keyType)
dedup_ids = ids[!duplicated(ids[c("ENSEMBL")]),]
# Create a new dataframe df2 which has only the genes which were successfully mapped using the bitr function above
df2 = resdata[resdata$input_id %in% dedup_ids$ENSEMBL,]
# Create a new column in df2 with the corresponding ENTREZ IDs
df2 <- merge(resdata, dedup_ids, by.x = "input_id", by.y = "ENSEMBL")
str(df2)
'data.frame': 17044 obs. of 15 variables:
$ input_id : 'AsIs' chr "ENSMUSG00000000001" "ENSMUSG00000000028" "ENSMUSG00000000031" "ENSMUSG00000000037" ...
$ baseMean : num 6756.6 2666 91.5 22.8 1369.9 ...
$ log2FoldChange : num 0.0141 0.1829 0.6925 -0.9898 -0.2512 ...
$ lfcSE : num 0.0469 0.0544 1.0746 0.5032 0.0687 ...
$ stat : num 0.3 3.36 0.644 -1.967 -3.657 ...
$ pvalue : num 0.763828 0.00078 0.519269 0.049161 0.000255 ...
$ padj : num 0.89377 0.00544 0.75256 0.16478 0.00206 ...
$ Tfh_._Ikzf3_KO1: num 6977.6 2893 52.6 11.5 1280.5 ...
$ Tfh_._Ikzf3_KO2: num 6863.9 2850.2 200.5 12.5 1219.9 ...
$ Tfh_._Ikzf3_KO3: num 6523 2759.9 86.2 22.2 1252.6 ...
$ Tfh_._WT1 : num 6822.1 2457 59.9 27.4 1466.9 ...
$ Tfh_._WT2 : num 6529.5 2531.2 8.7 33.1 1507.1 ...
$ Tfh_._WT3 : num 6823.4 2504.8 141.4 29.9 1492.8 ...
$ threshold : logi FALSE TRUE FALSE FALSE TRUE FALSE ...
$ ENTREZID : chr "14679" "12544" "14955" "107815" ...
# Create a vector of the gene universe
kegg_gene_list <- df2$log2FoldChange
# Name vector with ENTREZ ids
names(kegg_gene_list) <- df2$ENTREZID
head(kegg_gene_list)
14679 12544 14955 107815 67608 12390
0.0141043 0.1829078 0.6925470 -0.9898316 -0.2512007 -0.8224517
# omit any NA values
kegg_gene_list<-na.omit(kegg_gene_list)
# sort the list in decreasing order (required for clusterProfiler)
kegg_gene_list = sort(kegg_gene_list, decreasing = TRUE)
## GSEA using gene sets from KEGG pathways
gseaKEGG <- gseKEGG(geneList = kegg_gene_list, # ordered named vector of fold changes
organism = "mmu", # supported organisms listed below
nPerm = 1000, # default number permutations
minGSSize = 20, # minimum gene set size (# genes in set)
pvalueCutoff = 0.05, # padj cutoff value
verbose = FALSE)
### Uncomment when running since take a long time
#saveRDS(gseaKEGG, file = "gseaKEGG.RDS")
#gseaKEGG <- readRDS("gseaKEGG.RDS")
## Extract the GSEA results
gseaKEGG_results <- gseaKEGG@result
## Write GSEA results to file
write.csv(gseaKEGG_results, "gsea_kegg_results.csv", quote=F)
# View(gseaKEGG_results)
enrich_gsea <- gseaplot2(gseaKEGG,
title = gseaKEGG_results$Description[1],
geneSetID = 1)
enrich_gsea
Figure 8. The JAK-STAT pathway had an enrichment score of 0.6. This plot demonstrates that genes associated with this pathway were up-regulated in the Aiolos-deficient samples compared to the wild-type samples.
gseaKEGG_top5 <- gseaplot2(gseaKEGG, geneSetID = 1:5)
gseaKEGG_top5
Figure 9: This plot highlights the top 5 most upregulated pathways in the Aiolos-deficient samples. The JAK-STAT pathway enrichment has a score of 0.6, as opposed to the other top 5 GSEA Kegg pathways which had a score of approximately 0.7. The other significant pathways are allograft rejection, autoimmune thyroid disorder, graft-versus host disease, and type 1 diabetes melltus.
all_gene_sets <- msigdbr(species = "Mus musculus")
saveRDS(all_gene_sets, file = "all_gene_sets.RDS")
head(all_gene_sets)
msigdbr_collections()
GOBP_gene_sets <- msigdbr(species = "mouse", category = "C5",
subcategory = "GO:BP") %>%
dplyr::select(gs_name, entrez_gene)
head(GOBP_gene_sets)
saveRDS(GOBP_gene_sets, "GOBP_gene_sets.RDS")
GOBP_gene_sets <- readRDS("GOBP_gene_sets.RDS")
# Run GSEA
### Uncomment to run, will tkae a long time
#msig_GOBP_GSEA <- GSEA(kegg_gene_list, TERM2GENE = GOBP_gene_sets, verbose = FALSE)
#saveRDS(msig_GOBP_GSEA, "msig_GOBP_GSEA.RDS")
msig_GOBP_GSEA <- readRDS("msig_GOBP_GSEA.RDS")
## Extract the GSEA results
gseaGOBP_results <- msig_GOBP_GSEA@result
## Write GSEA results to file
write.csv(gseaGOBP_results, "gseaGOBP_results.csv", quote=F)
View(gseaGOBP_results)
## Plot the GSEA plot for a single enriched pathway
gseaplot2(msig_GOBP_GSEA, title = 'Leukocyte Mediated Cytotoxicity', geneSetID = 'GOBP_REGULATION_OF_LEUKOCYTE_MEDIATED_CYTOTOXICITY')
Figure 10. The leukocyte mediated cytotoxicity pathway had an enrichment score of about 0.7. This plot demonstrates that this pathway was upregulated in the Aiolos-deficient samples as the peak of the enrichment score is to the left side of the x-axis. This pathway is also down-regulated in the wild-type samples as the enrichment score on the right side of the graph is very low.
gseaplot2(msig_GOBP_GSEA, title = 'Interferon Gamma Response', geneSetID = 'GOBP_RESPONSE_TO_INTERFERON_GAMMA')
Figure 11. Interferon gamma response pathway had an enrichment score just below 0.6. This plot demonstrates that this pathway was upregulated in the Aiolos-deficient samples as the peak of the enrichment score is to the right side of the x-axis.
name = c("Tfh_._Ikzf3_KO1", "Tfh_._Ikzf3_KO2", "Tfh_._Ikzf3_KO3", "Tfh_._WT1", "Tfh_._WT2", "Tfh_._WT3")
meta <- data.frame(name, coldata)
order_matrix <- arrange(resdata_sig, padj)
#order_matrix
head(resdata_sig)
symbol_deg <- order_matrix %>% select(symbol, 8:13)
head(symbol_deg)
top_20_deg <- slice(symbol_deg, 1:20)
head(top_20_deg)
expression_plot <- pivot_longer(top_20_deg,
cols = 2:7,
values_to = "normalized_counts")
expression_plot3 <- pivot_longer(symbol_deg,
cols = 2:7,
values_to = "normalized_counts")
# View(expression_plot)
# Join metadata for visualizing groups or features
expression_plot2 <- left_join(x = expression_plot,
y = meta,
by = "name")
expression_plot4 <- left_join(x = expression_plot3,
y = meta,
by = "name")
Tox_exp <- expression_plot4 %>%
filter(symbol == "Tox")
Tox_exp$treatment <- factor(Tox_exp$treatment, levels = c("WT", "KO"))
boxplot_Tox <- ggplot(Tox_exp) +
geom_boxplot(aes(x=treatment,
y=normalized_counts,
fill=treatment)) +
theme_bw() + scale_fill_manual(values = c("blue", "red")) +
ggtitle("Tox") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
Prf1_exp <- expression_plot2 %>%
filter(symbol == "Prf1")
Prf1_exp$treatment <- factor(Prf1_exp$treatment, levels = c("WT", "KO"))
boxplot_Prf1 <- ggplot(Prf1_exp) +
geom_boxplot(aes(x=treatment,
y=normalized_counts,
fill=treatment)) +
theme_bw() + scale_fill_manual(values = c("blue", "red")) +
ggtitle("Prf1") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
Pou2af1_exp <- expression_plot2 %>%
filter(symbol == "Pou2af1")
Pou2af1_exp$treatment <- factor(Pou2af1_exp$treatment, levels = c("WT", "KO"))
boxplot_Pou2af1 <- ggplot(Pou2af1_exp) +
geom_boxplot(aes(x=treatment,
y=normalized_counts,
fill=treatment)) +
theme_bw() + scale_fill_manual(values = c("blue", "red")) +
ggtitle("Pou2af1") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
Nkg7_exp <- expression_plot2 %>%
filter(symbol == "Nkg7")
Nkg7_exp$treatment <- factor(Nkg7_exp$treatment, levels = c("WT", "KO"))
boxplot_Nkg7 <- ggplot(Nkg7_exp) +
geom_boxplot(aes(x=treatment,
y=normalized_counts,
fill=treatment)) +
theme_bw() + scale_fill_manual(values = c("blue", "red")) +
ggtitle("Nkg7") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
Dock2_exp <- expression_plot2 %>%
filter(symbol == "Dock2")
Dock2_exp$treatment <- factor(Dock2_exp$treatment, levels = c("WT", "KO"))
boxplot_Dock2 <- ggplot(Dock2_exp) +
geom_boxplot(aes(x=treatment,
y=normalized_counts,
fill=treatment)) +
theme_bw() + scale_fill_manual(values = c("blue", "red")) +
ggtitle("Dock2") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
Il2ra_exp <- expression_plot4 %>%
filter(symbol == "Il2ra")
Il2ra_exp$treatment <- factor(Il2ra_exp$treatment, levels = c("WT", "KO"))
boxplot_Il2ra <- ggplot(Il2ra_exp) +
geom_boxplot(aes(x=treatment,
y=normalized_counts,
fill=treatment)) +
theme_bw() + scale_fill_manual(values = c("blue", "red")) +
ggtitle("Il2ra") +
xlab("Condition") +
ylab("Normalized Counts") +
theme(legend.position = "right",
panel.grid = element_blank(),
plot.title = element_text(size = rel(1.25), hjust = 0.5), #title
axis.title = element_text(size = rel(0.75)),
axis.text = element_text(size = rel(0.75))) #axis
boxplot_grid <- plot_grid(boxplot_Tox,
boxplot_Dock2,
boxplot_Pou2af1,
boxplot_Nkg7,
boxplot_Prf1,
boxplot_Il2ra,
ncol = 3)
boxplot_grid
Figure 12: The boxplots highlight the normalized counts of significant genes in the data set (Prf1, Pou2af1, Nkg7, and Dock2), as well as genes discussed heavily in the paper (Tox and Il2ra). Nkg7, Prf1, and Il2ra are up regulated. Tox, Dox2, and Pou2af are down regulated. Il2ra has high normalized counts for both WT and KO as compared to the other genes.
sessionInfo()