# Log in to HPC.
ssh -i~/.ssh/"key name" k********@hpc.create.kcl.ac.uk
cd /scratch_tmp/grp/msc_appbio
mkdir group1
cd group1
mkdir raw_reads
mkdir ref_gen
mkdir outputs
To download the sequences from the SRA, the SRA Toolkit was used. fasterq-dump is part of the SRA package and it was used to fetch the SRA files and convert them to FASTQ. The –gzip function is not supported in fasterq-dump and thus, we run it after downloading and converting had finished. –split-files was used to split the forward from the reverse reads as our data were paired-end.
# Go to the right directory.
cd /scratch_tmp/grp/msc_appbio/group1/raw_reads
# Searching for available modules corresponding to SRA Tools
module spider SRA Tools
# Loading module
module load sra-tools/3.0.3-gcc-13.2.0
fasterq-dump --split-files SRR11318268, SRR11318269, SRR11318270, SRR11318271 -O .
# After downloading and converting the SRA files to FASTQ, compress all the files in the directory.
gzip *
The results from downloading with –split-files gives 2 files per SRA, as mentioned before, one forward and one reverse. The suffix of the split files in the raw_reads/ directory is one with _1.fastq.gz and another with _2.fastq.gz.
Creating conda environment to install and activate packages. Best practice for reproducibility and dependency management in computational workflows, especially if you work locally or the HPC does not provide a specific tool.
# Searching for conda in the HPC
module spider conda
# Loading conda
module load anaconda3/2021.05-gcc-13.2.0
# Creating conda environment
create conda -—name myenv
# Activating conda environment
conda activate myenv
# Installing FastQC and MultiQC from Bioconda repository
conda install -c bioconda fastqc multiqc samtools
# Deactivating conda environment
conda deactivate
In the end, we did not use our conda environment to run FastQC and MultiQC as these modules are already installed in the HPC. We made a SBATCH script to perform FastQC and MultiQC to our raw fastq.gz files.
# Create and edit the script with nano.
nano fastqc.sh
#!/bin/bash
echo "start of pipeline"
module load fastqc
module load py-multiqc
# Specify input and output directories.
baseDirectory="/scratch_tmp/grp/msc_appbio/group1/raw_reads"
resultsDirectory="/scratch_tmp/grp/msc_appbio/group1/outputs/fastqc_output"
# Create results directory if it does not exist.
mkdir -p "$resultsDirectory"
# Run FastQC on all .fastq.gz files in baseDirectory
fastqc -o "$resultsDirectory" -t 4 "$baseDirectory"/*.fastq.gz
echo "end of fastqc pipeline"
echo "start of multiqc"
multiqc "$resultsDirectory"
echo "end of multiqc"
# Run the script interactively in the msc_appbio partition.
srun msc_appbio --pty /bin/bash
sbatch fastqc.sh
squeue -u k24090847
# Or run the script as a sbatch job. Select partition and specify cpu number if you do not want the default resources.
sbatch -p msc_appbio --cpus-per-task = 12 fastqc.sh
# Check the status of your sbatch job.
squeue -u "knumber"
# Choose the rightdirectory to view the output files.
cd /scratch_tmp/grp/msc_appbio/group1/outputs
# List contents of the outputs directory.
ls
Move processed FASTQ files to local desktop from HPC using your local terminal.
# Open a terminal window and log into the remote server using sftp.
sftp -i~/.ssh/"key name" k********@hpc.create.kcl.ac.uk
# Change to the local directory where files will be downloaded
lcd /Users/User/Desktop
get /scratch_tmp/grp/msc_appbio/group1/outputs/fastqc_output/SRR11318268_1_fastqc.html
get /scratch_tmp/grp/msc_appbio/group1/outputs/fastqc_output/SRR11318268_2_fastqc.html
get /scratch_tmp/grp/msc_appbio/group1/outputs/fastqc_output/SRR11318269_1_fastqc.html
get /scratch_tmp/grp/msc_appbio/group1/outputs/fastqc_output/SRR11318269_2_fastqc.html
get /scratch_tmp/grp/msc_appbio/group1/outputs/fastqc_output/SRR11318270_1_fastqc.html
get /scratch_tmp/grp/msc_appbio/group1/outputs/fastqc_output/SRR11318270_2_fastqc.html
get /scratch_tmp/grp/msc_appbio/group1/outputs/fastqc_output/SRR11318271_1_fastqc.html
get /scratch_tmp/grp/msc_appbio/group1/outputs/fastqc_output/SRR11318271_2_fastqc.html
get /scratch_tmp/grp/msc_appbio/group1/outputs/fastqc_output/multiqc_report.html
Download the reference genome to the HPC with NCBI Datasets Command Line (CLI) tools. We require the FNA and GTF files.
# Select the correct directory.
cd /scratch_tmp/grp/msc_appbio/group1/ref_gen/
# Create a conda environment
conda create -n ncbi_datasets
#Activate environment
conda activate ncbi_datasets
#Install ncbi-datasets-cli in the environment
conda install -c conda-forge ncbi-datasets-cli
# Download reference genome and specify the GTF (annotation file) and genome(FNA) file types.
datasets download genome accession GCF_000146045.2 --include genome,gtf
Index reference genome using STAR RNA-seq aligner
nano star_indexing.sh
#!/bin/bash
module load star/2.7.10b-gcc-13.2.0
# Ensure line 131 is run on a single line for alignment to work
STAR --runThreadN 16 --runMode genomeGenerate --genomeDir /scratch_tmp/grp/msc_appbio/group1/ref_gen --genomeFastaFiles/scratch_tmp/grp/msc_appbio/group1/ref_gen/GCF_000146045.2_R64_genomic.fna --sjdbGTFfile /scratch_tmp/grp/msc_appbio/group1/ref_gen/genomic.gtf
Script of STAR alignment for paired-end RNA reads
#!/bin/bash
echo "start of pipeline"
module load star/2.7.10b-gcc-13.2.0
# Specify path to raw data
baseDir="/scratch_tmp/grp/msc_appbio/group1"
# Define path to STAR indexed reference genome
ref_genome_index="${baseDir}/ref_gen/starindex"
rawDataDir="${baseDir}/raw_reads/"
outputDataDir="${baseDir}/outputs/star_outs/"
# Create output directory if it doesn't exist
mkdir -p "$outputDataDir"
samples=$(ls "$rawDataDir"/*_1.fastq.gz | sed 's/_1.fastq.gz//' | xargs -n 1 basename)
# Loop where samples are aligned using STAR and the genome index, saving the outputs as sorted BAM file.
for sample in $samples; do #iterates through each item in the samples variable
echo "Processing sample: $sample" #prints a message indicating which sample is being processed
STAR --runThreadN 6 --genomeDir "$ref_genome_index" --readFilesCommand zcat --readFilesIn "$rawDataDir${sample}_1.fastq.gz" "$rawDataDir${sample}_2.fastq.gz" --outFileNamePrefix "$outputDataDir${sample}_" --outSAMtype BAM SortedByCoordinate
#ensure line 160 is run on a single line for alignment to work
done
echo "STAR alignment complete!"
Use SAMtools to inspect first 10 lines of BAM file from STAR alignment
cd /scratch_tmp/grp/msc_appbio/group1/outputs
conda activate myenv
samtools view SRR11318268_Aligned.sortedByCoord.out.bam | head -n 10
#!/bin/bash
# Load Modules
module load subread
module load py-multiqc
echo "modules loaded successfully."
# Define variables and paths
baseDir="/scratch_tmp/grp/msc_appbio/group1"
inputDir="${baseDir}/outputs/star_outs"
gtf_file="${baseDir}/ref_gen/raw_refgen/GCF_000146045.2"
output_Dir="${baseDir}/outputs/readCounts"
feature_type="db_xref"
# Create output directory if it doesn't exist
mkdir -p "$output_Dir"
# Run featureCounts tool
featureCounts -p -a ${gtf_file}/*.gtf -g $feature_type -o ${output_Dir}/readCounts2.txt ${inputDir}/*.out.bam
# Check if featureCounts ran successfully
if [[ $? -eq 0 ]]; then
echo "featureCounts completed successfully. Results are in $output_Dir"
else
echo "Error: featureCounts failed. Check your input files and parameters."
exit 1
fi
# Run multiQC
multiqc "$output_Dir"
# Check if MultiQC ran successfully
if [[ $? -eq 0 ]]; then
echo "multiqc completed successfully"
else
echo "Error: multiqc failed"
exit 1
fi
Transfer featureCounts output file out of HPC onto local desktop using sftp.
# Open a terminal window and log into the remote server using sftp.
sftp -i~/.ssh/"key name" k********@hpc.create.kcl.ac.uk
# Change to your local directory where files will be downloaded
lcd /Users/User/group_project7BBG1002
#Use get to retrieve the readCounts2.txt and readCounts2.txt.summary files
get /scratch_tmp/grp/msc_appbio/group1/outputs/readCounts/readCounts2.txt
get /scratch_tmp/grp/msc_appbio/group1/outputs/readCounts/readCounts2.txt.summary
We will process, visualise and produce plots from our readCounts2.txt file using R version 4.4.1.
Installing the packages BiocManager, dplyr, gplots, ggplot2 and ggrepel.
install.packages(c('BiocManager', 'dplyr', 'gplots', 'ggplot2', 'ggrepel', 'kableExtra'))
Installing the below packages with BiocManager.
Loading libraries. Importing dataset under “countData” variable.
# Loading libraries.
# Enhance and style tables generated with knitr::kable.
library(kableExtra)
# For differential gene expression analysis of RNA-seq data.
library(DESeq2)
# A package for data manipulation and transformation.
library(dplyr)
library(ggrepel) # For labeling purposes
# A popular package for data visualization in R.
library(ggplot2)
# Provides various plotting functions, with a focus on heatmaps and other complex visualizations.
library(gplots)
# Specifically designed for creating complex heatmaps with rich annotations.
library(ComplexHeatmap)
# Provides a collection of color palettes that are suitable for various types of data visualizations.
library(RColorBrewer)
# Used for circular visualizations, especially for displaying relationships between variables in a circular layout
library(circlize)
# Provides methods for performing Gene Ontology (GO) statistical tests and enrichment analysis.
library(GOstats)
# A database package providing GO (Gene Ontology) annotations.
library(GO.db)
# Used for statistical analysis and visualization of functional profiles (e.g., GO, KEGG) of genes, proteins, or metabolites.
library(clusterProfiler)
# A framework for storing, querying, and retrieving annotation data, such as gene symbols, IDs, and GO terms.
library(AnnotationDbi)
# An organism-specific annotation database, in this case, for *Saccharomyces cerevisiae* (yeast).
library(org.Sc.sgd.db)
# Versions of the packages used.
package.version('kableExtra')
## [1] "1.4.0"
package.version('DESeq2')
## [1] "1.44.0"
package.version('dplyr')
## [1] "1.1.4"
package.version('ggrepel')
## [1] "0.9.6"
package.version('ggplot2')
## [1] "3.5.2"
package.version('gplots')
## [1] "3.2.0"
package.version('ComplexHeatmap')
## [1] "2.20.0"
package.version('RColorBrewer')
## [1] "1.1-3"
package.version('circlize')
## [1] "0.4.16"
package.version('GOstats')
## [1] "2.70.0"
package.version('GO.db')
## [1] "3.19.1"
package.version('clusterProfiler')
## [1] "4.12.6"
package.version('AnnotationDbi')
## [1] "1.66.0"
package.version('org.Sc.sgd.db')
## [1] "3.19.1"
# Load our readCounts2.txt file and store it to countData parameter. Skip first row as it is metadata that we do not require.
countData = read.table('readCounts2.txt', sep = '\t', header=FALSE, skip = 1)
# Convert to dataframe.
countData = as.data.frame(countData)
# Set column names to the values of the 1st row.
colnames(countData) = countData[1, ]
# Remove 1st row.
countData = countData[-1, ]
# Rename our sample IDs to ReEC_2, ReEC_1, Control_2, Control_1 respectively.
colnames(countData)[colnames(countData) == "/scratch_tmp/grp/msc_appbio/group1/outputs/star_outs/SRR11318268_Aligned.sortedByCoord.out.bam"] = "ReEC_2"
colnames(countData)[colnames(countData) == "/scratch_tmp/grp/msc_appbio/group1/outputs/star_outs/SRR11318269_Aligned.sortedByCoord.out.bam"] = "ReEC_1"
colnames(countData)[colnames(countData) == "/scratch_tmp/grp/msc_appbio/group1/outputs/star_outs/SRR11318270_Aligned.sortedByCoord.out.bam"] = "Control_2"
colnames(countData)[colnames(countData) == "/scratch_tmp/grp/msc_appbio/group1/outputs/star_outs/SRR11318271_Aligned.sortedByCoord.out.bam"] = "Control_1"
# Make column Geneid our row IDs.
rownames(countData) = countData$Geneid
#Remove GenBank: from our row IDs
rownames(countData) = gsub("GenBank:", "", rownames(countData))
#Remove the .number from our row IDs
rownames(countData) = gsub("\\..*", "", rownames(countData))
# Remove columns 1 to 6.
countData = countData[,-c(1:6)]
countData[] = lapply(countData, as.numeric)
# Print head of countData.
head(countData)
## ReEC_2 ReEC_1 Control_2 Control_1
## NM_001180043 34 56 46 31
## NM_001184582 0 0 0 0
## NM_001178208 0 0 4 0
## NM_001179897 4 18 10 16
## NM_001180042 51 80 55 63
## NM_001180041 0 0 0 0
# Load Paper_Results.csv for comparison later.
Paper_results = read.csv("./Paper_results.csv", sep = ';', stringsAsFactors = FALSE)
# Convert specific columns to numeric and replace decimal point separator from ',' to '.'
Paper_results$Differential.Expression.Adjusted.p.value = as.numeric(gsub(",", ".",Paper_results$Differential.Expression.Adjusted.p.value))
Paper_results$Differential.Expression.Log2.Ratio = as.numeric(gsub(",", ".",Paper_results$Differential.Expression.Log2.Ratio))
Paper_results$Differential.Expression.p.value = as.numeric(gsub(",", ".",Paper_results$Differential.Expression.p.value))
# Display structure of the dataframe to verify format.
str(Paper_results)
## 'data.frame': 5888 obs. of 6 variables:
## $ Name : chr "HXT13 CDS" "HXT17 CDS" "DSF1 CDS" "hypothetical protein CDS" ...
## $ Differential.Expression.Adjusted.p.value: num 1.46e-60 5.56e-42 3.15e-31 3.15e-31 3.15e-31 ...
## $ Differential.Expression.Log2.Ratio : num 2.95 2.69 2.15 2.11 -1.92 ...
## $ Differential.Expression.p.value : num 2.46e-64 1.87e-45 2.65e-34 2.40e-34 1.84e-34 ...
## $ gene : chr "HXT13" "HXT17" "DSF1" "" ...
## $ locus_tag : chr "YEL069C" "YNR072W" "YEL070W" "YNR073C" ...
# Make column locus_tag as rownames and delete redundant locus_tag column afterwards.
rownames(Paper_results) = Paper_results$locus_tag
Paper_results$locus_tag = NULL
Summary of our dataset.
summary(countData)
## ReEC_2 ReEC_1 Control_2 Control_1
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 2020 1st Qu.: 2154 1st Qu.: 1840 1st Qu.: 1971
## Median : 4362 Median : 4572 Median : 4084 Median : 4247
## Mean : 9805 Mean : 10047 Mean : 9510 Mean : 9790
## 3rd Qu.: 8372 3rd Qu.: 8805 3rd Qu.: 8054 3rd Qu.: 8498
## Max. :1274145 Max. :1266216 Max. :1345039 Max. :873397
Read sums of read counts per sample.
colSums(countData)
## ReEC_2 ReEC_1 Control_2 Control_1
## 63505413 65071717 61593777 63411294
Barplot of our data (million counts per sample).
# Fix margins of our output.
par(mar=c(8,5,4,2)+0.1)
# Create barplot.
barplot(colSums(countData)/1e6, main = "Total Read Counts", xlab = "Samples", ylab = "Counts per Million")
Histogram of the first column of our dataset showing that the data are highly skewed to the right.
# Fix margins of our output.
par(mar=c(7,5,4,4)+0.2)
# Create histogram.
hist(countData$ReEC_2, xlim = c(0,120000), main = "Histogram of read counts for ReEC_2", xlab = "Number of read counts", br=1000)
Log Transformation. Taking the log2 for our sample read counts for better visualization.
logCountData = log2(1 + countData)
Repeating our previous histogram of the first column of our dataset, but now with the log transformed data.
# Fix margins of our output.
par(mar=c(7,5,4,4)+0.2)
# Create histogram with the log2 read counts
hist(logCountData$ReEC_2, xlim = c(0,20), main = "Histogram of read counts for ReEC_2", xlab = "Number of read counts", br=100)
Plotting Scatter Plot to observe the correlation between ReEC_2 and ReEC_1.
# Fix margins of our output.
par(mar=c(7,5,4,2)+0.1)
# Create scatter plot.
plot(logCountData[,1], logCountData[,2], xlab = "ReEC_2", ylab = "ReEC_1")
Plotting Scatter Plot to observe the correlation between Control_2 and Control_1.
# Fix margins of our output.
par(mar=c(7,5,4,2)+0.1)
# Create scatter plot.
plot(logCountData[,3], logCountData[,4], xlab = "Control_2", ylab = "Control_1")
Plotting Scatter Plot to observe the correlation between ReEC_2 and Control_2.
# Fix margins of our output.
par(mar=c(7,5,4,2)+0.1)
# Create scatter plot.
plot(logCountData[,1], logCountData[,3], xlab = "ReEC_2", ylab = "Control_2")
We have to make the experiment design into a small dataframe to associate each column (sample) in the countData object with a specific condition (mutated or control). Here we make a small table that has the sample/column names from our main dataset (countData) as row; and then a column for which columns of our main dataset are “treatment” or “control” respectively. This step is mandatory for DESeq2 package to properly analyse our data.
# Define our groups and indicate that object "group" is a factor.
groups = factor(c("mutated", "mutated", "control", "control"))
# Create dataframe
sample_info = data.frame(row.names = colnames(countData), condition = groups)
sample_info
## condition
## ReEC_2 mutated
## ReEC_1 mutated
## Control_2 control
## Control_1 control
Create a DESeqDataSet object (dds) using the DESeqDataSetFromMatrix function from the DESeq2 package. This dataset is then ready for downstream differential expression analysis.
# Create DESeq2 dataset
dds = DESeqDataSetFromMatrix(countData = countData, colData = sample_info, design = ~ condition)
Run the DESeq2 pipeline for differential expression analysis. Calculate significantly expressed genes using adjusted p-value cutoff of 0.01.
# Main function
dds = DESeq(dds)
# Confirm the number of rows is the equal with our countData dataset.
nrow(dds)
## [1] 6477
# Results. The contrast argument specifies which groups to compare (in this case, mutated vs control). Default adjusted p-value is 0.1 but we set it to 0.01 based on the study we are trying to reproduce.
res = results(dds, contrast = c("condition", "mutated", "control"), alpha = 0.01)
# Sorting the res table by the adjusted p-values (padj) in ascending order.
res = res[order(res$padj),]
# Make a dataframe of res, so we can use it later to make a volcano plot.
res.df = as.data.frame(res)
# Remove N/A values.
res.df = na.omit(res.df)
# Print head of res.
head(res)
## log2 fold change (MLE): condition mutated vs control
## Wald test p-value: condition mutated vs control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## NM_001178884 917.059 3.38035 0.212988 15.8711 1.00469e-56
## NM_001183250 3428.184 2.44669 0.185474 13.1916 9.81262e-40
## NM_001183249 397.737 3.09408 0.243089 12.7282 4.12390e-37
## NM_001178885 4439.225 2.45244 0.194268 12.6240 1.55681e-36
## NM_001179887 46228.818 -2.12102 0.177082 -11.9776 4.65538e-33
## NM_001179459 3885.273 2.12698 0.186749 11.3895 4.71450e-30
## padj
## <numeric>
## NM_001178884 6.24315e-53
## NM_001183250 3.04878e-36
## NM_001183249 8.54197e-34
## NM_001178885 2.41850e-33
## NM_001179887 5.78571e-30
## NM_001179459 4.88265e-27
# Print summary of res.
summary(res)
##
## out of 6337 with nonzero total read count
## adjusted p-value < 0.01
## LFC > 0 (up) : 112, 1.8%
## LFC < 0 (down) : 94, 1.5%
## outliers [1] : 0, 0%
## low counts [2] : 123, 1.9%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
We will now compare our results res.df to the study we are trying to reproduce. We will first create a subset of res.df with all the genes that present adjusted p-values(padj) < 0.01 and store it to sigGenes object. This subset will contain all the significant differentially expressed genes based on padj threshold of 0.01.
# Create subset of res.df.
sigGenes = res.df[res.df$padj < 0.01,]
# Remove NA values.
sigGenes = na.omit(sigGenes)
# Print dimentions of SigsPaper
dim(sigGenes)
## [1] 206 6
Filtering Paper_results for significant differentially expressed genes based on a adjusted p-value cutoff of 0.01, and storing it to SigsPaper object.
# Extract subset of significant differentially expressed genes from Paper_results.
SigsPaper = Paper_results[Paper_results$Differential.Expression.Adjusted.p.value < 0.01,]
SigsPaper = na.omit(SigsPaper)
# Print dimentions of SigsPaper
dim(SigsPaper)
## [1] 194 5
Now we will try to map our results IDs of differentially expressed genes (sigGenes) with paper’s ones (SigsPaper). To achieve this we will use org.Sc.sgd.db as a database to retrieve common IDs in both papers for comparison, since our dataset’s IDs differ from the paper’s ones. Therefore, we will make an extra column in sigGenes of COMMON IDs. We retrieve these IDs from org.Sc.sgd.db based on mapping to our row names (RefSeq ID type).
# Make COMMON column and map RefSeq IDs to ENTREZ from org.Sc.sgd.db
sigGenes$COMMON = mapIds(org.Sc.sgd.db, keys = rownames(sigGenes), column = "COMMON", keytype = "REFSEQ", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
# Remove NA values from sigGenes.
sigGenes = na.omit(sigGenes)
# Search for presence of common IDs in the "COMMON" column of sigGenes to the "gene" column of SigsPaper.
CommonIDs = intersect(sigGenes$COMMON, SigsPaper$gene)
# Make CommonIDs a dataframe.
CommonIDs = data.frame(CommonIDs)
# Remove NA values from CommonIDs.
CommonIDs = na.omit(CommonIDs)
# Print CommonIDs dimensions.
dim(CommonIDs)
## [1] 106 1
# Print CommonIDs as a scroll table.
kable(CommonIDs) %>%
kable_styling() %>%
scroll_box(width = "500px", height = "500px")
CommonIDs |
---|
DSF1 |
COX5B |
DAK2 |
RTS3 |
CWP1 |
CTT1 |
MAL12 |
IZH4 |
IZH2 |
SRD1 |
STB6 |
DSE2 |
CTS1 |
PUS2 |
ACO1 |
MLS1 |
MAL32 |
MEP3 |
PUT4 |
ESF1 |
TDA4 |
ICT1 |
CLN2 |
HXT10 |
SUC2 |
GAC1 |
ARG1 |
ISF1 |
YET2 |
MAL31 |
FUM1 |
FIT2 |
PAU7 |
IRC10 |
ICL1 |
COS12 |
HES1 |
AMN1 |
SRL1 |
CLN1 |
SPS100 |
FAS1 |
DIF1 |
ARO9 |
AXL2 |
SVS1 |
TOS6 |
GLK1 |
SPI1 |
ECM13 |
HSP30 |
EMI2 |
MTL1 |
ARG7 |
TOS1 |
PDH1 |
PRY1 |
ODC1 |
ENO2 |
SCW4 |
MSB2 |
DSE1 |
BAG7 |
PRM7 |
ARG3 |
MGA2 |
IDH1 |
KGD2 |
COR1 |
YRO2 |
IDH2 |
RKI1 |
SCW10 |
DBF20 |
MSA2 |
CIT2 |
DIT1 |
YAT2 |
ECL1 |
KCC4 |
CYB2 |
TAD2 |
SET4 |
ATP1 |
CYC1 |
HRP1 |
CHA1 |
SPS4 |
EGT2 |
SRL4 |
CIT1 |
IML2 |
MRK1 |
STL1 |
COX8 |
DSE3 |
IZH1 |
YPC1 |
TOS4 |
BUD7 |
ECM10 |
SUT1 |
ALG3 |
TCB3 |
CAX4 |
BSC1 |
By default, lfcThreshold is set to 0. The paper doesn’t specify
a log fold change (LFC) cutoff and we assume that DESeq2 default values
were used. However, typical cutoffs of 1.3 to 4 fold change (FC) are
applied along with the FDR threshold in most studies for a more accurate
prediction of the differentially expressed genes. If we apply a log2
fold change threshold of 0.379 (~1.3 FC) we can see that we get even
less differentially expressed genes (significant) than the numbers
mentioned in the paper.
# Results. The contrast argument specifies which groups to compare (in this case, mutated vs control). Default adjusted p-value is 0.1 but we set it to 0.01 based on the study we are trying to reproduce. Log2 fold change threshold (lfcThreshold) is set to 0.379.
res.strict = results(dds, contrast = c("condition", "mutated", "control"), lfcThreshold = 0.379, alpha = 0.01)
# Sorting the res table by the adjusted p-values (padj) in ascending order.
res.strict = res.strict[order(res.strict$padj),]
# Make a dataframe of res.strict, so we can use it later to make a volcano plot.
res.strict.df = as.data.frame(res.strict)
# Remove N/A values
res.strict.df = na.omit(res.strict.df)
# Print a summary of res.strict
summary(res.strict)
##
## out of 6337 with nonzero total read count
## adjusted p-value < 0.01
## LFC > 0.38 (up) : 46, 0.73%
## LFC < -0.38 (down) : 21, 0.33%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Filtering our differentially expressed genes based on log2FoldChange cutoff of > 1.6 and adjusted p-value of < 0.01 to reduce the number of genes and extract the top differentially expressed genes for better visualization in a heatmap.
# Select genes from res.df with absolut log2FoldChange values > 1.6 and adjusted p-values < 0.01.
df.top = res.df[(abs(res.df$log2FoldChange) > 1.6) & (res.df$padj < 0.01),]
# Print df.top as dataframe with scroll bars.
kable(df.top) %>%
kable_styling() %>%
scroll_box(width = "1000px", height = "800px")
baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | |
---|---|---|---|---|---|---|
NM_001178884 | 917.05889 | 3.380351 | 0.2129879 | 15.871097 | 0.00e+00 | 0.0000000 |
NM_001183250 | 3428.18447 | 2.446690 | 0.1854738 | 13.191565 | 0.00e+00 | 0.0000000 |
NM_001183249 | 397.73706 | 3.094082 | 0.2430893 | 12.728170 | 0.00e+00 | 0.0000000 |
NM_001178885 | 4439.22483 | 2.452436 | 0.1942675 | 12.624013 | 0.00e+00 | 0.0000000 |
NM_001179887 | 46228.81813 | -2.121023 | 0.1770822 | -11.977612 | 0.00e+00 | 0.0000000 |
NM_001179459 | 3885.27306 | 2.126981 | 0.1867486 | 11.389544 | 0.00e+00 | 0.0000000 |
NM_001179914 | 2699.94731 | 3.176047 | 0.3106209 | 10.224835 | 0.00e+00 | 0.0000000 |
NM_001184636 | 2947.87015 | 2.334704 | 0.2285380 | 10.215827 | 0.00e+00 | 0.0000000 |
NM_001181290 | 33333.83030 | 2.394363 | 0.2363183 | 10.131942 | 0.00e+00 | 0.0000000 |
SGD:S000007261 | 9212.24249 | -2.305998 | 0.2309464 | -9.984989 | 0.00e+00 | 0.0000000 |
GeneID:851854 | 1093.98745 | 2.006927 | 0.2126872 | 9.436049 | 0.00e+00 | 0.0000000 |
NM_001179662 | 4276.57274 | -2.241660 | 0.2428992 | -9.228767 | 0.00e+00 | 0.0000000 |
NM_001181217 | 29336.04432 | -2.201083 | 0.2491844 | -8.833149 | 0.00e+00 | 0.0000000 |
GeneID:856259 | 899.78828 | 1.737558 | 0.1970967 | 8.815763 | 0.00e+00 | 0.0000000 |
NM_001179913 | 10266.03905 | 1.741138 | 0.2051681 | 8.486399 | 0.00e+00 | 0.0000000 |
NM_001181421 | 20899.72028 | 2.858331 | 0.3415470 | 8.368779 | 0.00e+00 | 0.0000000 |
NM_001184567 | 1662.67650 | -1.648336 | 0.1993342 | -8.269211 | 0.00e+00 | 0.0000000 |
NM_001183390 | 1947.59422 | 1.859596 | 0.2320836 | 8.012616 | 0.00e+00 | 0.0000000 |
NM_001183355 | 7751.96132 | 1.849258 | 0.2318568 | 7.975865 | 0.00e+00 | 0.0000000 |
NM_001181418 | 74442.41415 | 2.278568 | 0.2882363 | 7.905211 | 0.00e+00 | 0.0000000 |
SGD:S000007262 | 7327.80280 | -1.702045 | 0.2179335 | -7.809928 | 0.00e+00 | 0.0000000 |
GeneID:851712 | 39909.06205 | -2.177790 | 0.2812944 | -7.742030 | 0.00e+00 | 0.0000000 |
NM_001178647 | 7998.68498 | 2.244825 | 0.3341334 | 6.718351 | 0.00e+00 | 0.0000000 |
NM_001183191 | 1449.40624 | 1.752945 | 0.2687778 | 6.521913 | 0.00e+00 | 0.0000000 |
NM_001181468 | 124956.73388 | 1.623340 | 0.2613041 | 6.212455 | 0.00e+00 | 0.0000001 |
NM_001178666 | 764.53425 | 1.617183 | 0.2769976 | 5.838258 | 0.00e+00 | 0.0000006 |
NM_001183127 | 7087.03138 | -1.815860 | 0.3197033 | -5.679827 | 0.00e+00 | 0.0000014 |
GeneID:850925 | 322.58216 | 1.676340 | 0.3056030 | 5.485354 | 0.00e+00 | 0.0000039 |
NM_001179269 | 403.14184 | -1.671766 | 0.3379697 | -4.946495 | 8.00e-07 | 0.0000559 |
GeneID:850341 | 62.83073 | 2.348477 | 0.5163569 | 4.548166 | 5.40e-06 | 0.0003329 |
GeneID:851849 | 69.35399 | 1.640244 | 0.4030889 | 4.069188 | 4.72e-05 | 0.0020940 |
Order df.top by the log2foldchange (Higher to lower values) for
better visualization in the heatmap.
# Reorder the rows of df.top based on the log2FoldChange column in descending order.
df.top = df.top[order(df.top$log2FoldChange, decreasing = TRUE),]
Heatmap with ComplexHeatmap, RColorBrewer and circlize libraries
# Apply a regularized log transformation (rlog) to the count data in the dds object.The blind = FALSE argument indicates that the transformation will not ignore the experimental design.
rld = rlog(dds, blind=FALSE)
# Making a matrix of the log-transformed expression values for the top differentially expressed genes across all samples.
mat = assay(rld)[rownames(df.top), rownames(sample_info)]
# Ensure the column names of mat correspond to the sample names from the sample_info dataframe.
colnames(mat) = rownames(sample_info)
# Calculate the mean expression for each gene (across all samples). This give us an idea of the overall expression level of each gene across the samples in the heatmap.
base_mean = rowMeans(mat)
# Center and scale each column (Z-score), then transpose.
mat.scaled = t(apply(mat, 1, scale))
colnames(mat.scaled) = colnames(mat)
head(mat.scaled)
## ReEC_2 ReEC_1 Control_2 Control_1
## NM_001178884 0.9135980 0.8165386 -0.8341066 -0.8960300
## NM_001179914 0.5256830 1.1453589 -0.7471479 -0.9238941
## NM_001183249 0.9028519 0.8283980 -0.8645939 -0.8666560
## NM_001181421 0.6262600 1.0394192 -1.0975682 -0.5681111
## NM_001178885 0.8838570 0.8426861 -0.7678736 -0.9586695
## NM_001183250 0.8776038 0.8515029 -0.7943780 -0.9347288
Making a matrix of the Log2FoldChange values of the filtered genes in df.top, to use it as an extra column in our heatmap.
# Get log2FoldChange value for each gene in df.top.
l2_val = as.matrix(df.top$log2FoldChange)
# Assign column name to the matrix.
colnames(l2_val) = "log2FC"
l2_val
## log2FC
## [1,] 3.380351
## [2,] 3.176047
## [3,] 3.094083
## [4,] 2.858331
## [5,] 2.452436
## [6,] 2.446690
## [7,] 2.394363
## [8,] 2.348477
## [9,] 2.334704
## [10,] 2.278569
## [11,] 2.244825
## [12,] 2.126981
## [13,] 2.006927
## [14,] 1.859596
## [15,] 1.849258
## [16,] 1.752945
## [17,] 1.741138
## [18,] 1.737558
## [19,] 1.676340
## [20,] 1.640244
## [21,] 1.623340
## [22,] 1.617183
## [23,] -1.648336
## [24,] -1.671766
## [25,] -1.702045
## [26,] -1.815860
## [27,] -2.121022
## [28,] -2.177790
## [29,] -2.201083
## [30,] -2.241660
## [31,] -2.305998
Making a matrix of the baseMean values (Average gene expression across all samples) of the filtered genes in df.top to include as an extra column in our heatmap.
# Make matrix of the mean values for each gene in df.top.
mean = as.matrix(df.top$baseMean)
# Assign column name to the matrix.
colnames(mean) = "AvrgExpr"
mean
## AvrgExpr
## [1,] 917.05889
## [2,] 2699.94731
## [3,] 397.73706
## [4,] 20899.72028
## [5,] 4439.22483
## [6,] 3428.18447
## [7,] 33333.83030
## [8,] 62.83073
## [9,] 2947.87015
## [10,] 74442.41415
## [11,] 7998.68498
## [12,] 3885.27306
## [13,] 1093.98745
## [14,] 1947.59422
## [15,] 7751.96132
## [16,] 1449.40624
## [17,] 10266.03905
## [18,] 899.78828
## [19,] 322.58216
## [20,] 69.35399
## [21,] 124956.73388
## [22,] 764.53425
## [23,] 1662.67650
## [24,] 403.14184
## [25,] 7327.80280
## [26,] 7087.03138
## [27,] 46228.81813
## [28,] 39909.06205
## [29,] 29336.04432
## [30,] 4276.57274
## [31,] 9212.24249
Creating a color map for our Log2FoldChange and Average gene expression values.
# Define color scale for Z-scores, mapping low values to blue, zero to white, and high values to red.
col_z = colorRamp2(c(min(mat.scaled), 0, max(mat.scaled)), c("blue", "white", "red"))
# Maps values between b/w/r for min and max log2FoldChange values.
col_log2FC = colorRamp2(c(min(l2_val),0, max(l2_val)), c("blue", "white", "red"))
# Maps between 0% quantile, and 75% quantile of mean values --- 0, 25, 50, 75, 100.
col_AvrgExpr = colorRamp2(c(quantile(mean)[1], quantile(mean)[4]), c("white", "red"))
Creating a heatmap for the top 31 genes, consisting of 3 smaller heatmaps/columns that present the Z-score expression, the log2FoldChange in descending order, and the baseMean (Average gene expression) matrices respectively.
# Create an annotation that will be used later to summarise the l2FC column of the heatmap. ha goes as input to h2 (heatmap 2).
ha = HeatmapAnnotation(summary = anno_summary(gp = gpar(fill = 2),
height = unit(2, "cm")))
# Create heatmap 1 (Z-scores).
h1 = Heatmap(mat.scaled, cluster_rows = F,
column_labels = colnames(mat.scaled), column_names_gp = gpar(fontsize = 9), name="Z-score",
cluster_columns = T, col = col_z,
show_heatmap_legend = FALSE)
# Create heatmap 2 (log2 Fold Change).
h2 = Heatmap(l2_val, row_labels = rownames(df.top),
cluster_rows = F, name="log2FC", top_annotation = ha, col = col_log2FC, show_heatmap_legend = FALSE, # Define color scheme for log2FC
row_names_gp = gpar(fontsize = 8), column_names_gp = gpar(fontsize = 9), cell_fun = function(j, i, x, y, w, h, col) { # Add text to each grid.
grid.text(round(l2_val[i, j],2), x, y, gp = gpar(fontsize = 7))
}) # Round text to 2 decimals.
# Create heatmap 3 (Average Expression).
h3 = Heatmap(mean, row_labels = rownames(df.top),
cluster_rows = F, name = "AvrgExpr", col = col_AvrgExpr, show_heatmap_legend = FALSE,
row_names_gp = gpar(fontsize = 7), column_names_gp = gpar(fontsize = 9), cell_fun = function(j, i, x, y, w, h, col) { # Add text to each grid.
grid.text(round(mean[i, j],2), x, y, gp = gpar(fontsize = 7))
}) # Round text to 2 decimals.
# Combine the heatmaps into one (h).
h = h1+h2+h3
# Extracting heatmap (h) into a PNG file. "png()" initializes the graphics device for saving the plot as a PNG file.
# png("./heatmap.png", res = 300, width = 2700, height = 5000)
# Create legends with spacing between title and color bar
lgd_list = list(
Legend(title = "Z-score", col_fun = col_z, title_gap = unit(4, "mm")),
Legend(title = "log2FC", col_fun = col_log2FC, title_gap = unit(4, "mm")),
Legend(title = "AvrgExpr", col_fun = col_AvrgExpr, title_gap = unit(4, "mm"))
)
# Pack legends vertically with spacing between them
combined_legend = packLegend(
lgd_list[[1]],
lgd_list[[2]],
lgd_list[[3]],
direction = "vertical",
gap = unit(10, "mm") # Increase gap between the legends
)
# Draw heatmap with packed legends on the right and apply padding
draw(h,
heatmap_legend_list = combined_legend,
heatmap_legend_side = "right",
padding = unit(c(10, 10, 10, 10), "mm"))
# "dev.off()" closes the graphics device, freeing up memory.
#dev.off()
We will initially create a basic volcano plot step by step from res.df that takes the default lfcThreshold (0) and only defines the significant differentially expressed genes based on the adjusted p-value (padj < 0.01).
# Set our x-axis and y-axis values to log2FoldChage and -log10(padj) respectively. We convert padj values to the scale of -log10 for better visualisation. "geom_point()" adds scatter plot points to the plot.
vol_plot = res.df %>%
ggplot(aes(x = log2FoldChange, y = -log10(padj))) + geom_point()
# Visualise ggplot output
vol_plot
We have set padj cutoff at 0.01 and since our y-axis is in -log10 scale we also need to convert our threshold to -log10. Our log2FoldChange (L2FC) cutoff is set at 0. This defines which genes are upregulated (L2FC > 0) and which are downregulated (L2FC < 0). A L2FC of 0 equals to fold change of 1. We will addWe have also set our x-axis range from -6 to 6.
# Add threshold lines for both statistical significance (y-axis) and fold change (x-axis), and adjust the x-axis scale.
vol_plot + geom_hline(yintercept = -log10(0.01), linetype = "dashed") +
geom_vline(xintercept = log2(1), linetype = "dashed") + scale_x_continuous(breaks = c(seq(-6, 6, 1)), # Modify x-axis tick intervals.
limits = c(-6, 6)) + # Modify x-axis range from -6 to 6.
ggtitle("ReEC(mutated) vs Control") + # Add title.
theme(plot.title = element_text(size = 11, face = "bold", hjust = 0.5)) + # Format title.
labs(subtitle = "(fold change = 0, adjusted p-value = 0.01)") + # Add subtitle.
theme(plot.subtitle = element_text(size = 8, face = "italic", hjust = 0.5)) # Format subtitle.
To visualise different groups of genes using different colours, point sizes, shapes or transparencies, we need to categorise genes into different groups and store these categories as a new parameter i.e. new column of data.
Genes with log2FoldChage > 0 & adj_p_val <= 0.01 as Upregulated.
Genes with log2FoldChage < 0 & adj_p_val <= 0.01 as Downregulated.
All other genes are labelled as Non-Significant.
# Create a new dataframe(res.df2) based on the original res.df and add a new column(gene_type). The new gene_type column categorizes each row (gene) into one of these three groups: Upregulated, Downregulated, Non-Significant; based on the log2FoldChange(0) and the padj(0.01) cutoffs.
res.df2 = res.df %>%
mutate(
gene_type = case_when(
log2FoldChange > 0 & padj <= 0.01 ~ "Upregulated",
log2FoldChange < 0 & padj <= 0.01 ~ "Downregulated",
TRUE ~ "Non-Significant"))
# Extract the gene_type column from the new dataframe(res.df2) and store it in a variable.
gene_type.object = res.df2$gene_type
# Summarize the counts of each category in the gene_type column.
res.df2 %>%
count(gene_type)
## gene_type n
## 1 Downregulated 94
## 2 Non-Significant 6008
## 3 Upregulated 112
# Check gene_type category names.
res.df2 %>%
distinct(gene_type) %>%
pull()
## [1] "Upregulated" "Downregulated" "Non-Significant"
In ggplot2, you also have the option to visualize different groups by point color, size, shape and transparency by modifying parameters via scale_color_manual() etc. A tidy way of doing this is to store each visual specifications in a separate vector.
# Create vectors of custom colors and alphas (transparency) to volcano plot.
cols = c("Upregulated" = "#ffad73", "Downregulated" = "#26b3ff", "Non-Significant" = "grey")
alphas = c("Upregulated" = 1, "Downregulated" = 1, "Non-Significant" = 0.5)
# Create a column in res.df2 that takes the rownames of our 20 most significant differentially expressed genes that will be used later for labeling purposes. This is possible as res.df2 is sorted in ascending order of the padj values.
res.df2 = res.df2 %>% mutate(top_20_genes = ifelse(row_number() <= 20, rownames(res.df2), NA))
# Create Volcano plot in the same way we did in the previous steps.
VolcanoPlot = ggplot( res.df2, aes(x = log2FoldChange,
y = -log10(padj),
fill = gene_type,
size = gene_type,
alpha = gene_type)) +
geom_point(shape = 21,
colour = "black", size = 2) + # Plot the points and modify color and size.
geom_hline(yintercept = -log10(0.01),
linetype = "dashed") + # Add dashed line in the y-axis at -log(0.01).
geom_vline(xintercept = log2(1),
linetype = "dashed") + # Add dashed line in the x-axis at log2(1) fold change.
scale_fill_manual(values = cols) + # Modify point color
scale_alpha_manual(values = alphas) + # Modify point transparency
scale_x_continuous(breaks = c(seq(-6, 6, 1)), # Modify x-axis tick intervals.
limits = c(-6, 6)) + # Modify x-axis range from -6 to 6.
ggtitle("ReEC(mutated) vs Control") + # Add title.
theme(plot.title = element_text(size = 11, face = "bold", hjust = 0.5)) + # Format title.
labs(subtitle = "(fold change = 0, adjusted p-value = 0.01)") + # Add subtitle.
theme(plot.subtitle = element_text(size = 8, face = "italic", hjust = 0.5)) + # Formt subtitle.
geom_text_repel(aes(label = top_20_genes), # Add labels to the top 20 most significant differentially expressed genes.
size = 2,
max.overlaps = Inf)
# Extracting Vol_plot2 into a PNG file and saving it to our current directory.
#png("./VolcanoPlot.png", res = 300, width = 4100, height = 2100)
# Renders the VolcanoPlot and writes it to the PNG file.
print(VolcanoPlot)
# "dev.off()" closes the graphics device, freeing up memory.
#dev.off() # Turning off graphics device to free up memory.
Now we will create a volcano plot from res.strict.df that takes a lfcThreshold of 0.379(1.3 fold change), and padj of 0.01. Therefore, a gene is defined as differentially expressed (significant) only if |lfcThreshold| > 0.379 and padj <= 0.01.
# Create a new dataframe(res.strict.df2) based on the original res.strict.df and add a new column(gene_category). The new gene_category column categorizes each row (gene) into one of these three groups: Upregulated, Downregulated, Non-Significant; based on the |log2FoldChange| > 0.379 and the padj(0.01) cutoffs.
res.strict.df2 = res.strict.df %>%
mutate(
gene_category = case_when(
log2FoldChange >= 0.379 & padj <= 0.01 ~ "Upregulated",
log2FoldChange <= -0.379 & padj <= 0.01 ~ "Downregulated",
TRUE ~ "Non-Significant"))
# Extract the gene_category column from the new dataframe(res.strict.df2) and store it in a variable.
gene_category.object = res.df2$gene_category
# Summarize the counts of each category in the gene_category column.
res.strict.df2 %>%
count(gene_category)
## gene_category n
## 1 Downregulated 21
## 2 Non-Significant 6270
## 3 Upregulated 46
# Check gene_category category names.
res.strict.df2 %>%
distinct(gene_category) %>%
pull()
## [1] "Upregulated" "Downregulated" "Non-Significant"
# Create vectors of custom colors and alphas (transparency) to volcano plot.
cols2 = c("Upregulated" = "#ffad73", "Downregulated" = "#26b3ff", "Non-Significant" = "grey")
alphas2 = c("Upregulated" = 1, "Downregulated" = 1, "Non-Significant" = 0.5)
# Create a column in res.strict.df2 that takes the rownames of our 20 most significant differentially expressed genes that will be used later for labeling purposes. This is possible as res.strict.df2 is sorted in ascending order of the padj values.
res.strict.df2 = res.strict.df2 %>% mutate(top_20genes = ifelse(row_number() <= 20, rownames(res.df2), NA))
# Create Volcano plot in the same way we did in the previous steps.
VolcanoPlot_strict = ggplot( res.strict.df2, aes(x = log2FoldChange,
y = -log10(padj),
fill = gene_category,
size = gene_category,
alpha = gene_category)) +
geom_point(shape = 21,
colour = "black", size = 2) + # Plot the points and modify color and size.
geom_hline(yintercept = -log10(0.01),
linetype = "dashed") + # Add dashed line in the y-axis at -log(0.01).
geom_vline(xintercept = c(log2(1.3), -log2(1.3)),
linetype = "dashed") + # Add dashed line in the x-axis at log2(1.3) and -log2(1.3) fold change respectively.
scale_fill_manual(values = cols2) + # Modify point color
scale_alpha_manual(values = alphas2) + # Modify point transparency
scale_x_continuous(breaks = c(seq(-6, 6, 1)), # Modify x-axis tick intervals.
limits = c(-6, 6)) + # Modify x-axis range from -6 to 6.
ggtitle("ReEC(mutated) vs Control") + # Add title.
theme(plot.title = element_text(size = 11, face = "bold", hjust = 0.5)) + # Format title.
labs(subtitle = "(fold change = 1.3, adjusted p-value = 0.01)") + # Add subtitle.
theme(plot.subtitle = element_text(size = 8, face = "italic", hjust = 0.5)) + # Formt subtitle.
geom_text_repel(aes(label = top_20genes), # Add labels to the top 20 most significant differentially expressed genes.
size = 2,
max.overlaps = Inf)
# Extracting VolcanoPlot_strict into a PNG file and saving it to our current directory.
#png("./VolcanoPlot_strict.png", res = 300, width = 4100, height = 2100)
# Renders the VolcanoPlot_strict and writes it to the PNG file.
print(VolcanoPlot_strict)
# "dev.off()" closes the graphics device, freeing up memory.
#dev.off()
Making a function to detect our groups selection (mutated, control).
detectGroups = function(x) { return(groups) }
# Perform PCA to the regularized log transformed data.
pca.object = prcomp(t(assay(rld)))
# Data preparation for plotting.
# Select the first two columns of pca.object which contains the principal component scores.
pcaData = as.data.frame(pca.object$x[,1:2]);
# Add groups as a "Type" column in pcaData
pcaData = cbind(pcaData, detectGroups(colnames(assay(rld)) ))
# Add column names
colnames(pcaData) = c("PC1", "PC2", "Type")
# Calculate the percentage of variance explained by the first two principal components (PC1 and PC2). Use the summary(pca.object)$importance table to get the proportion of variance explained for each principal component and convert it to a percentage.
percentVar=round(100*summary(pca.object)$importance[2,1:2],0)
# Plotting the PCA Results
# Initialize the ggplot object with the PCA data. Color and shape the points by the Type variable (which indicates the sample group).
p=ggplot(pcaData, aes(PC1, PC2, color=Type, shape = Type)) + geom_point(size=3)
# Label the x-axis and y-axis respectively with the percentages of variance explained by PC1 and PC2.
p=p+xlab(paste0("PC1: ",percentVar[1],"% variance"))
p=p+ylab(paste0("PC2: ",percentVar[2],"% variance"))
# Add title and ensure that the aspect ratio of the plot is equal, so the axes have the same scale.
p=p+ggtitle("Principal component analysis (PCA)") + coord_fixed(ratio=1.0) +
theme(plot.title = element_text(size = 12,hjust = 0.5)) + theme(aspect.ratio = 1) +
theme(axis.text.x = element_text( size = 10),
axis.text.y = element_text( size = 10),
axis.title.x = element_text( size = 11),
axis.title.y = element_text( size = 11) ) +
theme(legend.text=element_text(size=11))
# Display plot
print(p)
Making four extra columns in sigGenes of ENTREZ, GENENAME, ENSEMBL and GO IDs. We retrieve these IDs from org.Sc.sgd.db based on mapping to our row names (RefSeq ID type).
# Make ENTREZ column and map RefSeq IDs to ENTREZ from org.Sc.sgd.db.
sigGenes$ENTREZ = mapIds(org.Sc.sgd.db, keys = rownames(sigGenes), column = "ENTREZID", keytype = "REFSEQ", multiVals = "first")
## 'select()' returned 1:1 mapping between keys and columns
# Make GENENAME column and map RefSeq IDs to GENENAME from org.Sc.sgd.db.
sigGenes$GENENAME = mapIds(org.Sc.sgd.db, keys = rownames(sigGenes), column = "GENENAME", keytype = "REFSEQ", multiVals = "first")
## 'select()' returned 1:1 mapping between keys and columns
# Make ENSEMBL column and map RefSeq IDs to ENSEMBL from org.Sc.sgd.db.
sigGenes$ENSEMBL = mapIds(org.Sc.sgd.db, keys = rownames(sigGenes), column = "ENSEMBL", keytype = "REFSEQ", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
# Make GO column and map RefSeq IDs to GO from org.Sc.sgd.db.
sigGenes$GO = mapIds(org.Sc.sgd.db, keys = rownames(sigGenes), column = "GO", keytype = "REFSEQ", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
Identifying upregulated and downregulated genes based on the log2FoldChange thresholds of (-0.379, 0.379), and extracting their corresponding ENTREZ IDs. Then, we store these genes to their respective objects.
# Filter upregulated genes from sigGenes if log2FoldChange >= 0.379 and select the ENTREZ column from sigGenes. Store this to Upregulated_genes object.
Upregulated_genes = sigGenes[sigGenes$log2FoldChange >= 0.379, "ENTREZ"]
# Remove NA values.
Upregulated_genes = na.omit(Upregulated_genes)
# Filter downregulated genes from sigGenes if log2FoldChange <= -0.379 and select the ENTREZ column from sigGenes. Store this to Downregulated_genes object.
Downregulated_genes = sigGenes[sigGenes$log2FoldChange <= (-0.379), "ENTREZ"]
# Remove NA values.
Downregulated_genes = na.omit(Downregulated_genes)
Performing Gene Ontology (GO) enrichment analysis for Biological Process (BP) terms using the enrichGO function from the clusterProfiler package. The enrichment analysis is conducted for Upregulated and Downregulated genes based on their ENTREZID identifiers. We have used the Saccharomyces cerevisiae (baker’s yeast) database from the org.Sc.sgd.db to map our genes ENTREZ IDs to GO terms specific to yeast. Additionally, we have used a pvalueCutoff of 0.05 and also applied the Holm-Bonferroni correction to adjust p-values for multiple testing as stated in the paper.
# Perform enrichGO for Upregulated genes.
Upregulated_genesBP = enrichGO(gene = Upregulated_genes, OrgDb = "org.Sc.sgd.db", keyType = "ENTREZID", ont = "BP", pvalueCutoff = 0.05, pAdjustMethod = "holm")
# Perform enrichGO for Downregulated genes.
Downregulated_genesBP = enrichGO(gene = Downregulated_genes, OrgDb = "org.Sc.sgd.db", keyType = "ENTREZID", ont = "BP", pvalueCutoff = 0.05, pAdjustMethod = "holm")
# View Upregulated_genesBP as dataframe with scroll bars.
kable(Upregulated_genesBP) %>%
kable_styling() %>%
scroll_box(width = "1000px", height = "800px")
ID | Description | GeneRatio | BgRatio | RichFactor | FoldEnrichment | zScore | pvalue | p.adjust | qvalue | geneID | Count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
GO:0034219 | GO:0034219 | carbohydrate transmembrane transport | 9/86 | 39/6436 | 0.2307692 | 17.270125 | 11.859644 | 0.00e+00 | 0.0000008 | 0.0000006 | 856640/855809/850490/851352/853207/850536/852601/850317/851944 | 9 |
GO:0005975 | GO:0005975 | carbohydrate metabolic process | 18/86 | 256/6436 | 0.0703125 | 5.261991 | 8.097947 | 0.00e+00 | 0.0000026 | 0.0000011 | 855810/856639/850491/853209/853984/853207/852602/856440/853418/854644/854350/852601/850317/854525/853395/852128/856579/855480 | 18 |
GO:0008643 | GO:0008643 | carbohydrate transport | 9/86 | 48/6436 | 0.1875000 | 14.031977 | 10.545935 | 0.00e+00 | 0.0000054 | 0.0000015 | 856640/855809/850490/851352/853207/850536/852601/850317/851944 | 9 |
GO:0016052 | GO:0016052 | carbohydrate catabolic process | 11/86 | 91/6436 | 0.1208791 | 9.046256 | 8.995708 | 0.00e+00 | 0.0000140 | 0.0000029 | 850491/853209/853984/853207/852602/854644/850317/854525/853395/852128/856579 | 11 |
GO:1904659 | GO:1904659 | glucose transmembrane transport | 6/86 | 23/6436 | 0.2608696 | 19.522750 | 10.355603 | 4.00e-07 | 0.0002241 | 0.0000378 | 856640/855809/851352/850536/850317/851944 | 6 |
GO:0008645 | GO:0008645 | hexose transmembrane transport | 6/86 | 26/6436 | 0.2307692 | 17.270125 | 9.673535 | 9.00e-07 | 0.0004942 | 0.0000596 | 856640/855809/851352/850536/850317/851944 | 6 |
GO:0015749 | GO:0015749 | monosaccharide transmembrane transport | 6/86 | 26/6436 | 0.2307692 | 17.270125 | 9.673535 | 9.00e-07 | 0.0004942 | 0.0000596 | 856640/855809/851352/850536/850317/851944 | 6 |
GO:0015761 | GO:0015761 | mannose transmembrane transport | 5/86 | 16/6436 | 0.3125000 | 23.386628 | 10.433206 | 1.50e-06 | 0.0008183 | 0.0000695 | 856640/855809/851352/850536/851944 | 5 |
GO:0006096 | GO:0006096 | glycolytic process | 6/86 | 29/6436 | 0.2068966 | 15.483560 | 9.096695 | 1.80e-06 | 0.0009820 | 0.0000695 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0009137 | GO:0009137 | purine nucleoside diphosphate catabolic process | 6/86 | 29/6436 | 0.2068966 | 15.483560 | 9.096695 | 1.80e-06 | 0.0009820 | 0.0000695 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0009181 | GO:0009181 | purine ribonucleoside diphosphate catabolic process | 6/86 | 29/6436 | 0.2068966 | 15.483560 | 9.096695 | 1.80e-06 | 0.0009820 | 0.0000695 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0046032 | GO:0046032 | ADP catabolic process | 6/86 | 29/6436 | 0.2068966 | 15.483560 | 9.096695 | 1.80e-06 | 0.0009820 | 0.0000695 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0009191 | GO:0009191 | ribonucleoside diphosphate catabolic process | 6/86 | 30/6436 | 0.2000000 | 14.967442 | 8.923201 | 2.20e-06 | 0.0012057 | 0.0000784 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0009134 | GO:0009134 | nucleoside diphosphate catabolic process | 6/86 | 31/6436 | 0.1935484 | 14.484621 | 8.757833 | 2.70e-06 | 0.0014765 | 0.0000784 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0019364 | GO:0019364 | pyridine nucleotide catabolic process | 6/86 | 31/6436 | 0.1935484 | 14.484621 | 8.757833 | 2.70e-06 | 0.0014765 | 0.0000784 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0046031 | GO:0046031 | ADP metabolic process | 6/86 | 31/6436 | 0.1935484 | 14.484621 | 8.757833 | 2.70e-06 | 0.0014765 | 0.0000784 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0015755 | GO:0015755 | fructose transmembrane transport | 5/86 | 18/6436 | 0.2777778 | 20.788114 | 9.783121 | 2.80e-06 | 0.0015461 | 0.0000784 | 856640/855809/851352/850536/851944 | 5 |
GO:0009135 | GO:0009135 | purine nucleoside diphosphate metabolic process | 6/86 | 32/6436 | 0.1875000 | 14.031977 | 8.599957 | 3.30e-06 | 0.0017848 | 0.0000811 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0009179 | GO:0009179 | purine ribonucleoside diphosphate metabolic process | 6/86 | 32/6436 | 0.1875000 | 14.031977 | 8.599957 | 3.30e-06 | 0.0017848 | 0.0000811 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0072526 | GO:0072526 | pyridine-containing compound catabolic process | 6/86 | 33/6436 | 0.1818182 | 13.606765 | 8.449004 | 4.00e-06 | 0.0021503 | 0.0000932 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0009185 | GO:0009185 | ribonucleoside diphosphate metabolic process | 6/86 | 34/6436 | 0.1764706 | 13.206566 | 8.304467 | 4.80e-06 | 0.0025785 | 0.0001066 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0009154 | GO:0009154 | purine ribonucleotide catabolic process | 6/86 | 35/6436 | 0.1714286 | 12.829236 | 8.165888 | 5.70e-06 | 0.0030732 | 0.0001215 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0009261 | GO:0009261 | ribonucleotide catabolic process | 6/86 | 36/6436 | 0.1666667 | 12.472868 | 8.032855 | 6.70e-06 | 0.0036419 | 0.0001380 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0009132 | GO:0009132 | nucleoside diphosphate metabolic process | 6/86 | 37/6436 | 0.1621622 | 12.135764 | 7.904992 | 8.00e-06 | 0.0042926 | 0.0001561 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0006195 | GO:0006195 | purine nucleotide catabolic process | 6/86 | 38/6436 | 0.1578947 | 11.816401 | 7.781962 | 9.40e-06 | 0.0050339 | 0.0001761 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0006090 | GO:0006090 | pyruvate metabolic process | 6/86 | 40/6436 | 0.1500000 | 11.225581 | 7.549191 | 1.27e-05 | 0.0068383 | 0.0002305 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0009166 | GO:0009166 | nucleotide catabolic process | 6/86 | 42/6436 | 0.1428571 | 10.691030 | 7.332379 | 1.70e-05 | 0.0091310 | 0.0002969 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0046352 | GO:0046352 | disaccharide catabolic process | 4/86 | 13/6436 | 0.3076923 | 23.026834 | 9.251058 | 1.94e-05 | 0.0103718 | 0.0003258 | 853209/853207/852602/854644 | 4 |
GO:0072523 | GO:0072523 | purine-containing compound catabolic process | 6/86 | 45/6436 | 0.1333333 | 9.978295 | 7.033190 | 2.56e-05 | 0.0136792 | 0.0004046 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0000023 | GO:0000023 | maltose metabolic process | 4/86 | 14/6436 | 0.2857143 | 21.382060 | 8.884103 | 2.69e-05 | 0.0143191 | 0.0004046 | 853209/853207/852602/852601 | 4 |
GO:0009313 | GO:0009313 | oligosaccharide catabolic process | 4/86 | 14/6436 | 0.2857143 | 21.382060 | 8.884103 | 2.69e-05 | 0.0143191 | 0.0004046 | 853209/853207/852602/854644 | 4 |
GO:0006091 | GO:0006091 | generation of precursor metabolites and energy | 12/86 | 218/6436 | 0.0550459 | 4.119479 | 5.452826 | 2.82e-05 | 0.0149490 | 0.0004046 | 854695/850491/853984/852825/854350/855105/850317/854525/853395/852128/856579/855480 | 12 |
GO:1901292 | GO:1901292 | nucleoside phosphate catabolic process | 6/86 | 46/6436 | 0.1304348 | 9.761375 | 6.939647 | 2.91e-05 | 0.0154474 | 0.0004046 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0005984 | GO:0005984 | disaccharide metabolic process | 5/86 | 28/6436 | 0.1785714 | 13.363787 | 7.629673 | 2.92e-05 | 0.0154672 | 0.0004046 | 853209/853207/852602/854644/852601 | 5 |
GO:0009141 | GO:0009141 | nucleoside triphosphate metabolic process | 7/86 | 70/6436 | 0.1000000 | 7.483721 | 6.347119 | 3.61e-05 | 0.0190442 | 0.0004849 | 853984/850317/854525/853395/852128/856579/856074 | 7 |
GO:0015791 | GO:0015791 | polyol transmembrane transport | 4/86 | 16/6436 | 0.2500000 | 18.709302 | 8.253356 | 4.79e-05 | 0.0252210 | 0.0006229 | 856640/855809/850490/851352 | 4 |
GO:0009311 | GO:0009311 | oligosaccharide metabolic process | 5/86 | 31/6436 | 0.1612903 | 12.070518 | 7.189950 | 4.90e-05 | 0.0257639 | 0.0006229 | 853209/853207/852602/854644/852601 | 5 |
GO:0046034 | GO:0046034 | ATP metabolic process | 6/86 | 51/6436 | 0.1176471 | 8.804378 | 6.511478 | 5.31e-05 | 0.0278911 | 0.0006578 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:1902600 | GO:1902600 | proton transmembrane transport | 8/86 | 105/6436 | 0.0761905 | 5.701883 | 5.652843 | 7.11e-05 | 0.0372564 | 0.0008578 | 856640/855809/854695/851352/853207/850536/852601/851944 | 8 |
GO:0098662 | GO:0098662 | inorganic cation transmembrane transport | 10/86 | 172/6436 | 0.0581395 | 4.351000 | 5.183815 | 8.76e-05 | 0.0458013 | 0.0010301 | 856640/855809/854695/851352/853207/856260/850536/852601/851608/851944 | 10 |
GO:0009205 | GO:0009205 | purine ribonucleoside triphosphate metabolic process | 6/86 | 56/6436 | 0.1071429 | 8.018272 | 6.138334 | 9.08e-05 | 0.0473990 | 0.0010411 | 853984/850317/854525/853395/852128/856579 | 6 |
GO:0046496 | GO:0046496 | nicotinamide nucleotide metabolic process | 7/86 | 81/6436 | 0.0864198 | 6.467413 | 5.762398 | 9.29e-05 | 0.0484163 | 0.0010411 | 853984/850317/854525/853395/852128/856579/855480 | 7 |
# View Downregulated_genesBP as dataframe with scroll bars.
kable(Downregulated_genesBP) %>%
kable_styling() %>%
scroll_box(width = "1000px", height = "800px")
ID | Description | GeneRatio | BgRatio | RichFactor | FoldEnrichment | zScore | pvalue | p.adjust | qvalue | geneID | Count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
GO:0045333 | GO:0045333 | cellular respiration | 15/74 | 115/6436 | 0.1304348 | 11.344301 | 12.071201 | 0.00e+00 | 0.0000000 | 0.0000000 | 851013/855606/855866/856794/855969/855691/851726/856321/852235/854303/850361/853507/855732/854231/851111 | 15 |
GO:0009060 | GO:0009060 | aerobic respiration | 14/74 | 99/6436 | 0.1414141 | 12.299208 | 12.218477 | 0.00e+00 | 0.0000000 | 0.0000000 | 851013/855606/855866/856794/855691/851726/856321/852235/854303/850361/853507/855732/854231/851111 | 14 |
GO:0006099 | GO:0006099 | tricarboxylic acid cycle | 9/74 | 31/6436 | 0.2903226 | 25.250218 | 14.595878 | 0.00e+00 | 0.0000000 | 0.0000000 | 851013/855606/855866/856794/855691/851726/854303/850361/855732 | 9 |
GO:0015980 | GO:0015980 | energy derivation by oxidation of organic compounds | 15/74 | 165/6436 | 0.0909091 | 7.906634 | 9.692444 | 0.00e+00 | 0.0000002 | 0.0000000 | 851013/855606/855866/856794/855969/855691/851726/856321/852235/854303/850361/853507/855732/854231/851111 | 15 |
GO:0045229 | GO:0045229 | external encapsulating structure organization | 17/74 | 247/6436 | 0.0688259 | 5.985994 | 8.617540 | 0.00e+00 | 0.0000009 | 0.0000001 | 853766/856546/850992/856671/854421/855659/856541/852237/852459/853196/852897/856861/853282/855352/852012/852856/855355 | 17 |
GO:0071555 | GO:0071555 | cell wall organization | 17/74 | 247/6436 | 0.0688259 | 5.985994 | 8.617540 | 0.00e+00 | 0.0000009 | 0.0000001 | 853766/856546/850992/856671/854421/855659/856541/852237/852459/853196/852897/856861/853282/855352/852012/852856/855355 | 17 |
GO:0006091 | GO:0006091 | generation of precursor metabolites and energy | 16/74 | 218/6436 | 0.0733945 | 6.383337 | 8.720629 | 0.00e+00 | 0.0000012 | 0.0000001 | 851013/855606/855866/856794/855969/855691/851726/856321/852235/854303/854262/850361/853507/855732/854231/851111 | 16 |
GO:0071554 | GO:0071554 | cell wall organization or biogenesis | 18/74 | 297/6436 | 0.0606061 | 5.271089 | 8.127582 | 0.00e+00 | 0.0000022 | 0.0000002 | 853766/856546/850992/856671/854421/855659/856541/852237/852459/853196/852897/856861/853282/855352/852012/854475/852856/855355 | 18 |
GO:0005975 | GO:0005975 | carbohydrate metabolic process | 14/74 | 256/6436 | 0.0546875 | 4.756335 | 6.614291 | 9.00e-07 | 0.0005514 | 0.0000473 | 853972/850992/855606/851092/856671/856794/855659/853196/854262/855352/850361/855732/852856/855355 | 14 |
GO:0051301 | GO:0051301 | cell division | 15/74 | 296/6436 | 0.0506757 | 4.407414 | 6.472617 | 1.00e-06 | 0.0005617 | 0.0000473 | 856546/850992/855819/855427/852455/855659/855239/854666/852897/856861/853003/850334/855389/854475/852119 | 15 |
GO:0000920 | GO:0000920 | septum digestion after cytokinesis | 5/74 | 20/6436 | 0.2500000 | 21.743243 | 10.019655 | 2.40e-06 | 0.0013970 | 0.0001071 | 856546/850992/852455/856861/855389 | 5 |
GO:0019752 | GO:0019752 | carboxylic acid metabolic process | 14/74 | 370/6436 | 0.0378378 | 3.290869 | 4.894894 | 6.49e-05 | 0.0381142 | 0.0026827 | 855606/855866/856794/856539/856108/855691/851726/854303/850361/856745/854950/850295/855732/855786 | 14 |
# Plotting our results of Upregulated_genesBP in a barplot using ggplot2 and ggrepel for customization of the title.
Up_genesBP.plot = barplot(Upregulated_genesBP, showCategory = 30) + ggtitle("GO term Enrichment Analysis of Upregulated Genes (BP)") +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
# Extracting Up_genesBP.plot into a PNG file and saving it to our current directory.
#png("./Up_genesBP.png", res = 300, width = 3200, height = 5100)
# Renders the Up_genesBP.plot and writes it to the PNG file.
print(Up_genesBP.plot)
# "dev.off()" closes the graphics device, freeing up memory.
#dev.off()
# Plotting our results of Downregulated_genesBP in a barplot using ggplot2 and ggrepel for customization of the title.
Down_genesBP.plot = barplot(Downregulated_genesBP, showCategory = 20) + ggtitle("GO term Enrichment Analysis of Downregulated Genes (BP)") +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
# Extracting Down_genesBP.plot into a PNG file and saving it to our current directory.
#png("./Down_genesBP.png", res = 300, width = 3200, height = 5100)
# Renders the Down_genesBP.plot and writes it to the PNG file.
print(Down_genesBP.plot)
# "dev.off()" closes the graphics device, freeing up memory.
#dev.off()
Performing Gene Ontology (GO) enrichment analysis for Molecular Function (MF) terms using the enrichGO function from the clusterProfiler package. The enrichment analysis is conducted for Upregulated and Downregulated genes based on their ENTREZID identifiers. We have used the Saccharomyces cerevisiae (baker’s yeast) database from the org.Sc.sgd.db to map our genes ENTREZ IDs to GO terms specific to yeast. Additionally, we have used a pvalueCutoff of 0.05 and also applied the Holm-Bonferroni correction to adjust p-values for multiple testing as stated in the paper.
# Perform enrichGO for Upregulated genes.
Upregulated_genesMF = enrichGO(gene = Upregulated_genes, OrgDb = "org.Sc.sgd.db", keyType = "ENTREZID", ont = "MF", pvalueCutoff = 0.05, pAdjustMethod = "holm")
# Perform enrichGO for Downregulated genes.
Downregulated_genesMF = enrichGO(gene = Downregulated_genes, OrgDb = "org.Sc.sgd.db", keyType = "ENTREZID", ont = "MF", pvalueCutoff = 0.05, pAdjustMethod = "holm")
# View Upregulated_genesMF as dataframe with scroll bars.
kable(Upregulated_genesMF) %>%
kable_styling() %>%
scroll_box(width = "1000px", height = "800px")
ID | Description | GeneRatio | BgRatio | RichFactor | FoldEnrichment | zScore | pvalue | p.adjust | qvalue | geneID | Count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
GO:0015144 | GO:0015144 | carbohydrate transmembrane transporter activity | 8/86 | 30/6436 | 0.2666667 | 19.956589 | 12.110554 | 0.0000000 | 0.0000006 | 0.0000005 | 856640/855809/850490/851352/853207/850536/852601/851944 | 8 |
GO:0005351 | GO:0005351 | carbohydrate:proton symporter activity | 7/86 | 25/6436 | 0.2800000 | 20.954419 | 11.632762 | 0.0000000 | 0.0000045 | 0.0000012 | 856640/855809/851352/853207/850536/852601/851944 | 7 |
GO:0005402 | GO:0005402 | carbohydrate:monoatomic cation symporter activity | 7/86 | 25/6436 | 0.2800000 | 20.954419 | 11.632762 | 0.0000000 | 0.0000045 | 0.0000012 | 856640/855809/851352/853207/850536/852601/851944 | 7 |
GO:0015295 | GO:0015295 | solute:proton symporter activity | 7/86 | 34/6436 | 0.2058824 | 15.407661 | 9.801933 | 0.0000002 | 0.0000451 | 0.0000094 | 856640/855809/851352/853207/850536/852601/851944 | 7 |
GO:0015294 | GO:0015294 | solute:monoatomic cation symporter activity | 7/86 | 37/6436 | 0.1891892 | 14.158391 | 9.340803 | 0.0000004 | 0.0000831 | 0.0000140 | 856640/855809/851352/853207/850536/852601/851944 | 7 |
GO:0005353 | GO:0005353 | fructose transmembrane transporter activity | 5/86 | 15/6436 | 0.3333333 | 24.945736 | 10.804611 | 0.0000010 | 0.0001916 | 0.0000231 | 856640/855809/851352/850536/851944 | 5 |
GO:0015578 | GO:0015578 | mannose transmembrane transporter activity | 5/86 | 15/6436 | 0.3333333 | 24.945736 | 10.804611 | 0.0000010 | 0.0001916 | 0.0000231 | 856640/855809/851352/850536/851944 | 5 |
GO:0015293 | GO:0015293 | symporter activity | 7/86 | 43/6436 | 0.1627907 | 12.182802 | 8.561880 | 0.0000013 | 0.0002400 | 0.0000256 | 856640/855809/851352/853207/850536/852601/851944 | 7 |
GO:0005355 | GO:0005355 | glucose transmembrane transporter activity | 5/86 | 17/6436 | 0.2941176 | 22.010944 | 10.094225 | 0.0000021 | 0.0003803 | 0.0000363 | 856640/855809/851352/850536/851944 | 5 |
GO:0015145 | GO:0015145 | monosaccharide transmembrane transporter activity | 5/86 | 20/6436 | 0.2500000 | 18.709302 | 9.230408 | 0.0000050 | 0.0009182 | 0.0000660 | 856640/855809/851352/850536/851944 | 5 |
GO:0015149 | GO:0015149 | hexose transmembrane transporter activity | 5/86 | 20/6436 | 0.2500000 | 18.709302 | 9.230408 | 0.0000050 | 0.0009182 | 0.0000660 | 856640/855809/851352/850536/851944 | 5 |
GO:0051119 | GO:0051119 | sugar transmembrane transporter activity | 5/86 | 20/6436 | 0.2500000 | 18.709302 | 9.230408 | 0.0000050 | 0.0009182 | 0.0000660 | 856640/855809/851352/850536/851944 | 5 |
GO:0022853 | GO:0022853 | active monoatomic ion transmembrane transporter activity | 9/86 | 102/6436 | 0.0882353 | 6.603283 | 6.638046 | 0.0000074 | 0.0013308 | 0.0000898 | 856640/855809/854695/851352/853207/850536/852601/851608/851944 | 9 |
GO:0015291 | GO:0015291 | secondary active transmembrane transporter activity | 8/86 | 90/6436 | 0.0888889 | 6.652196 | 6.283840 | 0.0000233 | 0.0041626 | 0.0002623 | 856640/855809/851352/853207/850536/852601/853039/851944 | 8 |
GO:0015318 | GO:0015318 | inorganic molecular entity transmembrane transporter activity | 11/86 | 188/6436 | 0.0585106 | 4.378773 | 5.471470 | 0.0000357 | 0.0063505 | 0.0003755 | 856640/855809/854695/850490/851352/853207/856260/850536/852601/851608/851944 | 11 |
GO:0022890 | GO:0022890 | inorganic cation transmembrane transporter activity | 10/86 | 161/6436 | 0.0621118 | 4.648274 | 5.455444 | 0.0000500 | 0.0088551 | 0.0004937 | 856640/855809/854695/851352/853207/856260/850536/852601/851608/851944 | 10 |
GO:0015078 | GO:0015078 | proton transmembrane transporter activity | 8/86 | 103/6436 | 0.0776699 | 5.812599 | 5.729677 | 0.0000620 | 0.0109047 | 0.0005755 | 856640/855809/854695/851352/853207/850536/852601/851944 | 8 |
GO:0022804 | GO:0022804 | active transmembrane transporter activity | 10/86 | 176/6436 | 0.0568182 | 4.252114 | 5.090631 | 0.0001062 | 0.0185800 | 0.0009313 | 856640/855809/854695/851352/853207/850536/852601/851608/853039/851944 | 10 |
# View Downregulated_genesMF as dataframe with scroll bars.
kable(Downregulated_genesMF) %>%
kable_styling() %>%
scroll_box(width = "1000px", height = "800px")
ID | Description | GeneRatio | BgRatio | RichFactor | FoldEnrichment | zScore | pvalue | p.adjust | qvalue | geneID | Count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
GO:0016798 | GO:0016798 | hydrolase activity, acting on glycosyl bonds | 10/74 | 62/6436 | 0.1612903 | 14.027899 | 11.116229 | 0.0000000 | 0.0000002 | 0.0000002 | 856546/850992/856671/855659/852459/853196/855352/855389/852481/852856 | 10 |
GO:0004553 | GO:0004553 | hydrolase activity, hydrolyzing O-glycosyl compounds | 9/74 | 49/6436 | 0.1836735 | 15.974628 | 11.347467 | 0.0000000 | 0.0000005 | 0.0000002 | 856546/850992/856671/855659/853196/855352/855389/852481/852856 | 9 |
GO:0015926 | GO:0015926 | glucosidase activity | 5/74 | 30/6436 | 0.1666667 | 14.495496 | 7.990046 | 0.0000200 | 0.0029751 | 0.0008127 | 856546/855659/853196/855352/852856 | 5 |
GO:0022890 | GO:0022890 | inorganic cation transmembrane transporter activity | 9/74 | 161/6436 | 0.0559006 | 4.861843 | 5.351728 | 0.0000842 | 0.0124663 | 0.0022584 | 852599/856321/852235/852177/856779/854231/852149/851111/851560 | 9 |
GO:0008324 | GO:0008324 | monoatomic cation transmembrane transporter activity | 9/74 | 165/6436 | 0.0545455 | 4.743980 | 5.254125 | 0.0001018 | 0.0149688 | 0.0022584 | 852599/856321/852235/852177/856779/854231/852149/851111/851560 | 9 |
GO:0015075 | GO:0015075 | monoatomic ion transmembrane transporter activity | 9/74 | 173/6436 | 0.0520231 | 4.524605 | 5.067989 | 0.0001464 | 0.0213733 | 0.0022584 | 852599/856321/852235/852177/856779/854231/852149/851111/851560 | 9 |
GO:0015078 | GO:0015078 | proton transmembrane transporter activity | 7/74 | 103/6436 | 0.0679612 | 5.910785 | 5.418238 | 0.0001637 | 0.0237375 | 0.0022584 | 856321/852235/852177/854231/852149/851111/851560 | 7 |
GO:0046912 | GO:0046912 | acyltransferase activity, acyl groups converted into alkyl on transfer | 3/74 | 10/6436 | 0.3000000 | 26.091892 | 8.563600 | 0.0001653 | 0.0237985 | 0.0022584 | 855606/850361/855732 | 3 |
GO:0009055 | GO:0009055 | electron transfer activity | 5/74 | 46/6436 | 0.1086957 | 9.453584 | 6.205298 | 0.0001665 | 0.0238034 | 0.0022584 | 856321/852235/853507/854231/851111 | 5 |
GO:0015318 | GO:0015318 | inorganic molecular entity transmembrane transporter activity | 9/74 | 188/6436 | 0.0478723 | 4.163600 | 4.747700 | 0.0002742 | 0.0389364 | 0.0031277 | 852599/856321/852235/852177/856779/854231/852149/851111/851560 | 9 |
GO:0015453 | GO:0015453 | oxidoreduction-driven active transmembrane transporter activity | 4/74 | 30/6436 | 0.1333333 | 11.596396 | 6.273626 | 0.0003519 | 0.0496246 | 0.0031277 | 856321/852235/854231/851111 | 4 |
GO:0020037 | GO:0020037 | heme binding | 4/74 | 30/6436 | 0.1333333 | 11.596396 | 6.273626 | 0.0003519 | 0.0496246 | 0.0031277 | 852979/854950/853507/854231 | 4 |
GO:0046906 | GO:0046906 | tetrapyrrole binding | 4/74 | 30/6436 | 0.1333333 | 11.596396 | 6.273626 | 0.0003519 | 0.0496246 | 0.0031277 | 852979/854950/853507/854231 | 4 |
# Plotting our results of Upregulated_genesMF in a barplot using ggplot2 and ggrepel for customization of the title.
Up_genesMF.plot = barplot(Upregulated_genesMF, showCategory = 30) + ggtitle("GO term Enrichment Analysis of Upregulated Genes (MF)") +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
# Extracting Up_genesMF.plot into a PNG file and saving it to our current directory.
#png("./Up_genesMF.png", res = 300, width = 3200, height = 5100)
# Renders the Up_genesMF.plot and writes it to the PNG file.
print(Up_genesMF.plot)
# "dev.off()" closes the graphics device, freeing up memory.
#dev.off()
# Plotting our results of Downregulated_genesMF in a barplot using ggplot2 and ggrepel for customization of the title.
Down_genesMF.plot = barplot(Downregulated_genesMF, showCategory = 20) + ggtitle("GO term Enrichment Analysis of Downregulated Genes (MF)") +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
# Extracting Down_genesMF.plot into a PNG file and saving it to our current directory.
#png("./Down_genesMF.png", res = 300, width = 3200, height = 5100)
# Renders the Down_genesMF.plot and writes it to the PNG file.
print(Down_genesMF.plot)
# "dev.off()" closes the graphics device, freeing up memory.
#dev.off()
Performing Gene Ontology (GO) enrichment analysis for Cellular Component (CC) terms using the enrichGO function from the clusterProfiler package. The enrichment analysis is conducted for Upregulated and Downregulated genes based on their ENTREZID identifiers. We have used the Saccharomyces cerevisiae (baker’s yeast) database from the org.Sc.sgd.db to map our genes ENTREZ IDs to GO terms specific to yeast. Additionally, we have used a pvalueCutoff of 0.05 and also applied the Holm-Bonferroni correction to adjust p-values for multiple testing as stated in the paper.
# Perform enrichGO for Upregulated genes.
Upregulated_genesCC = enrichGO(gene = Upregulated_genes, OrgDb = "org.Sc.sgd.db", keyType = "ENTREZID", ont = "CC", pvalueCutoff = 0.05, pAdjustMethod = "holm")
# Perform enrichGO for Downregulated genes.
Downregulated_genesCC = enrichGO(gene = Downregulated_genes, OrgDb = "org.Sc.sgd.db", keyType = "ENTREZID", ont = "CC", pvalueCutoff = 0.05, pAdjustMethod = "holm")
# View Downregulated_genesCC as dataframe with scroll bars.
kable(Downregulated_genesCC) %>%
kable_styling() %>%
scroll_box(width = "1000px", height = "800px")
ID | Description | GeneRatio | BgRatio | RichFactor | FoldEnrichment | zScore | pvalue | p.adjust | qvalue | geneID | Count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
GO:0005576 | GO:0005576 | extracellular region | 17/74 | 121/6435 | 0.1404959 | 12.217445 | 13.434702 | 0.0000000 | 0.0000000 | 0.0000000 | 853766/856546/850992/856671/856794/854421/855659/855416/852459/853366/853196/852897/853282/855352/855389/852856/855355 | 17 |
GO:0009277 | GO:0009277 | fungal-type cell wall | 15/74 | 132/6435 | 0.1136364 | 9.881757 | 11.120028 | 0.0000000 | 0.0000000 | 0.0000000 | 853766/856546/850992/856671/854421/855659/855940/855416/852459/853196/853282/855352/855389/852856/855355 | 15 |
GO:0005618 | GO:0005618 | cell wall | 15/74 | 133/6435 | 0.1127820 | 9.807458 | 11.069573 | 0.0000000 | 0.0000000 | 0.0000000 | 853766/856546/850992/856671/854421/855659/855940/855416/852459/853196/853282/855352/855389/852856/855355 | 15 |
GO:0030312 | GO:0030312 | external encapsulating structure | 15/74 | 134/6435 | 0.1119403 | 9.734268 | 11.019651 | 0.0000000 | 0.0000000 | 0.0000000 | 853766/856546/850992/856671/854421/855659/855940/855416/852459/853196/853282/855352/855389/852856/855355 | 15 |
GO:0009986 | GO:0009986 | cell surface | 6/74 | 18/6435 | 0.3333333 | 28.986487 | 12.823669 | 0.0000000 | 0.0000033 | 0.0000004 | 854421/855659/853196/852897/855352/852856 | 6 |
GO:1990204 | GO:1990204 | oxidoreductase complex | 7/74 | 42/6435 | 0.1666667 | 14.493243 | 9.461987 | 0.0000004 | 0.0000402 | 0.0000040 | 856801/855691/851726/856321/852235/854303/854231 | 7 |
GO:0009295 | GO:0009295 | nucleoid | 6/74 | 28/6435 | 0.2142857 | 18.634170 | 10.085568 | 0.0000006 | 0.0000592 | 0.0000044 | 851013/855691/851726/852177/850295/856682 | 6 |
GO:0042645 | GO:0042645 | mitochondrial nucleoid | 6/74 | 28/6435 | 0.2142857 | 18.634170 | 10.085568 | 0.0000006 | 0.0000592 | 0.0000044 | 851013/855691/851726/852177/850295/856682 | 6 |
GO:0045239 | GO:0045239 | tricarboxylic acid cycle enzyme complex | 4/74 | 10/6435 | 0.4000000 | 34.783784 | 11.530959 | 0.0000032 | 0.0003212 | 0.0000218 | 855866/855691/851726/854303 | 4 |
GO:0070469 | GO:0070469 | respirasome | 6/74 | 41/6435 | 0.1463415 | 12.725775 | 8.123456 | 0.0000062 | 0.0006096 | 0.0000376 | 856321/852235/854950/853507/854231/851111 | 6 |
GO:0005621 | GO:0005621 | cellular bud scar | 4/74 | 14/6435 | 0.2857143 | 24.845560 | 9.633052 | 0.0000148 | 0.0014490 | 0.0000821 | 856546/855659/855389/855355 | 4 |
GO:0005933 | GO:0005933 | cellular bud | 12/74 | 257/6435 | 0.0466926 | 4.060364 | 5.400194 | 0.0000307 | 0.0029763 | 0.0001561 | 850992/856671/852455/854421/854666/856861/853282/850334/855389/854437/854903/852119 | 12 |
GO:0005750 | GO:0005750 | mitochondrial respiratory chain complex III | 3/74 | 10/6435 | 0.3000000 | 26.087838 | 8.562890 | 0.0001653 | 0.0158729 | 0.0007210 | 856321/852235/854231 | 3 |
GO:0045275 | GO:0045275 | respiratory chain complex III | 3/74 | 10/6435 | 0.3000000 | 26.087838 | 8.562890 | 0.0001653 | 0.0158729 | 0.0007210 | 856321/852235/854231 | 3 |
GO:0000307 | GO:0000307 | cyclin-dependent protein kinase holoenzyme complex | 4/74 | 28/6435 | 0.1428571 | 12.422780 | 6.533068 | 0.0002677 | 0.0251654 | 0.0010216 | 855819/855427/855239/853003 | 4 |
GO:0070069 | GO:0070069 | cytochrome complex | 4/74 | 28/6435 | 0.1428571 | 12.422780 | 6.533068 | 0.0002677 | 0.0251654 | 0.0010216 | 856321/852235/854231/851111 | 4 |
GO:0098800 | GO:0098800 | inner mitochondrial membrane protein complex | 6/74 | 86/6435 | 0.0697674 | 6.066939 | 5.101947 | 0.0004280 | 0.0393777 | 0.0014676 | 856321/852235/852177/854231/851111/851560 | 6 |
GO:0005759 | GO:0005759 | mitochondrial matrix | 10/74 | 244/6435 | 0.0409836 | 3.563912 | 4.403636 | 0.0004327 | 0.0393777 | 0.0014676 | 851013/855866/855691/851726/854303/850361/852177/850295/855732/856682 | 10 |
GO:0098552 | GO:0098552 | side of membrane | 6/74 | 89/6435 | 0.0674157 | 5.862436 | 4.981871 | 0.0005149 | 0.0463367 | 0.0016544 | 853766/856546/856671/855416/855389/855355 | 6 |
# Plotting our results of Downregulated_genesCC in a barplot using ggplot2 and ggrepel for customization of the title.
Down_genesCC.plot = barplot(Downregulated_genesCC, showCategory = 20) + ggtitle("GO term Enrichment Analysis of Downregulated Genes (CC)") +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
# Extracting Down_genesCC.plot into a PNG file and saving it to our current directory.
#png("./Down_genesCC.png", res = 300, width = 3200, height = 5100)
# Renders the Down_genesCC.plot and writes it to the PNG file.
print(Down_genesCC.plot)
# "dev.off()" closes the graphics device, freeing up memory.
#dev.off()
# View Upregulated_genesCC as dataframe.
as.data.frame(Upregulated_genesCC)
## [1] ID Description GeneRatio BgRatio RichFactor
## [6] FoldEnrichment zScore pvalue p.adjust qvalue
## [11] geneID Count
## <0 rows> (or 0-length row.names)
Performing KEGG pathway enrichment analysis for upregulated and downregulated genes using the enrichKEGG function from the clusterProfiler package. The pathway enrichment analysis is conducted for Upregulated and Downregulated genes based on NCBI gene identifiers (ENTREZ IDs) that match to their respective IDs in KEGG pathway database. organism = “sce” specifies that the organism is Saccharomyces cerevisiae. KEGG uses a three-letter organism code; “sce” corresponds to yeast. Additionally, we have used a pvalueCutoff of 0.05 and also applied the Holm-Bonferroni correction to adjust p-values for multiple testing as stated in the paper.
# Perform enrichKEGG for Upregulated_genes and store the results to Path_Upregulated_genes.
Path_Upregulated_genes = enrichKEGG(gene = Upregulated_genes, organism = "sce", keyType = "ncbi-geneid", pvalueCutoff = 0.05, pAdjustMethod = "holm", minGSSize = 5)
## Reading KEGG annotation online: "https://rest.kegg.jp/link/sce/pathway"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/sce"...
## Reading KEGG annotation online: "https://rest.kegg.jp/conv/ncbi-geneid/sce"...
# Perform enrichKEGG for Downregulated_genes and store the results to Path_Downregulated_genes.
Path_Downregulated_genes = enrichKEGG(gene = Downregulated_genes, organism = "sce", keyType = "ncbi-geneid", pvalueCutoff = 0.05, pAdjustMethod = "holm", minGSSize = 5)
# View Path_Upregulated_genes as dataframe with scroll bars.
kable(Path_Upregulated_genes) %>%
kable_styling() %>%
scroll_box(width = "1000px", height = "500px")
category | subcategory | ID | Description | GeneRatio | BgRatio | RichFactor | FoldEnrichment | zScore | pvalue | p.adjust | qvalue | geneID | Count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
sce00051 | Metabolism | Carbohydrate metabolism | sce00051 | Fructose and mannose metabolism - Saccharomyces cerevisiae (budding yeast) | 6/38 | 23/2545 | 0.2608696 | 17.471396 | 9.767760 | 0.0000006 | 0.0000241 | 0.0000162 | 855810/856639/850491/853984/850317/852128 | 6 |
sce00052 | Metabolism | Carbohydrate metabolism | sce00052 | Galactose metabolism - Saccharomyces cerevisiae (budding yeast) | 5/38 | 24/2545 | 0.2083333 | 13.952851 | 7.847977 | 0.0000196 | 0.0007438 | 0.0002576 | 853209/852602/854644/850317/852128 | 5 |
sce00500 | Metabolism | Carbohydrate metabolism | sce00500 | Starch and sucrose metabolism - Saccharomyces cerevisiae (budding yeast) | 5/38 | 41/2545 | 0.1219512 | 8.167523 | 5.695305 | 0.0002868 | 0.0106117 | 0.0022571 | 853209/852602/854644/850317/852128 | 5 |
sce04081 | NA | NA | sce04081 | Hormone signaling - Saccharomyces cerevisiae (budding yeast) | 3/38 | 10/2545 | 0.3000000 | 20.092105 | 7.446257 | 0.0003431 | 0.0123510 | 0.0022571 | 854052/854160/852102 | 3 |
# View Path_Downregulated_genes as dataframe with scroll bars.
kable(Path_Downregulated_genes) %>%
kable_styling() %>%
scroll_box(width = "1000px", height = "800px")
category | subcategory | ID | Description | GeneRatio | BgRatio | RichFactor | FoldEnrichment | zScore | pvalue | p.adjust | qvalue | geneID | Count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
sce01200 | Metabolism | Global and overview maps | sce01200 | Carbon metabolism - Saccharomyces cerevisiae (budding yeast) | 15/40 | 113/2545 | 0.1327434 | 8.445797 | 10.229473 | 0.0000000 | 0.0000000 | 0.0000000 | 853972/852979/851013/855606/851092/855866/856794/856108/855691/851726/854303/854262/850361/850295/855732 | 15 |
sce00020 | Metabolism | Carbohydrate metabolism | sce00020 | Citrate cycle (TCA cycle) - Saccharomyces cerevisiae (budding yeast) | 8/40 | 31/2545 | 0.2580645 | 16.419355 | 10.913124 | 0.0000000 | 0.0000004 | 0.0000001 | 853972/851013/855866/855691/851726/854303/850361/855732 | 8 |
sce00630 | Metabolism | Carbohydrate metabolism | sce00630 | Glyoxylate and dicarboxylate metabolism - Saccharomyces cerevisiae (budding yeast) | 6/40 | 29/2545 | 0.2068966 | 13.163793 | 8.323335 | 0.0000037 | 0.0001304 | 0.0000301 | 852979/851013/855606/856794/850361/855732 | 6 |
sce01210 | Metabolism | Global and overview maps | sce01210 | 2-Oxocarboxylic acid metabolism - Saccharomyces cerevisiae (budding yeast) | 6/40 | 42/2545 | 0.1428571 | 9.089286 | 6.678656 | 0.0000354 | 0.0012034 | 0.0002142 | 851013/855691/851726/854303/850361/855732 | 6 |
sce01110 | Metabolism | Global and overview maps | sce01110 | Biosynthesis of secondary metabolites - Saccharomyces cerevisiae (budding yeast) | 16/40 | 360/2545 | 0.0444444 | 2.827778 | 4.728611 | 0.0000455 | 0.0015000 | 0.0002201 | 853972/852979/851013/855606/851092/855866/856794/856539/855691/851726/854303/854262/850361/850295/855732/855786 | 16 |
sce00190 | Metabolism | Energy metabolism | sce00190 | Oxidative phosphorylation - Saccharomyces cerevisiae (budding yeast) | 7/40 | 79/2545 | 0.0886076 | 5.637658 | 5.290545 | 0.0001734 | 0.0055503 | 0.0006999 | 856321/852235/852177/853507/854231/851111/851560 | 7 |
sce01230 | Metabolism | Global and overview maps | sce01230 | Biosynthesis of amino acids - Saccharomyces cerevisiae (budding yeast) | 8/40 | 125/2545 | 0.0640000 | 4.072000 | 4.449930 | 0.0005516 | 0.0170996 | 0.0019078 | 851013/855691/854303/854262/850361/850295/855732/855786 | 8 |
# Plotting our results of Path_Upregulated_genes in a barplot using ggplot2 and ggrepel for customization of the title.
Up_genesPath_plot = barplot(Path_Upregulated_genes, showCategory = 30) + ggtitle("Pathway Enrichment Analysis of Upregulated Genes") +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
# Extracting Up_genesPath_plot into a PNG file and saving it to our current directory.
#png("./Up_genesPath_plot.png", res = 300, width = 3000, height = 1000)
# Renders the Up_genesPath_plot and writes it to the PNG file.
print(Up_genesPath_plot)
# "dev.off()" closes the graphics device, freeing up memory.
#dev.off()
# Plotting our results of Path_Downregulated_genes in a barplot using ggplot2 and ggrepel for customization of the title.
Down_genesPath_plot = barplot(Path_Downregulated_genes, showCategory = 20) + ggtitle("Pathway Enrichment Analysis of Downregulated Genes") + theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
# Extracting Down_genesPath_plot into a PNG file and saving it to our current directory.
#png("./Down_genesPath_plot.png", res = 300, width = 3500, height = 2200)
# Renders the Down_genesPath_plot and writes it to the PNG file.
print(Down_genesPath_plot)
# "dev.off()" closes the graphics device, freeing up memory.
#dev.off()