# ggplot2 is a powerful package for creating beautiful visualizations in R.
# It’s based on the “Grammar of Graphics,” which means we build plots layer
# by layer. Think of it like making a sandwich: you start with a base and
# stack one layer on top of another until the plot is complete.
#
# Let’s start by loading the required packages and our sample data:
# Load required R packages for visualization
library(tidyverse) # Includes ggplot2 and data manipulation tools
library(viridis) # Color-blind friendly color palettes
## Loading required package: viridisLite
# Sample gene expression dataset, written row-wise (one gene per row).
# Columns: gene name, two control replicates, two treatment replicates,
# chromosome location (stored as text), and biological pathway.
gene_data <- tribble(
  ~gene_id, ~control_1, ~control_2, ~treated_1, ~treated_2,
  ~chromosome, ~pathway,
  "BRCA1", 100, 110, 200, 190, "17", "DNA repair",
  "TP53",  150, 140, 300, 280, "17", "Cell cycle",
  "EGFR",   80,  85,  90,  95,  "7", "Growth",
  "KRAS",  200, 190, 180, 185, "12", "Signaling",
  "HER2",  120, 125, 240, 230, "17", "Growth"
)
# Convert data from wide to long format for plotting.
# The four replicate columns collapse into two columns: `sample` holds the
# former column name and `expression` holds the value, giving one row per
# gene/sample pair. The remaining columns are carried along unchanged.
gene_data_long <- gene_data %>%
  pivot_longer(
    cols = c(control_1, control_2, treated_1, treated_2), # Columns to convert
    names_to = "sample",      # New column for sample names
    values_to = "expression"  # New column for expression values
  )

# Every ggplot2 plot has three key components:
# 1. Data: The dataset you want to visualize
# 2. Aesthetics (aes): How your data maps to visual properties
#    (x, y, color, size, etc.)
# 3. Geometries (geom_*): The type of plot you want to create
# Create a simple bar plot of gene expression.
# Each gene has four rows in the long table (one per sample) and bars stack
# by default, so a bar's total height is the sum of those four values.
ggplot(gene_data_long, aes(x = gene_id, y = expression)) +
  geom_col() + # Heights come straight from y; replaces geom_bar(stat = "identity")
  labs(
    title = "Gene Expression Levels", # Plot title
    x = "Gene",                       # x-axis label
    y = "Expression Level"            # y-axis label
  )

# Create grouped bar plot showing expression by sample
# Grouped bars: filling by sample and dodging places each gene's samples
# side by side instead of stacking them.
ggplot(gene_data_long, aes(x = gene_id, y = expression, fill = sample)) +
  geom_col(position = "dodge") + # Replaces geom_bar(stat = "identity", ...)
  labs(
    title = "Gene Expression Levels by Sample", # Plot title
    x = "Gene",                                 # x-axis label
    y = "Expression Level"                      # y-axis label
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Tilt x labels

# Calculate summary statistics for each gene
# Per-gene mean, standard deviation, and standard error of the mean
# (standard deviation divided by the square root of the row count).
gene_summary <- summarise(
  group_by(gene_data_long, gene_id),
  mean_expr = mean(expression),
  sd_expr = sd(expression),
  sem_expr = sd_expr / sqrt(n()) # Reuses the SD computed on the line above
)
# Create basic box plot
# One box per gene, drawn from that gene's rows in the long table.
ggplot(
  data = gene_data_long,
  mapping = aes(x = gene_id, y = expression)
) +
  labs(
    title = "Distribution of Expression Levels",
    x = "Gene",
    y = "Expression Level"
  ) +
  geom_boxplot()

# Create enhanced box plot with points
# Box plot per gene with the individual measurements overlaid as points.
ggplot(gene_data_long, aes(x = gene_id, y = expression, fill = gene_id)) +
  geom_boxplot(alpha = 0.7) + # Semi-transparent boxes
  # Jitter horizontally only. Without height = 0, position_jitter() also
  # jitters vertically, moving points away from their measured values.
  geom_point(position = position_jitter(width = 0.2, height = 0)) +
  labs(
    title = "Gene Expression Distribution with Data Points",
    x = "Gene",
    y = "Expression Level"
  ) +
  theme_minimal() +
  theme(legend.position = "none") # Fill repeats the x-axis, so hide the legend

# Calculate mean expression values for control and treated conditions
# Append per-condition means to the wide table: each mean is the average of
# the two replicate columns for that condition.
expression_means <- mutate(
  gene_data,
  mean_control = (control_1 + control_2) / 2,
  mean_treated = (treated_1 + treated_2) / 2
)
# Create basic scatter plot
# Mean control (x) against mean treated (y). The dashed red line is y = x,
# so genes above it have a higher mean in the treated samples.
ggplot(
  data = expression_means,
  mapping = aes(x = mean_control, y = mean_treated)
) +
  geom_point() +
  geom_text(aes(label = gene_id), vjust = -0.5) + # Gene names above points
  geom_abline(
    slope = 1, intercept = 0,
    color = "red", linetype = "dashed"
  ) +
  theme_minimal() +
  labs(
    title = "Control vs Treated Expression",
    x = "Mean Control Expression",
    y = "Mean Treated Expression"
  )

# Create enhanced scatter plot with pathway information
# Control-vs-treated scatter with points and labels colored by pathway.
# The gray dashed line is the y = x reference.
ggplot(
  data = expression_means,
  mapping = aes(x = mean_control, y = mean_treated, color = pathway)
) +
  geom_point(size = 3) +
  geom_text(aes(label = gene_id), vjust = -0.5) + # Gene names above points
  geom_abline(
    slope = 1, intercept = 0,
    color = "gray", linetype = "dashed"
  ) +
  theme_minimal() +
  labs(
    title = "Control vs Treated Expression by Pathway",
    x = "Mean Control Expression",
    y = "Mean Treated Expression"
  )

# Create basic histogram of expression values
# Histogram of all expression values (every gene and sample pooled),
# split into 10 bins.
ggplot(data = gene_data_long, mapping = aes(x = expression)) +
  labs(
    title = "Distribution of Expression Values",
    x = "Expression Level",
    y = "Count"
  ) +
  geom_histogram(bins = 10)

# Create histogram with density curve
# Histogram drawn on the density scale with a density curve overlaid.
ggplot(gene_data_long, aes(x = expression)) +
  # after_stat(density) replaces the `..density..` dot-dot notation, which
  # was deprecated in ggplot2 3.4.0 and emitted a lifecycle warning.
  geom_histogram(
    aes(y = after_stat(density)), # Show density instead of count
    bins = 10, fill = "lightblue",
    alpha = 0.7                   # Semi-transparent bars
  ) +
  geom_density(color = "red") +   # Add density curve
  labs(
    title = "Distribution of Expression Values with Density Curve",
    x = "Expression Level",
    y = "Density"
  ) +
  theme_minimal()
# Create faceted histograms by sample
# One panel per sample; fill color also follows the sample.
ggplot(
  data = gene_data_long,
  mapping = aes(x = expression, fill = sample)
) +
  geom_histogram(position = "identity", alpha = 0.7, bins = 10) +
  facet_wrap(~sample) +
  theme_minimal() +
  labs(
    title = "Expression Distribution by Sample",
    x = "Expression Level",
    y = "Count"
  )

# Prepare data for heatmap visualization
# Keep only the gene name and the four replicate columns, then reshape to
# long format: one row per gene/sample pair with its expression value.
heatmap_data <- gene_data %>%
  select(gene_id, control_1:treated_2) %>%
  pivot_longer(
    cols = !gene_id, # Everything except the gene name
    names_to = "sample",
    values_to = "expression"
  )
# Create basic heatmap
# Samples on x, genes on y, tile color encodes the raw expression value.
ggplot(
  data = heatmap_data,
  mapping = aes(x = sample, y = gene_id, fill = expression)
) +
  geom_tile() +
  scale_fill_viridis() + # Colorblind-friendly fill scale
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # Tilt x labels
  labs(
    title = "Gene Expression Heatmap",
    x = "Sample",
    y = "Gene",
    fill = "Expression"
  )

# Create heatmap with Z-score normalized values
# Z-score each gene's expression across its samples. scale() returns a
# one-column matrix, so as.vector() flattens it back to a plain vector.
heatmap_data_scaled <- heatmap_data %>%
  group_by(gene_id) %>%
  mutate(scaled_expression = as.vector(scale(expression))) %>%
  ungroup() # Drop grouping so later steps see an ungrouped table
# Create enhanced heatmap with Z-scores
# Same layout as the raw heatmap, but tiles are colored by per-gene Z-score.
ggplot(
  data = heatmap_data_scaled,
  mapping = aes(x = sample, y = gene_id, fill = scaled_expression)
) +
  geom_tile() +
  scale_fill_viridis(option = "magma") + # Magma color palette
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # Tilt x labels
  labs(
    title = "Gene Expression Heatmap (Scaled)",
    x = "Sample",
    y = "Gene",
    fill = "Z-score"
  )

# Create a highly customized plot combining multiple elements
# Box plots per gene split by sample, with the data points overlaid and a
# customized title, legend, and axis text.
ggplot(gene_data_long, aes(x = gene_id, y = expression, fill = sample)) +
  geom_boxplot(alpha = 0.7) + # Semi-transparent boxplots
  # Jitter-dodge keeps each point over the box for its own sample
  geom_point(position = position_jitterdodge(jitter.width = 0.2)) +
  scale_fill_viridis_d() + # Discrete viridis colors
  labs(
    title = "Gene Expression Analysis",                 # Main title
    subtitle = "Comparing Control and Treated Samples", # Subtitle
    x = "Gene",                                         # x-axis label
    y = "Expression Level",                             # y-axis label
    fill = "Sample Type"                                # Legend title
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"), # Customize title
    axis.text.x = element_text(angle = 45, hjust = 1),   # Rotate x labels
    legend.position = "top"                              # Move legend to top
  )

# Here are some useful modifications you might want to apply to your plots:
# Same plot with different themes
# base_plot is stored once so variations can be layered on top of it.
base_plot <- ggplot(gene_data_long, aes(x = gene_id, y = expression)) +
  geom_boxplot()

# Default theme
base_plot

# Log-scaled y-axis with the minimal theme and restyled axis text
base_plot +
  scale_y_log10() + # Log scale for y-axis
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x labels
    axis.title = element_text(size = 12, face = "bold") # Bold axis titles
  )

# Scatter plot with gene labels, a y = x reference line, and a text
# annotation placed at (100, 200), which lies above that line
ggplot(expression_means, aes(x = mean_control, y = mean_treated)) +
  geom_point() +
  geom_text(aes(label = gene_id), vjust = -0.5) + # Add labels
  geom_abline(
    intercept = 0, slope = 1,
    linetype = "dashed", color = "red"
  ) + # Add line
  annotate(
    "text",
    x = 100, y = 200,
    label = "Upregulated", color = "blue", fontface = "italic"
  ) + # Add annotation
  theme_minimal()

# Exercise 1: Scatter plot of technical replicates
# Control replicate 1 (x) against control replicate 2 (y), one labeled point
# per gene, with a dashed red y = x line for reference.
ggplot(
  data = gene_data,
  mapping = aes(x = control_1, y = control_2)
) +
  geom_point() +
  geom_text(aes(label = gene_id), vjust = -0.5) + # Gene names above points
  geom_abline(
    slope = 1, intercept = 0,
    color = "red", linetype = "dashed"
  ) +
  theme_minimal() +
  labs(
    title = "Technical Replicates Comparison",
    x = "Control Replicate 1",
    y = "Control Replicate 2"
  )

# Exercise 2: Box plot by pathway
# One box per pathway, pooling every gene and sample in that pathway;
# fill color also follows the pathway.
ggplot(gene_data_long, aes(x = pathway, y = expression, fill = pathway)) +
  geom_boxplot() +
  labs(
    title = "Expression Levels by Pathway",
    x = "Pathway",
    y = "Expression Level"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Tilt x labels

# After mastering these basics, you can explore:
# - Interactive plots with plotly
# - Multiple plots with patchwork
# - Custom themes and color palettes
# - Statistical visualizations
# - Publication-ready figure preparation