Data Visualization with ggplot2
Introduction to ggplot2
ggplot2 is a powerful package for creating beautiful visualizations in R. It’s based on the “Grammar of Graphics,” which means we build plots layer by layer. Think of it like making a sandwich:
- First, you need a base (the data)
- Then add your main ingredients (geometric shapes like points, lines, or bars)
- Finally, add your toppings (colors, labels, themes)
Let’s start by loading the required packages and our sample data:
# Load required packages for data visualization
library(tidyverse) # Includes ggplot2 and data manipulation tools
library(viridis) # Color-blind friendly color palettes
# Sample gene expression dataset, written row-wise: one row per gene with
# two control replicates, two treatment replicates, the chromosome the gene
# is located on, and its biological pathway.
gene_data <- tribble(
  ~gene_id, ~control_1, ~control_2, ~treated_1, ~treated_2, ~chromosome, ~pathway,
  "BRCA1",  100,        110,        200,        190,        "17",        "DNA repair",
  "TP53",   150,        140,        300,        280,        "17",        "Cell cycle",
  "EGFR",   80,         85,         90,         95,         "7",         "Growth",
  "KRAS",   200,        190,        180,        185,        "12",        "Signaling",
  "HER2",   120,        125,        240,        230,        "17",        "Growth"
)
# Reshape from wide (one column per sample) to long (one row per gene/sample
# pair). The four measurement columns sit next to each other in gene_data,
# so they are selected as a range; all other columns are carried along.
gene_data_long <- pivot_longer(
  gene_data,
  cols = control_1:treated_2,
  names_to = "sample",        # former column names end up here
  values_to = "expression"    # former cell values end up here
)
# Basic bar plot: one bar per gene showing its mean expression.
# The long data holds four rows per gene (one per sample). Drawing the raw
# values with stat = "identity" stacks those rows, so each bar would show the
# SUM over samples rather than an expression level. Summarising to the mean
# keeps the y-axis interpretable.
ggplot(gene_data_long, aes(x = gene_id, y = expression)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(
    title = "Gene Expression Levels",
    x = "Gene",
    y = "Mean Expression Level"
  )
# Grouped bar plot: one bar per gene and sample, placed side by side.
# geom_col() is the ggplot2 shorthand for geom_bar(stat = "identity"); with
# position = "dodge" every bar represents exactly one row, so no summing occurs.
ggplot(gene_data_long, aes(x = gene_id, y = expression, fill = sample)) +
  geom_col(position = "dodge") +
  labs(
    title = "Gene Expression Levels by Sample",
    x = "Gene",
    y = "Expression Level"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels
# Per-gene summary statistics: mean, standard deviation, and standard error
# of the mean (SD / sqrt(n)). Used below to draw the error bars.
gene_summary <- gene_data_long %>%
  group_by(gene_id) %>%
  summarise(
    mean_expr = mean(expression),
    sd_expr = sd(expression),
    sem_expr = sd_expr / sqrt(n())
  )
# Basic box plot showing the expression distribution of each gene
ggplot(gene_data_long, aes(x = gene_id, y = expression)) +
  geom_boxplot() +
  labs(
    title = "Distribution of Expression Levels",
    x = "Gene",
    y = "Expression Level"
  )
# Box plot with individual points and error bars (mean +/- SEM)
ggplot(gene_data_long, aes(x = gene_id, y = expression, fill = gene_id)) +
  # Outliers are hidden here because every observation is drawn by
  # geom_point() below; otherwise outliers would appear twice.
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  # height = 0: jitter horizontally only, so points keep their true y value
  # (position_jitter() also jitters vertically when height is omitted).
  geom_point(position = position_jitter(width = 0.2, height = 0)) +
  # Error bars come from the summary table, so this layer must not inherit
  # the y/fill mappings of the raw data.
  geom_errorbar(
    data = gene_summary,
    aes(
      x = gene_id,
      ymin = mean_expr - sem_expr,
      ymax = mean_expr + sem_expr
    ),
    inherit.aes = FALSE,
    width = 0.15,
    color = "red"
  ) +
  labs(
    title = "Gene Expression Distribution with Data Points",
    x = "Gene",
    y = "Expression Level"
  ) +
  theme_minimal() +
  theme(legend.position = "none") # Legend would only repeat the x-axis
# Average the two replicates of each condition so that every gene has a
# single control value and a single treated value.
expression_means <- mutate(
  gene_data,
  mean_control = (control_1 + control_2) / 2,
  mean_treated = (treated_1 + treated_2) / 2
)
# Control vs treated scatter plot with gene labels. Points above the dashed
# y = x reference line have a higher treated mean than control mean.
expression_means %>%
  ggplot(aes(x = mean_control, y = mean_treated, label = gene_id)) +
  geom_point() +
  geom_text(vjust = -0.5) + # Nudge labels above their points
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  ggtitle("Control vs Treated Expression") +
  xlab("Mean Control Expression") +
  ylab("Mean Treated Expression") +
  theme_minimal()
# Same comparison, with points and labels colored by biological pathway
expression_means %>%
  ggplot(
    aes(
      x = mean_control,
      y = mean_treated,
      color = pathway,
      label = gene_id
    )
  ) +
  geom_point(size = 3) + # Larger points so the colors are easy to tell apart
  geom_text(vjust = -0.5) +
  geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") +
  ggtitle("Control vs Treated Expression by Pathway") +
  xlab("Mean Control Expression") +
  ylab("Mean Treated Expression") +
  theme_minimal()
# Basic histogram of all expression values, split into 10 bins
ggplot(gene_data_long, aes(x = expression)) +
  geom_histogram(bins = 10) +
  labs(
    title = "Distribution of Expression Values",
    x = "Expression Level",
    y = "Count"
  )
# Histogram with density curve. The bars are drawn on the density scale so
# they share a y-axis with the kernel density estimate.
# after_stat(density) replaces the ..density.. notation, which is deprecated
# as of ggplot2 3.4.0.
ggplot(gene_data_long, aes(x = expression)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 10,
    fill = "lightblue",
    alpha = 0.7 # Semi-transparent bars
  ) +
  geom_density(color = "red") +
  labs(
    title = "Distribution of Expression Values with Density Curve",
    x = "Expression Level",
    y = "Density"
  ) +
  theme_minimal()
# Faceted histogram: one panel per sample, filled by sample
ggplot(gene_data_long, aes(x = expression, fill = sample)) +
  geom_histogram(
    bins = 10,
    alpha = 0.7,
    position = "identity" # Draw bars in place instead of stacking them
  ) +
  facet_wrap(~sample) +
  labs(
    title = "Expression Distribution by Sample",
    x = "Expression Level",
    y = "Count"
  ) +
  theme_minimal()
Basic ggplot2 Syntax
Every ggplot2 plot has three key components:
- Data: The dataset you want to visualize
- Aesthetics (aes): How your data maps to visual properties (x, y, color, size, etc.)
- Geometries (geom_*): The type of plot you want to create
The basic template is:
ggplot(data = your_data, aes(x = x_variable, y = y_variable)) +
geom_something()
1. Bar Plots
Let’s start with a simple bar plot showing gene expression levels:
# Basic bar plot of mean expression per gene.
# gene_data_long has four rows per gene (one per sample); with
# stat = "identity" those rows are stacked and the bar height becomes their
# sum. Taking the mean gives a bar that can be read as an expression level.
ggplot(gene_data_long, aes(x = gene_id, y = expression)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(
    title = "Gene Expression Levels",
    x = "Gene",
    y = "Mean Expression Level"
  )
# Grouped bar plot: bars for the individual samples side by side.
# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(gene_data_long, aes(x = gene_id, y = expression, fill = sample)) +
  geom_col(position = "dodge") +
  labs(
    title = "Gene Expression Levels by Sample",
    x = "Gene",
    y = "Expression Level"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels
2. Box Plots
Box plots are great for showing the distribution of data and identifying outliers:
# Summary statistics per gene: mean, standard deviation, and
# Standard Error of the Mean (SD / sqrt(n)). Feeds the error bars below.
gene_summary <- gene_data_long %>%
  group_by(gene_id) %>%
  summarise(
    mean_expr = mean(expression),
    sd_expr = sd(expression),
    sem_expr = sd_expr / sqrt(n())
  )
# Basic box plot
ggplot(gene_data_long, aes(x = gene_id, y = expression)) +
  geom_boxplot() +
  labs(
    title = "Distribution of Expression Levels",
    x = "Gene",
    y = "Expression Level"
  )
# Box plot with individual points and error bars (mean +/- SEM)
ggplot(gene_data_long, aes(x = gene_id, y = expression, fill = gene_id)) +
  # Alpha controls transparency. Outliers are suppressed in this layer
  # because geom_point() already draws every observation.
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  # Add individual points; height = 0 restricts the jitter to the x
  # direction so the plotted y values stay exact.
  geom_point(position = position_jitter(width = 0.2, height = 0)) +
  # The error bars use the summary table, hence inherit.aes = FALSE.
  geom_errorbar(
    data = gene_summary,
    aes(
      x = gene_id,
      ymin = mean_expr - sem_expr,
      ymax = mean_expr + sem_expr
    ),
    inherit.aes = FALSE,
    width = 0.15,
    color = "red"
  ) +
  labs(
    title = "Gene Expression Distribution with Data Points",
    x = "Gene",
    y = "Expression Level"
  ) +
  theme_minimal() +
  theme(legend.position = "none") # Remove legend
# Understanding the Box Plot Elements:
# - Box: Shows the Interquartile Range (IQR) - middle 50% of data
# - Line in box: Median
# - Whiskers: Extend to most extreme points within 1.5 * IQR
# - Points beyond whiskers: Potential outliers
3. Scatter Plots
Scatter plots are useful for showing relationships between two variables:
# Collapse the replicates: one mean per gene for each condition
expression_means <- mutate(
  gene_data,
  mean_control = (control_1 + control_2) / 2,
  mean_treated = (treated_1 + treated_2) / 2
)
# Basic scatter plot of treated against control means
expression_means %>%
  ggplot(aes(x = mean_control, y = mean_treated, label = gene_id)) +
  geom_point() +
  geom_text(vjust = -0.5) + # Place gene labels just above the points
  # Dashed y = x line: genes on it have equal control and treated means
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  ggtitle("Control vs Treated Expression") +
  xlab("Mean Control Expression") +
  ylab("Mean Treated Expression") +
  theme_minimal()
# Scatter plot with additional information: color encodes the pathway
expression_means %>%
  ggplot(
    aes(
      x = mean_control,
      y = mean_treated,
      color = pathway,
      label = gene_id
    )
  ) +
  geom_point(size = 3) +
  geom_text(vjust = -0.5) +
  geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") +
  ggtitle("Control vs Treated Expression by Pathway") +
  xlab("Mean Control Expression") +
  ylab("Mean Treated Expression") +
  theme_minimal()
4. Histograms
Histograms show the distribution of a continuous variable:
# Basic histogram
ggplot(gene_data_long, aes(x = expression)) +
  geom_histogram(bins = 10) +
  labs(
    title = "Distribution of Expression Values",
    x = "Expression Level",
    y = "Count"
  )
# Histogram with density curve.
# after_stat(density) puts the bars on the density scale so they are
# comparable with geom_density(); it supersedes the ..density.. notation,
# which is deprecated as of ggplot2 3.4.0.
ggplot(gene_data_long, aes(x = expression)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 10,
    fill = "lightblue",
    alpha = 0.7
  ) +
  geom_density(color = "red") +
  labs(
    title = "Distribution of Expression Values with Density Curve",
    x = "Expression Level",
    y = "Density"
  ) +
  theme_minimal()
# Faceted histogram: one panel per sample
ggplot(gene_data_long, aes(x = expression, fill = sample)) +
  geom_histogram(bins = 10, alpha = 0.7, position = "identity") +
  facet_wrap(~sample) +
  labs(
    title = "Expression Distribution by Sample",
    x = "Expression Level",
    y = "Count"
  ) +
  theme_minimal()
5. Heatmaps
Heatmaps are excellent for visualizing expression patterns across multiple conditions:
# Heatmap input: keep the gene id plus the four measurement columns, then
# reshape to one row per gene/sample pair.
heatmap_data <- gene_data %>%
  select(gene_id, control_1:treated_2) %>%
  pivot_longer(
    cols = !gene_id,
    names_to = "sample",
    values_to = "expression"
  )
# Heatmap: samples on the x-axis, genes on the y-axis, tile color = expression
heatmap_data %>%
  ggplot(aes(x = sample, y = gene_id, fill = expression)) +
  geom_tile() +
  scale_fill_viridis() + # Color-blind friendly palette
  ggtitle("Gene Expression Heatmap") +
  xlab("Sample") +
  ylab("Gene") +
  labs(fill = "Expression") + # Legend title
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Center and scale the expression values within each gene (z-scores), so
# the colors compare samples relative to that gene's own mean.
# scale() returns a one-column matrix; as.vector() flattens it back to a
# plain numeric vector.
heatmap_data_scaled <- heatmap_data %>%
  group_by(gene_id) %>%
  mutate(scaled_expression = as.vector(scale(expression))) %>%
  ungroup()
# Heatmap of the per-gene z-scores, using the "magma" viridis palette
heatmap_data_scaled %>%
  ggplot(aes(x = sample, y = gene_id, fill = scaled_expression)) +
  geom_tile() +
  scale_fill_viridis(option = "magma") +
  ggtitle("Gene Expression Heatmap (Scaled)") +
  xlab("Sample") +
  ylab("Gene") +
  labs(fill = "Z-score") + # Legend title
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Customizing Plots
You can customize almost every aspect of your plots. Here are some common modifications:
# Example of a highly customized plot: control vs treated per gene.
# Filling by `sample` would put exactly one observation in every box (each
# gene has one value per sample), which draws flat lines instead of boxes and
# does not match the "Control vs Treated" labelling. The condition is
# therefore derived from the sample name by dropping the replicate suffix
# ("control_1" -> "control") and used as the fill variable.
gene_data_long %>%
  mutate(condition = sub("_\\d+$", "", sample)) %>%
  ggplot(aes(x = gene_id, y = expression, fill = condition)) +
  # Outliers hidden: all observations are drawn by geom_point() below
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  # Jitter the points inside their own dodged box
  geom_point(position = position_jitterdodge(jitter.width = 0.2)) +
  scale_fill_viridis_d() + # Discrete color-blind friendly palette
  labs(
    title = "Gene Expression Analysis",
    subtitle = "Comparing Control and Treated Samples",
    x = "Gene",
    y = "Expression Level",
    fill = "Sample Type"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray50"),
    axis.title = element_text(size = 12),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )
Common Plot Modifications
Here are some useful modifications you might want to apply to your plots:
- Changing Themes:
# Same plot with different themes.
# The plot is stored once and each theme is added to the stored object.
base_plot <- ggplot(gene_data_long, aes(x = gene_id, y = expression)) +
  geom_boxplot()
# Default theme
base_plot
# Minimal theme
base_plot + theme_minimal()
# Classic theme
base_plot + theme_classic()
# Black and white theme
base_plot + theme_bw()
- Modifying Axes:
# Axis tweaks applied to the stored box plot. theme_minimal() is added
# before theme() so the individual overrides below are not reset by it.
base_plot +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 12, face = "bold"), # Bold axis titles
    axis.text.x = element_text(angle = 45, hjust = 1)    # Rotate x labels
  ) +
  scale_y_log10() # Log scale for y-axis
- Adding Labels and Annotations:
# Labels and annotations on the control vs treated scatter plot.
# Each layer is on its own line so the trailing comments do not swallow the
# layers that follow them.
ggplot(expression_means, aes(x = mean_control, y = mean_treated)) +
  geom_point() +
  geom_text(aes(label = gene_id), vjust = -0.5) + # Add labels
  geom_abline(
    intercept = 0,
    slope = 1,
    linetype = "dashed",
    color = "red"
  ) + # Add line
  annotate(
    "text",
    x = 100,
    y = 200,
    label = "Upregulated",
    color = "blue",
    fontface = "italic"
  ) + # Add annotation
  theme_minimal()
Practice Exercises
- Create a scatter plot showing the relationship between control_1 and control_2 values
- Make a box plot comparing expression levels across pathways
- Create a heatmap of log2-transformed expression values
- Make a histogram of fold changes between treated and control conditions
# Exercise 1: Scatter plot of technical replicates.
# Plots control replicate 2 against control replicate 1; the dashed y = x
# line marks where both replicates would have the same value.
gene_data %>%
  ggplot(aes(x = control_1, y = control_2, label = gene_id)) +
  geom_point() +
  geom_text(vjust = -0.5) + # Gene labels just above the points
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  ggtitle("Technical Replicates Comparison") +
  xlab("Control Replicate 1") +
  ylab("Control Replicate 2") +
  theme_minimal()
# Exercise 2: Box plot by pathway.
# One box per pathway, pooling all genes and samples that belong to it.
gene_data_long %>%
  ggplot(aes(x = pathway, y = expression, fill = pathway)) +
  geom_boxplot() +
  ggtitle("Expression Levels by Pathway") +
  xlab("Pathway") +
  ylab("Expression Level") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x labels
Tips for Creating Good Plots
- Keep it Simple
- Don’t overload your plots with unnecessary information
- Use clear, readable fonts and appropriate sizes
- Choose colors wisely (consider color-blind friendly options)
- Label Everything
- Always include axis labels
- Use informative titles
- Add units where appropriate
- Consider Your Audience
- Make sure your plot tells a clear story
- Use appropriate scales
- Add explanatory notes if needed
- Technical Tips
- Save high-resolution versions for publications
- Be consistent with styling across related plots
- Test your plots with different data sizes
Next Steps
After mastering these basics, you can explore:
- Interactive plots with plotly
- Multiple plots with patchwork
- Custom themes and color palettes
- Statistical visualizations
- Publication-ready figure preparation