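Data frames are one of the most commonly used data structures in R. They are particularly useful for working with tabular data, such as gene expression datasets, clinical data, or experimental results. A data frame is a 2-dimensional structure where: each row represents one observation (for example, a patient or a gene); each column represents one variable; all columns have the same length; and different columns can hold different data types (for example, character, numeric, or logical).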
# Create a simple data frame with patient information.
# Each argument to data.frame() becomes one column; all columns have 4 values.
df <- data.frame(
  patient_id = c("P001", "P002", "P003", "P004"), # Patient identifiers
  age = c(45, 52, 38, 61), # Patient ages
  diabetic = c(TRUE, FALSE, FALSE, TRUE) # Diabetes status
)
# Display the created data frame: a label first, then the table itself
print("Basic data frame:")
print(df) # Show the data
## [1] "Basic data frame:"
## patient_id age diabetic
## 1 P001 45 TRUE
## 2 P002 52 FALSE
## 3 P003 38 FALSE
## 4 P004 61 TRUE
# Examine the structure of the data frame
# print() shows a string with its escapes intact, so "\n" appears literally
# in the output rather than producing a blank line.
print("\nStructure of the data frame:")
str(df) # Show data types and structure
## [1] "\nStructure of the data frame:"
## 'data.frame': 4 obs. of 3 variables:
## $ patient_id: chr "P001" "P002" "P003" "P004"
## $ age : num 45 52 38 61
## $ diabetic : logi TRUE FALSE FALSE TRUE
# Create vectors for RNA-seq analysis results.
# All five vectors have one element per gene, in the same gene order.
gene_names <- c("BRCA1", "TP53", "EGFR", "KRAS", "HER2") # Gene identifiers
expression_values <- c(1543.7, 2345.2, 1234.5, 876.3, 2345.6) # Expression levels
is_significant <- c(TRUE, TRUE, FALSE, FALSE, TRUE) # Statistical significance
p_values <- c(0.001, 0.0001, 0.08, 0.07, 0.001) # Statistical p-values
fold_changes <- c(2.5, 3.2, 1.1, 0.9, 2.8) # Expression fold changes
# Combine vectors into a data frame for analysis (one column per vector)
gene_expression_df <- data.frame(
  gene = gene_names, # Gene names column
  expression = expression_values, # Expression values column
  significant = is_significant, # Significance flags column
  p_value = p_values, # P-values column
  fold_change = fold_changes # Fold changes column
)
# Display the resulting data frame: a label first, then the table itself
print("Gene expression data frame:")
print(gene_expression_df) # Show combined data
## [1] "Gene expression data frame:"
## gene expression significant p_value fold_change
## 1 BRCA1 1543.7 TRUE 1e-03 2.5
## 2 TP53 2345.2 TRUE 1e-04 3.2
## 3 EGFR 1234.5 FALSE 8e-02 1.1
## 4 KRAS 876.3 FALSE 7e-02 0.9
## 5 HER2 2345.6 TRUE 1e-03 2.8
# Save data frame to CSV file for sharing/storage.
# row.names = FALSE keeps the row numbers out of the file, so reading it back
# does not produce an extra column.
write.csv(gene_expression_df, "gene_expression.csv", row.names = FALSE) # Export data
# Read data back from CSV file
df_from_csv <- read.csv("gene_expression.csv") # Import data from file
print("Data frame read from CSV:")
print(df_from_csv) # Show imported data
## [1] "Data frame read from CSV:"
## gene expression significant p_value fold_change
## 1 BRCA1 1543.7 TRUE 1e-03 2.5
## 2 TP53 2345.2 TRUE 1e-04 3.2
## 3 EGFR 1234.5 FALSE 8e-02 1.1
## 4 KRAS 876.3 FALSE 7e-02 0.9
## 5 HER2 2345.6 TRUE 1e-03 2.8
# Display basic information about the data frame.
# Each label is followed by the call that produces the matching output below.
print("Dimensions of the data frame:")
print(dim(gene_expression_df)) # Show size of data (rows, columns)
print("\nColumn names:")
print(colnames(gene_expression_df)) # Show the column names
print("\nSummary of the data:")
print(summary(gene_expression_df)) # Per-column summary statistics
print("\nFirst few rows:")
print(head(gene_expression_df, 3)) # First 3 rows
print("\nLast few rows:")
print(tail(gene_expression_df, 2)) # Last 2 rows
## [1] "Dimensions of the data frame:"
## [1] 5 5
## [1] "\nColumn names:"
## [1] "gene" "expression" "significant" "p_value" "fold_change"
## [1] "\nSummary of the data:"
## gene expression significant p_value
## Length:5 Min. : 876.3 Mode :logical Min. :0.00010
## Class :character 1st Qu.:1234.5 FALSE:2 1st Qu.:0.00100
## Mode :character Median :1543.7 TRUE :3 Median :0.00100
## Mean :1669.1 Mean :0.03042
## 3rd Qu.:2345.2 3rd Qu.:0.07000
## Max. :2345.6 Max. :0.08000
## fold_change
## Min. :0.9
## 1st Qu.:1.1
## Median :2.5
## Mean :2.1
## 3rd Qu.:2.8
## Max. :3.2
## [1] "\nFirst few rows:"
## gene expression significant p_value fold_change
## 1 BRCA1 1543.7 TRUE 1e-03 2.5
## 2 TP53 2345.2 TRUE 1e-04 3.2
## 3 EGFR 1234.5 FALSE 8e-02 1.1
## [1] "\nLast few rows:"
## gene expression significant p_value fold_change
## 4 KRAS 876.3 FALSE 0.070 0.9
## 5 HER2 2345.6 TRUE 0.001 2.8
# Demonstrate different methods to access columns
# Using $ notation (direct column access; the result is a plain vector)
expression_values <- gene_expression_df$expression # Get expression column
print("Expression values using $:")
print(expression_values)
## [1] "Expression values using $:"
## [1] 1543.7 2345.2 1234.5 876.3 2345.6
# Using column name with brackets (returns a one-column data frame,
# not a vector)
p_values <- gene_expression_df["p_value"] # Get p-value column
print("\nP-values using brackets:")
print(p_values)
## [1] "\nP-values using brackets:"
## p_value
## 1 1e-03
## 2 1e-04
## 3 8e-02
## 4 7e-02
## 5 1e-03
# Select multiple columns at once by passing a character vector of names
selected_cols <- gene_expression_df[c("gene", "fold_change")] # Get gene names and fold changes
print("\nSelected columns:")
print(selected_cols)
## [1] "\nSelected columns:"
## gene fold_change
## 1 BRCA1 2.5
## 2 TP53 3.2
## 3 EGFR 1.1
## 4 KRAS 0.9
## 5 HER2 2.8
# Demonstrate row access methods
# Get a single row (the empty column index after the comma keeps all columns)
row_1 <- gene_expression_df[1, ] # First row
print("First row:")
print(row_1)
## [1] "First row:"
## gene expression significant p_value fold_change
## 1 BRCA1 1543.7 TRUE 0.001 2.5
# Get multiple consecutive rows with a range of row numbers
rows_2_4 <- gene_expression_df[2:4, ] # Rows 2 through 4
print("\nRows 2-4:")
print(rows_2_4)
## [1] "\nRows 2-4:"
## gene expression significant p_value fold_change
## 2 TP53 2345.2 TRUE 1e-04 3.2
## 3 EGFR 1234.5 FALSE 8e-02 1.1
## 4 KRAS 876.3 FALSE 7e-02 0.9
# Access individual elements using different methods
# Using row and column numbers
element <- gene_expression_df[1, 2] # First row, second column
print("Element at row 1, column 2:")
print(element)
## [1] "Element at row 1, column 2:"
## [1] 1543.7
# Using row number and column name
p_val <- gene_expression_df[2, "p_value"] # Second row, p-value column
print("\nP-value for second gene:")
print(p_val)
## [1] "\nP-value for second gene:"
## [1] 1e-04
# Get a subset of rows and columns
# NOTE: the variable name `subset` is also the name of base R's subset()
# function. Calls such as subset(x, ...) still find the function, but the
# shared name can confuse readers.
subset <- gene_expression_df[1:2, c("gene", "expression")] # First 2 rows, selected columns
print("\nSubset of first two genes with names and expression:")
print(subset)
## [1] "\nSubset of first two genes with names and expression:"
## gene expression
## 1 BRCA1 1543.7
## 2 TP53 2345.2
# Filter data based on various conditions
# Find genes with statistical significance: the logical vector in the row
# position keeps only the rows where the condition is TRUE.
significant_genes <- gene_expression_df[gene_expression_df$significant == TRUE, ]
print("Significantly expressed genes:")
print(significant_genes)
## [1] "Significantly expressed genes:"
## gene expression significant p_value fold_change
## 1 BRCA1 1543.7 TRUE 1e-03 2.5
## 2 TP53 2345.2 TRUE 1e-04 3.2
## 5 HER2 2345.6 TRUE 1e-03 2.8
# Find genes with high expression levels (expression strictly above 2000)
high_expression <- gene_expression_df[gene_expression_df$expression > 2000, ]
print("\nHighly expressed genes:")
print(high_expression)
## [1] "\nHighly expressed genes:"
## gene expression significant p_value fold_change
## 2 TP53 2345.2 TRUE 1e-04 3.2
## 5 HER2 2345.6 TRUE 1e-03 2.8
# Combine multiple filtering conditions with the elementwise & operator
important_genes <- gene_expression_df[
  gene_expression_df$significant == TRUE & # Must be significant
    gene_expression_df$fold_change > 2.5, # Must have fold change strictly above 2.5
]
print("\nSignificant genes with high fold change:")
print(important_genes)
## [1] "\nSignificant genes with high fold change:"
## gene expression significant p_value fold_change
## 2 TP53 2345.2 TRUE 1e-04 3.2
## 5 HER2 2345.6 TRUE 1e-03 2.8
# Demonstrate which() function for finding matching elements
# Get indices of significant genes: which() returns the positions of the
# TRUE values in a logical vector.
sig_indices <- which(gene_expression_df$significant) # Find TRUE values
print("Indices of significant genes:")
print(sig_indices)
## [1] "Indices of significant genes:"
## [1] 1 2 5
# Use indices to extract rows
sig_genes <- gene_expression_df[sig_indices, ] # Get corresponding rows
print("\nSignificant genes (using which):")
print(sig_genes)
## [1] "\nSignificant genes (using which):"
## gene expression significant p_value fold_change
## 1 BRCA1 1543.7 TRUE 1e-03 2.5
## 2 TP53 2345.2 TRUE 1e-04 3.2
## 5 HER2 2345.6 TRUE 1e-03 2.8
# Add new calculated columns to the data frame
# Add log2 transformed expression values (assigning to a new $name creates
# the column)
gene_expression_df$log2_expression <- log2(gene_expression_df$expression) # Log transformation
# Add multiple columns simultaneously; each named argument becomes a column
gene_expression_df <- cbind(
  gene_expression_df,
  neg_log10_pvalue = -log10(gene_expression_df$p_value), # Transform p-values
  pass_filter = gene_expression_df$p_value < 0.05 # TRUE where p-value is below 0.05
)
print("Data frame with new columns:")
print(gene_expression_df)
## [1] "Data frame with new columns:"
## gene expression significant p_value fold_change log2_expression
## 1 BRCA1 1543.7 TRUE 1e-03 2.5 10.592177
## 2 TP53 2345.2 TRUE 1e-04 3.2 11.195495
## 3 EGFR 1234.5 FALSE 8e-02 1.1 10.269711
## 4 KRAS 876.3 FALSE 7e-02 0.9 9.775281
## 5 HER2 2345.6 TRUE 1e-03 2.8 11.195741
## neg_log10_pvalue pass_filter
## 1 3.000000 TRUE
## 2 4.000000 TRUE
## 3 1.096910 FALSE
## 4 1.154902 FALSE
## 5 3.000000 TRUE
# Create and add a new row of data
# Create data for new gene as a one-row data frame with the same eight
# columns that gene_expression_df has at this point.
new_gene <- data.frame(
  gene = "BRAF", # Gene name
  expression = 1876.5, # Expression value
  significant = TRUE, # Significance flag
  p_value = 0.002, # P-value
  fold_change = 2.1, # Fold change
  log2_expression = log2(1876.5), # Log2 expression
  neg_log10_pvalue = -log10(0.002), # Transformed p-value
  pass_filter = TRUE # Filter status
)
# Add new row to existing data frame
gene_expression_df <- rbind(gene_expression_df, new_gene) # Combine rows
print("Data frame with new row:")
print(gene_expression_df)
## [1] "Data frame with new row:"
## gene expression significant p_value fold_change log2_expression
## 1 BRCA1 1543.7 TRUE 1e-03 2.5 10.592177
## 2 TP53 2345.2 TRUE 1e-04 3.2 11.195495
## 3 EGFR 1234.5 FALSE 8e-02 1.1 10.269711
## 4 KRAS 876.3 FALSE 7e-02 0.9 9.775281
## 5 HER2 2345.6 TRUE 1e-03 2.8 11.195741
## 6 BRAF 1876.5 TRUE 2e-03 2.1 10.873829
## neg_log10_pvalue pass_filter
## 1 3.000000 TRUE
## 2 4.000000 TRUE
## 3 1.096910 FALSE
## 4 1.154902 FALSE
## 5 3.000000 TRUE
## 6 2.698970 TRUE
# Update existing values in the data frame
# Change a single value
gene_expression_df$expression[1] <- 1600 # Update first expression value
# Recompute the derived column so it stays in sync with the edited value;
# otherwise row 1 would still hold log2 of the old expression value.
gene_expression_df$log2_expression <- log2(gene_expression_df$expression)
# Update multiple values based on condition
gene_expression_df$significant[gene_expression_df$p_value > 0.05] <- FALSE # Update significance
print("Modified data frame:")
print(gene_expression_df)
## [1] "Modified data frame:"
## gene expression significant p_value fold_change log2_expression
## 1 BRCA1 1600.0 TRUE 1e-03 2.5 10.643856
## 2 TP53 2345.2 TRUE 1e-04 3.2 11.195495
## 3 EGFR 1234.5 FALSE 8e-02 1.1 10.269711
## 4 KRAS 876.3 FALSE 7e-02 0.9 9.775281
## 5 HER2 2345.6 TRUE 1e-03 2.8 11.195741
## 6 BRAF 1876.5 TRUE 2e-03 2.1 10.873829
## neg_log10_pvalue pass_filter
## 1 3.000000 TRUE
## 2 4.000000 TRUE
## 3 1.096910 FALSE
## 4 1.154902 FALSE
## 5 3.000000 TRUE
## 6 2.698970 TRUE
Exercise: Create a data frame containing clinical trial data with the following columns: patient ID, treatment group (A or B), age, and response (TRUE/FALSE). Then filter for responding patients over 50.
# Create the data frame: 10 patients, the first five in treatment group A
# and the last five in group B.
clinical_data <- data.frame(
  patient_id = paste0("P", 1:10),
  treatment = rep(c("A", "B"), each = 5),
  age = c(45, 62, 55, 48, 71, 52, 67, 43, 58, 49),
  response = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
)
# Filter for responding patients over 50 (both conditions must hold)
responders_over_50 <- clinical_data[
  clinical_data$response == TRUE & clinical_data$age > 50,
]
print("Responding patients over 50:")
print(responders_over_50)
## [1] "Responding patients over 50:"
## patient_id treatment age response
## 2 P2 A 62 TRUE
## 5 P5 A 71 TRUE
## 7 P7 B 67 TRUE
## 9 P9 B 58 TRUE
Exercise: Using the gene expression data frame: (1) calculate z-scores for the expression values; (2) add a column for expression categories (High, Medium, Low); (3) filter for genes that are both significant and in the High category.
# Calculate z-scores: distance of each expression value from the mean,
# measured in standard deviations.
mean_expr <- mean(gene_expression_df$expression)
sd_expr <- sd(gene_expression_df$expression)
gene_expression_df$z_score <- (gene_expression_df$expression - mean_expr) / sd_expr
# Add expression categories by binning the z-scores.
# cut() intervals are closed on the right by default, so the bins are
# z <= -1 ("Low"), -1 < z <= 1 ("Medium") and z > 1 ("High").
gene_expression_df$expr_category <- cut(
  gene_expression_df$z_score,
  breaks = c(-Inf, -1, 1, Inf),
  labels = c("Low", "Medium", "High")
)
# Filter for interesting genes: significant and in the "High" category
interesting_genes <- gene_expression_df[
  gene_expression_df$significant == TRUE &
    gene_expression_df$expr_category == "High",
]
print("Genes with high expression and significance:")
print(interesting_genes)
## [1] "Genes with high expression and significance:"
## gene expression significant p_value fold_change log2_expression
## 2 TP53 2345.2 TRUE 1e-04 3.2 11.19550
## 5 HER2 2345.6 TRUE 1e-03 2.8 11.19574
## neg_log10_pvalue pass_filter z_score expr_category
## 2 4 TRUE 1.063014 High
## 5 3 TRUE 1.063687 High
# Keep data frame structure with drop = FALSE: without it, selecting a single
# column with [, "name"] would return a plain vector instead of a data frame.
single_col_df <- gene_expression_df[, "expression", drop = FALSE]
print("Single column as data frame:")
print(single_col_df)
## [1] "Single column as data frame:"
## expression
## 1 1600.0
## 2 2345.2
## 3 1234.5
## 4 876.3
## 5 2345.6
## 6 1876.5
# Count missing values (NA) in each column: is.na() marks each NA cell and
# colSums() adds the marks up per column.
print("Missing values in each column:")
print(colSums(is.na(gene_expression_df)))
## [1] "Missing values in each column:"
## gene expression significant p_value
## 0 0 0 0
## fold_change log2_expression neg_log10_pvalue pass_filter
## 0 0 0 0
## z_score expr_category
## 0 0
After mastering data frames, you can move on to: data manipulation with dplyr; data visualization with ggplot2; statistical analysis of data frames; and advanced filtering and aggregation techniques.