Graphics (ggplot2)

Important

This course uses ggplot2 exclusively for all plotting and visualization. While base R has plotting functions like plot(), hist(), and boxplot(), we will only use ggplot2’s grammar of graphics approach. All assignments and exams will expect ggplot2 syntax for graphs.

Core Components

ggplot(data, aes(…))

Purpose: Initialize a ggplot object. Help: Type ?ggplot in the R console. Structure: Data + aesthetic mappings + layers.

Basic setup creates blank plot with axes:

library(ggplot2)
# Data and aesthetic mappings only; no geom layer has been added yet.
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Sepal.Width))

Add layers with +:

# The same base plot with a point layer appended via `+`.
ggplot(data = iris,
       mapping = aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point()

Save for reuse:

# Keep the data and mappings in `p` so several geoms can reuse them.
p <- ggplot(data = mtcars, mapping = aes(x = factor(cyl), y = mpg))
p + geom_boxplot()  # boxplot layer on the stored plot
p + geom_violin()   # violin layer on the same stored plot

aes(…)

Purpose: Map variables to visual properties. Help: Type ?aes in the R console. Common aesthetics: x, y, color, fill, shape, size, alpha.

Position aesthetics:

# x and y map the two variables to the horizontal and vertical positions
aes(x = Sepal.Length, y = Sepal.Width)

Color by group:

# color = Species additionally maps the Species variable to colour
aes(x = Sepal.Length, y = Sepal.Width, color = Species)

Multiple aesthetics:

# Several mappings at once: position (wt, mpg), colour (cyl as a factor), size (hp)
aes(x = wt, y = mpg, color = factor(cyl), size = hp)

Can set in ggplot() or individual geoms:

# Mappings supplied to the geom itself rather than to ggplot().
ggplot(data = iris) +
  geom_point(mapping = aes(x = Sepal.Length, y = Sepal.Width))

Geoms (Geometric Objects)

geom_histogram(binwidth, bins, aes(y = after_stat(…)))

Purpose: Create histograms. Help: Type ?geom_histogram in the R console. Key arguments:

  • bins: Number of bins (default 30)

  • binwidth: Width of bins

  • aes(y = after_stat(density)): Scale to density

Important

In this course, all histograms must include:

  1. Density scaling using aes(y = after_stat(density))

  2. A kernel density curve (red) using geom_density()

  3. A normal curve (blue) using stat_function()

This allows visual comparison of the data distribution with theoretical distributions.

Standard histogram format for this course:

# Calculate mean and sd first; they parameterize the normal curve below.
# NOTE: `data` and `variable` appear to be placeholders for your own data
# frame and column name.
xbar <- mean(data$variable)
s <- sd(data$variable)

# Create histogram with required overlays
ggplot(data, aes(x = variable)) +
  # Bars on the density scale so the two curves share the same y-axis
  geom_histogram(aes(y = after_stat(density)),
                bins = 24, fill = "grey", col = "black") +
  # Kernel density curve of the data (red)
  geom_density(col = "red", linewidth = 1) +
  # Normal curve using the sample mean and sd computed above (blue)
  stat_function(fun = dnorm,
               args = list(mean = xbar, sd = s),
               col = "blue", linewidth = 1) +
  ggtitle("Histogram with Density Curves")

Adding mean and median lines (often required):

# Calculate statistics: mean and median for the reference lines,
# mean and sd for the normal curve
xbar <- mean(data$variable)
xtilde <- median(data$variable)
s <- sd(data$variable)

# Full histogram with all required elements
ggplot(data, aes(x = variable)) +
  # Bars on the density scale so the curves share the same y-axis
  geom_histogram(aes(y = after_stat(density)),
                bins = 24, fill = "grey", col = "black") +
  # Vertical line at the mean (purple, dotted)
  geom_vline(xintercept = xbar, color = "purple",
             linetype = "dotted", linewidth = 1) +
  # Vertical line at the median (orange, dashed)
  geom_vline(xintercept = xtilde, color = "orange",
             linetype = "dashed", linewidth = 1) +
  # Kernel density curve of the data (red)
  geom_density(col = "red", linewidth = 1) +
  # Normal curve using the sample mean and sd (blue)
  stat_function(fun = dnorm,
               args = list(mean = xbar, sd = s),
               col = "blue", linewidth = 1) +
  ggtitle("Complete Histogram for Course")

By group:

# Density-scaled histogram of Sepal.Length, one panel and fill colour per species.
ggplot(data = iris, mapping = aes(x = Sepal.Length, fill = Species)) +
  facet_wrap(~ Species) +
  geom_histogram(mapping = aes(y = after_stat(density)),
                 bins = 15, position = "dodge") +
  ggtitle("Histograms by Group")

Faceted histograms with required density overlays:

# Calculate group-specific statistics; tapply() names each result by species
xbar <- tapply(iris$Sepal.Length, iris$Species, mean)
s <- tapply(iris$Sepal.Length, iris$Species, sd)

# Add normal density values to the dataset.
# Every row looks up the mean and sd of its own species by name, so no
# species label is hard-coded and the same code works for any number of
# groups. as.vector() drops the names/dims carried over from tapply().
iris$normal.density <- dnorm(
  iris$Sepal.Length,
  mean = as.vector(xbar[as.character(iris$Species)]),
  sd = as.vector(s[as.character(iris$Species)])
)

# Faceted histogram with both overlays: kernel density (red) and the
# per-species normal curve stored in iris$normal.density (blue).
ggplot(data = iris, mapping = aes(x = Sepal.Length)) +
  facet_wrap(~ Species) +
  geom_histogram(mapping = aes(y = after_stat(density)),
                 bins = 15, fill = "grey", colour = "black") +
  geom_density(colour = "red", linewidth = 1) +
  geom_line(mapping = aes(y = normal.density),
            colour = "blue", linewidth = 1) +
  ggtitle("Distribution of Sepal Length by Species")

# For datasets with only two groups (like CO2), the approach is simpler:

# Two-group example using CO2 dataset: mean and sd of uptake per Type
xbar <- tapply(X = CO2$uptake, INDEX = CO2$Type, FUN = mean)
s <- tapply(X = CO2$uptake, INDEX = CO2$Type, FUN = sd)

# With only two groups a single ifelse() picks the right normal curve per row
CO2$normal.density <- ifelse(
  test = CO2$Type == "Quebec",
  yes = dnorm(CO2$uptake, mean = xbar["Quebec"], sd = s["Quebec"]),
  no = dnorm(CO2$uptake, mean = xbar["Mississippi"], sd = s["Mississippi"])
)

# Side-by-side panels (one per Type) with kernel density (red) and the
# group normal curve stored in CO2$normal.density (blue).
ggplot(data = CO2, mapping = aes(x = uptake)) +
  facet_grid(. ~ Type) +
  geom_histogram(mapping = aes(y = after_stat(density)),
                 bins = 20, fill = "grey", colour = "black") +
  geom_density(colour = "red", linewidth = 1) +
  geom_line(mapping = aes(y = normal.density),
            colour = "blue", linewidth = 1) +
  ggtitle("Distribution of Uptake by Location Type")

geom_boxplot(outlier.shape, varwidth)

Purpose: Box and whisker plots. Help: Type ?geom_boxplot in the R console.

Basic boxplot:

# One box per species summarising Sepal.Length.
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  geom_boxplot()

Horizontal orientation:

# Same boxplot; coord_flip() swaps the axes so the boxes run horizontally.
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  geom_boxplot() +
  coord_flip()

With individual points:

# The boxplot's own outlier markers are switched off (outlier.shape = NA)
# because the jitter layer already draws every observation.
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(alpha = 0.5, width = 0.2)

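Variable width by sample size (iris has 50 flowers per species, so the boxes come out equally wide here; the effect shows with unequal group sizes):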

# varwidth = TRUE lets the box width reflect each group's sample size.
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  geom_boxplot(varwidth = TRUE)

geom_point(), geom_line()

Purpose: Scatterplots and line plots. Help: Type ?geom_point or ?geom_line in the R console.

Basic scatterplot:

# mpg against weight, one point per car.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point()

Customize points:

# Colour is mapped to cylinder count; size and alpha are fixed for all points.
ggplot(data = mtcars,
       mapping = aes(x = wt, y = mpg, color = factor(cyl))) +
  geom_point(alpha = 0.7, size = 3)

Add regression line:

# Scatterplot with a red least-squares line and no confidence band.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red", se = FALSE)

Time series (using built-in dataset):

# Convert the monthly ts object into a data frame with a Date column.
# Base R's as.Date() has no method for ts objects (add-on packages such as
# zoo supply one), so the dates are built explicitly: one per month,
# starting at the first period of the series.
data(AirPassengers)
first_period <- start(AirPassengers)  # c(year, month) of the first value
ts_data <- data.frame(
  date = seq(as.Date(sprintf("%d-%02d-01", first_period[1], first_period[2])),
             by = "month", length.out = length(AirPassengers)),
  value = as.numeric(AirPassengers)
)

# Line through the series with a point marking each observation.
ggplot(data = ts_data, mapping = aes(x = date, y = value)) +
  geom_line() +
  geom_point()

geom_smooth(method, formula, se)

Purpose: Add smoothed conditional means. Help: Type ?geom_smooth in the R console. Key arguments:

  • method: “lm”, “loess”, “gam”

  • se: Show confidence band (TRUE/FALSE)

Linear regression with confidence band:

# Linear fit; se is left at its default, so the confidence band is drawn.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm")

Without confidence band:

# Linear fit with the confidence band switched off (se = FALSE).
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")

By group:

# The colour mapping is set in ggplot(), so points and fitted lines are
# both split by cylinder count.
ggplot(data = mtcars,
       mapping = aes(x = wt, y = mpg, color = factor(cyl))) +
  geom_point() +
  geom_smooth(method = "lm")

stat_qq(), stat_qq_line()

Purpose: Q-Q plots for normality assessment. Help: Type ?stat_qq in the R console.

Basic Q-Q plot:

# Q-Q plot of 100 simulated normal draws with its reference line.
ggplot(data = data.frame(sample = rnorm(100)),
       mapping = aes(sample = sample)) +
  stat_qq() +
  stat_qq_line()

For residuals:

# Fit the regression and collect its residuals in a data frame for ggplot.
mod <- lm(mpg ~ wt, data = mtcars)
res_df <- data.frame(residuals = residuals(mod))

# Q-Q plot of the residuals with a red reference line.
ggplot(data = res_df, mapping = aes(sample = residuals)) +
  stat_qq() +
  stat_qq_line(colour = "red") +
  labs(title = "Normal Q-Q Plot of Residuals")

geom_ribbon(aes(ymin, ymax))

Purpose: Area plots with upper/lower bounds. Help: Type ?geom_ribbon in the R console. Use case: Custom confidence/prediction bands.

Setup for regression bands:

# Fit the simple regression whose bands will be drawn
mod <- lm(mpg ~ wt, data = mtcars)

# Grid of 100 evenly spaced weights across the observed range.
# length.out is spelled out in full rather than relying on partial
# matching of `length`.
new_data <- data.frame(wt = seq(min(mtcars$wt),
                                max(mtcars$wt),
                                length.out = 100))

# Confidence limits and prediction limits evaluated on the grid
pred_ci <- predict(mod, new_data, interval = "confidence")
pred_pi <- predict(mod, new_data, interval = "prediction")

# One data frame holding the grid, the fitted line and both sets of limits
plot_data <- cbind(new_data,
                   ci_fit = pred_ci[, "fit"],
                   ci_lwr = pred_ci[, "lwr"],
                   ci_upr = pred_ci[, "upr"],
                   pi_lwr = pred_pi[, "lwr"],
                   pi_upr = pred_pi[, "upr"])

Plot with bands:

# Layers are drawn in order: the wide prediction band first, the narrower
# confidence band on top of it, then the fitted line and the raw points.
ggplot(plot_data, aes(x = wt)) +
  geom_ribbon(aes(ymin = pi_lwr, ymax = pi_upr),
              fill = "grey90") +
  geom_ribbon(aes(ymin = ci_lwr, ymax = ci_upr),
              fill = "grey70") +
  # Line thickness is set with linewidth; `size` for lines is deprecated
  # since ggplot2 3.4.0.
  geom_line(aes(y = ci_fit), color = "blue", linewidth = 1) +
  # The observed cars come from mtcars, not from the prediction grid
  geom_point(data = mtcars, aes(y = mpg)) +
  labs(title = "Regression with CI and PI bands")

Categorical Data Visualization

geom_bar(), geom_col()

Purpose: Bar charts for categorical data. Help: Type ?geom_bar or ?geom_col in the R console.

  • geom_bar(): Counts categories (like table())

  • geom_col(): Uses pre-computed values

Count frequencies with geom_bar():

# Frequency bar chart: geom_bar() counts the cars in each cylinder group
ggplot(data = mtcars, mapping = aes(x = factor(cyl))) +
  geom_bar() +
  ggtitle("Distribution of Cylinder Counts") +
  xlab("Number of Cylinders") +
  ylab("Count")

Pre-computed values with geom_col():

# Create summary data: mean mpg for each cylinder count
cyl_means <- aggregate(mpg ~ cyl, data = mtcars, FUN = mean)

# geom_col() draws the pre-computed means as bar heights
ggplot(data = cyl_means, mapping = aes(x = factor(cyl), y = mpg)) +
  geom_col(fill = "steelblue") +
  ggtitle("Average MPG by Cylinder Count") +
  xlab("Number of Cylinders") +
  ylab("Mean MPG")

Grouped bar chart:

# Bars for the two transmission types are placed side by side within each
# cylinder group; the legend entries are relabelled.
ggplot(data = mtcars,
       mapping = aes(x = factor(cyl), fill = factor(am))) +
  geom_bar(position = "dodge") +
  scale_fill_discrete(labels = c("Automatic", "Manual")) +
  xlab("Cylinders") +
  labs(fill = "Transmission")

Stacked bar chart (proportions):

# position = "fill" stacks the bars and rescales each one to height 1,
# so the y-axis shows proportions.
ggplot(data = mtcars,
       mapping = aes(x = factor(cyl), fill = factor(am))) +
  geom_bar(position = "fill") +
  labs(fill = "Transmission") +
  ylab("Proportion")

Note

For pie charts, while possible in ggplot2 using coord_polar(), they are generally discouraged in statistical visualization. Bar charts are preferred for comparing categorical proportions.

Plot Customization

Labels and Titles

All labels at once:

# Every label of the plot set in a single labs() call.
p + labs(
  title = "Main Title",
  subtitle = "Subtitle",
  x = "X-axis label",
  y = "Y-axis label",
  caption = "Data source: ..."
)

Individual functions:

# One helper per label: axis labels first, then the title.
p +
  xlab("X-axis") +
  ylab("Y-axis") +
  ggtitle("Main Title")

Math expressions:

library(latex2exp)
# TeX() turns the LaTeX strings into expressions used as axis labels.
p +
  xlab(label = TeX("$\\beta_1$")) +
  ylab(label = TeX("$\\hat{y}$"))

Themes

Built-in themes:

# Each line draws the same plot `p` with a different built-in theme applied
p + theme_minimal()
p + theme_classic()
p + theme_bw()

Customize elements:

# Fine-tune individual theme elements: centred title, bold axis titles,
# tick-label size, and the legend moved below the plot.
p + theme(
  plot.title = element_text(size = 16, hjust = 0.5),
  axis.title = element_text(size = 14, face = "bold"),
  axis.text = element_text(size = 12),
  legend.position = "bottom"
)

Faceting

Separate panels by variable:

# One histogram panel per value of `group`.
ggplot(data = df, mapping = aes(x = score)) +
  facet_wrap(~ group) +
  geom_histogram()

Grid layout:

# Panel grid: rows follow `platform`, columns follow `group`.
ggplot(data = df, mapping = aes(x = score)) +
  facet_grid(platform ~ group) +
  geom_histogram()

Tables & Reporting

knitr::kable(x, caption, digits, format)

Purpose: Create formatted tables. Help: Type ?knitr::kable in the R console.

Basic table using built-in data:

library(knitr)
library(kableExtra)

# Summary statistics by species.
# aggregate() packs the values returned by FUN into a single matrix column.
# do.call(data.frame, ...) spreads them into ordinary columns
# (Sepal.Length.mean, Sepal.Length.sd, Sepal.Length.n) so that each
# statistic is a real column of the table.
summary_table <- do.call(
  data.frame,
  aggregate(Sepal.Length ~ Species, data = iris,
            FUN = function(x) c(mean = mean(x),
                                sd = sd(x),
                                n = length(x)))
)
# Render the summary as a captioned table, numbers rounded to 2 digits.
kable(x = summary_table,
      digits = 2,
      caption = "Summary Statistics by Species")

Enhanced styling:

# Same table with kableExtra styling: striped/hover rows, natural width,
# and the first column in bold.
kable(x = summary_table, caption = "Iris Summary by Species") %>%
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover")) %>%
  column_spec(column = 1, bold = TRUE)

gridExtra::grid.arrange(…)

Purpose: Arrange multiple plots. Help: Type ?gridExtra::grid.arrange in the R console.

Create and arrange plots:

library(gridExtra)

# First panel: histogram of Sepal.Length
p1 <- ggplot(data = iris, mapping = aes(x = Sepal.Length)) +
  geom_histogram(bins = 20) +
  ggtitle("Distribution of Sepal Length")

# Second panel: Sepal.Length boxplots by species
p2 <- ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  geom_boxplot() +
  ggtitle("Sepal Length by Species")

# Place the two plots side by side
grid.arrange(p1, p2, ncol = 2)

Different layouts:

# 2 x 2 grid reusing p1 and p2, with a shared heading above the panels.
grid.arrange(
  p1, p2, p1, p2,
  nrow = 2, ncol = 2,
  top = "Four Panel Display"
)