Data Vis Chapter 6

Last updated on Sep 26, 2019 9 min read R

Show Several Fits at Once, with a Legend
Model-based Graphics
Tidy Model Objects with Broom
- get component-level statistics with tidy()
- Get observation-level statistics with augment()
Grouped Analysis
Plots for Surveys

p <-  ggplot(data = gapminder,
             mapping = aes(x = log(gdpPercap), y = lifeExp))

p + geom_point(alpha = 0.1) +
  geom_smooth(color = "tomato",
              fill = "tomato",
              method = MASS::rlm) + #robust regression line
  geom_smooth(color = "steelblue",
              fill = "steelblue",
              method = "lm")

p + geom_point(alpha = 0.1) +
  geom_smooth(
    color = "tomato",
    method = "lm",
    size = 1.2,
    formula = y ~ splines::bs(x, 3),
    se = FALSE
  )

p + geom_point(alpha = 0.1) +
  geom_quantile( # specialized version of geom)smooth that can fit quantile regression
    color = "tomato",
    size = 1.2,
    method = "rqss",
    lambda = 1,
    quantiles = c(0.20, 0.5, 0.85)
  )

## Smoothing formula not specified. Using: y ~ qss(x, lambda = 1)

Show Several Fits at Once, with a Legend

model_colors <- RColorBrewer::brewer.pal(3, "Set1")
model_colors

## [1] "#E41A1C" "#377EB8" "#4DAF4A"

p0 <- ggplot(data = gapminder,
             mapping = aes(x = log(gdpPercap), y = lifeExp))

p1 <- p0 + geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", aes(color = "OLS", fill = "OLS")) +
  geom_smooth(
    method = "lm",
    formula = y ~ splines::bs(x, df = 3),
    aes(color = "Cubic Spline", fill = "Cubic Spline")
  ) +
  geom_smooth(method = "loess",
              aes(color = "LOESS", fill = "LOESS"))

p1 + scale_color_manual(name = "Models", values = model_colors) +
  scale_fill_manual(name = "Models", values = model_colors) +
  theme(legend.position = "top")

Model-based Graphics

min_gdp <- min(gapminder$gdpPercap)
max_gdp <- max(gapminder$gdpPercap)
med_pop <- median(gapminder$pop)

pred_df <- expand.grid(gdpPercap = (seq(from = min_gdp, to = max_gdp,
length.out = 100)), pop = med_pop, continent = c("Africa",
"Americas", "Asia", "Europe", "Oceania"))

dim(pred_df)

## [1] 500   3

out <- lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)

pred_out <- predict(object = out, newdata = pred_df, interval = "predict")
pred_df <- cbind(pred_df, pred_out)

p <-
  ggplot(
    data = subset(pred_df, continent %in% c("Europe", "Africa")),
    aes(
      x = gdpPercap,
      y = fit,
      ymin = lwr,
      ymax = upr,
      color = continent,
      fill = continent,
      group = continent
    )
  )

p + geom_point(
  data = subset(gapminder,
                continent %in% c("Europe", "Africa")),
  aes(x = gdpPercap, y = lifeExp,
      color = continent),
  alpha = 0.5,
  inherit.aes = FALSE
) +
  geom_line() +
  geom_ribbon(alpha = 0.2, color = FALSE) +
  scale_x_log10(labels = scales::dollar)

Tidy Model Objects with Broom

get component-level statistics with tidy()

library(broom)
out_comp <- tidy(out)
out_comp %>% round_df()

## # A tibble: 7 x 5
##   term              estimate std.error statistic p.value
##   <chr>                <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)          47.8      0.34     141.         0
## 2 gdpPercap             0        0         19.2        0
## 3 pop                   0        0          3.33       0
## 4 continentAmericas    13.5      0.6       22.5        0
## 5 continentAsia         8.19     0.570     14.3        0
## 6 continentEurope      17.5      0.62      28.0        0
## 7 continentOceania     18.1      1.78      10.2        0

“not in” %nin% is availabe via socviz. prefix_strip from socviz drops prefixes

#confidence interval
out_conf <- tidy(out, conf.int = TRUE)
out_conf <- subset(out_conf, term %nin% "(Intercept)")
out_conf$nicelabs <- prefix_strip(out_conf$term, "continent")

p <- ggplot(out_conf,
            mapping = aes(
              x = reorder(nicelabs, estimate),
              y = estimate,
              ymin = conf.low,
              ymax = conf.high
            ))
p + geom_pointrange() + coord_flip() + labs(x = "", y = "OLS Estimate")

Get observation-level statistics with augment()

out_aug <- augment(out)
p <- ggplot(data = out_aug, mapping = aes(x = .fitted, y = .resid))
p + geom_point()

### Get model-level statistics with glance() Broom is able to tidy (and augment, and glance at) a wide range of model types.

library(survival)

out_cph <- coxph(Surv(time, status) ~ age + sex, data = lung)
out_surv <- survfit(out_cph)
out_tidy <- tidy(out_surv)
p <- ggplot(data = out_tidy, mapping = aes(time, estimate))
p + geom_line() + geom_ribbon(mapping = aes(ymin = conf.low,
                                            ymax = conf.high),
                              alpha = 0.2)

Grouped Analysis

nest and unnest

out_le <- gapminder %>%
  group_by(continent, year) %>%
  nest()

fit_ols <- function(df) {
  lm(lifeExp ~ log(gdpPercap), data = df)
}

out_le <- gapminder %>%
  group_by(continent, year) %>%
  nest() %>%
  mutate(model = map(data, fit_ols))



out_tidy <- gapminder %>%
  group_by(continent, year) %>%
  nest() %>%
  mutate(model = map(data, fit_ols),
         tidied = map(model, tidy)) %>%
  unnest(tidied, .drop = TRUE) %>%
  filter(term %nin% "(Intercept)" &
           continent %nin% "Oceania")

## Warning: The `.drop` argument of `unnest()` is deprecated as of tidyr 1.0.0.
## All list-columns are now preserved.
## This warning is displayed once per session.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

p <- ggplot(
  data = out_tidy,
  mapping = aes(
    x = year,
    y = estimate,
    ymin = estimate - 2 * std.error,
    ymax = estimate + 2 * std.error,
    group = continent,
    color = continent
  )
)

p + geom_pointrange(position = position_dodge(width = 1)) +
  scale_x_continuous(breaks = unique(gapminder$year)) +
  theme(legend.position = "top") +
  labs(x = "Year", y = "Estimate", color = "Continent")

## Plot Marginal Effects

library(margins)
gss_sm$polviews_m <- relevel(gss_sm$polviews, ref = "Moderate")
out_bo <- glm(obama ~ polviews_m + sex * race,
              family = "binomial",
              data = gss_sm)
summary(out_bo)

## 
## Call:
## glm(formula = obama ~ polviews_m + sex * race, family = "binomial", 
##     data = gss_sm)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9045  -0.5541   0.1772   0.5418   2.2437  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       0.296493   0.134091   2.211  0.02703 *  
## polviews_mExtremely Liberal       2.372950   0.525045   4.520 6.20e-06 ***
## polviews_mLiberal                 2.600031   0.356666   7.290 3.10e-13 ***
## polviews_mSlightly Liberal        1.293172   0.248435   5.205 1.94e-07 ***
## polviews_mSlightly Conservative  -1.355277   0.181291  -7.476 7.68e-14 ***
## polviews_mConservative           -2.347463   0.200384 -11.715  < 2e-16 ***
## polviews_mExtremely Conservative -2.727384   0.387210  -7.044 1.87e-12 ***
## sexFemale                         0.254866   0.145370   1.753  0.07956 .  
## raceBlack                         3.849526   0.501319   7.679 1.61e-14 ***
## raceOther                        -0.002143   0.435763  -0.005  0.99608    
## sexFemale:raceBlack              -0.197506   0.660066  -0.299  0.76477    
## sexFemale:raceOther               1.574829   0.587657   2.680  0.00737 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2247.9  on 1697  degrees of freedom
## Residual deviance: 1345.9  on 1686  degrees of freedom
##   (1169 observations deleted due to missingness)
## AIC: 1369.9
## 
## Number of Fisher Scoring iterations: 6

bo_m <- margins(out_bo)
summary(bo_m)

##                            factor     AME     SE        z      p   lower
##            polviews_mConservative -0.4119 0.0283 -14.5394 0.0000 -0.4674
##  polviews_mExtremely Conservative -0.4538 0.0420 -10.7971 0.0000 -0.5361
##       polviews_mExtremely Liberal  0.2681 0.0295   9.0996 0.0000  0.2103
##                 polviews_mLiberal  0.2768 0.0229  12.0736 0.0000  0.2319
##   polviews_mSlightly Conservative -0.2658 0.0330  -8.0596 0.0000 -0.3304
##        polviews_mSlightly Liberal  0.1933 0.0303   6.3896 0.0000  0.1340
##                         raceBlack  0.4032 0.0173  23.3568 0.0000  0.3694
##                         raceOther  0.1247 0.0386   3.2297 0.0012  0.0490
##                         sexFemale  0.0443 0.0177   2.5073 0.0122  0.0097
##    upper
##  -0.3564
##  -0.3714
##   0.3258
##   0.3218
##  -0.2011
##   0.2526
##   0.4371
##   0.2005
##   0.0789

The margins library comes with several plot methods of its own. If you wish, at this point you can just try plot(bo_m) to see a plot of the average marginal effects, produced with the general look of a Stata graphic. Other plot methods in the margins library include cplot(), which visualizes marginal effects conditional on a second variable, and image(), which shows predictions or marginal effects as a filled heatmap or contour plot.

bo_gg <- as_tibble(summary(bo_m))
prefixes <- c("polviews_m", "sex")
bo_gg$factor <- prefix_strip(bo_gg$factor, prefixes)
bo_gg$factor <- prefix_replace(bo_gg$factor, "race", "Race: ")

bo_gg %>% select(factor, AME, lower, upper)

## # A tibble: 9 x 4
##   factor                     AME    lower   upper
##   <chr>                    <dbl>    <dbl>   <dbl>
## 1 Conservative           -0.412  -0.467   -0.356 
## 2 Extremely Conservative -0.454  -0.536   -0.371 
## 3 Extremely Liberal       0.268   0.210    0.326 
## 4 Liberal                 0.277   0.232    0.322 
## 5 Slightly Conservative  -0.266  -0.330   -0.201 
## 6 Slightly Liberal        0.193   0.134    0.253 
## 7 Race: Black             0.403   0.369    0.437 
## 8 Race: Other             0.125   0.0490   0.200 
## 9 Female                  0.0443  0.00967  0.0789

p <- ggplot(data = bo_gg, aes(
  x = reorder(factor, AME),
  y = AME,
  ymin = lower,
  ymax = upper
))

p + geom_hline(yintercept = 0, color = "gray80") +
  geom_pointrange() + coord_flip() +
  labs(x = NULL, y = "Average Marginal Effect")

pv_cp <- cplot(out_bo, x = "sex", draw = FALSE)

##    xvals     yvals     upper     lower
## 1   Male 0.5735849 0.6378653 0.5093045
## 2 Female 0.6344507 0.6887845 0.5801169

p <- ggplot(data = pv_cp, aes(
  x = reorder(xvals, yvals),
  y = yvals,
  ymin = lower,
  ymax = upper
))

p + geom_hline(yintercept = 0, color = "gray80") +
  geom_pointrange() + coord_flip() +
  labs(x = NULL, y = "Conditional Effect")

Plots for Surveys

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(srvyr)

## 
## Attaching package: 'srvyr'

## The following object is masked from 'package:stats':
## 
##     filter

options(survey.lonely.psu = "adjust")
options(na.action = "na.pass")

gss_wt <- subset(gss_lon, year > 1974) %>%
  mutate(stratvar = interaction(year, vstrat)) %>%
  as_survey_design(
    ids = vpsu,
    strata = stratvar,
    weights = wtssall,
    nest = TRUE
  )

out_grp <- gss_wt %>%
  filter(year %in% seq(1976, 2016, by = 4)) %>%
  group_by(year, race, degree) %>%
  summarize(prop = survey_mean(na.rm = TRUE)) # calculate  properly weighted survey means

## Warning: Factor `degree` contains implicit NA, consider using
## `forcats::fct_explicit_na`

out_mrg <- gss_wt %>%
  filter(year %in% seq(1976, 2016, by = 4)) %>%
  mutate(racedeg = interaction(race, degree)) %>%
  group_by(year, racedeg) %>%
  summarize(prop = survey_mean(na.rm = TRUE))

## Warning: Factor `racedeg` contains implicit NA, consider using
## `forcats::fct_explicit_na`

out_mrg <- gss_wt %>%  filter(year %in% seq(1976, 2016, by = 4)) %>%
  mutate(racedeg = interaction(race, degree)) %>% group_by(year,
                                                           racedeg) %>% 
  summarize(prop = survey_mean(na.rm = TRUE)) %>%
  separate(racedeg, sep = "\\.", into = c("race", "degree"))

## Warning: Factor `racedeg` contains implicit NA, consider using
## `forcats::fct_explicit_na`

p <- ggplot(
  data = subset(out_grp, race %nin% "Other"),
  mapping = aes(
    x = degree,
    y = prop,
    ymin = prop - 2 * prop_se,
    ymax = prop + 2 * prop_se,
    fill = race,
    color = race,
    group = race
  )
)

dodge <- position_dodge(width = 0.9)

p + geom_col(position = dodge, alpha = 0.2) +
  geom_errorbar(position = dodge, width = 0.2) +
  scale_x_discrete(labels = scales::wrap_format(10)) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_brewer(type = "qual", palette = "Dark2") +
  scale_fill_brewer(type = "qual", palette = "Dark2") +
  labs(
    title = "Educational Attainment by Race",
    subtitle = "GSS 1976-2016",
    fill = "Race",
    color = "Race",
    x = NULL,
    y = "Percent"
  ) +
  facet_wrap( ~ year, ncol = 2) +
  theme(legend.position = "top")

## Warning: Removed 13 rows containing missing values (geom_col).

## Warning: Removed 13 rows containing missing values (geom_errorbar).

p <- ggplot(
  data = subset(out_grp, race %nin% "Other"),
  mapping = aes(
    x = year,
    y = prop,
    ymin = prop - 2 * prop_se,
    ymax = prop + 2 * prop_se,
    fill = race,
    color = race,
    group = race
  )
)

p + geom_ribbon(alpha = 0.3, aes(color = NULL)) + #Use ribbon to show the error range
  geom_line() + #Use line to show a time trend
  facet_wrap( ~ degree, ncol = 1) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_brewer(type = "qual", palette = "Dark2") +
  scale_fill_brewer(type = "qual", palette = "Dark2") +
  labs(
    title = "Educational Attainment by Race",
    subtitle = "GSS 1976-2016",
    fill = "Race",
    color = "Race",
    x = NULL,
    y = "Percent"
  ) +
  theme(legend.position = "top")

## Warning: Removed 13 rows containing missing values (geom_path).

Other useful packages: infer, ggally

socviz

Yihong WANG

Wayfaring Stranger