Data Visualization Chapter 5

Last updated on Sep 26, 2019 9 min read R, Books

Chapter 5
- Use Pipes to Summerize Data
Continuous Variables by Group or Category
- Write and Draw in the Plot Area
- Scales, Guides, and Themes

Chapter 5

Use Pipes to Summerize Data

rel_by_region <- gss_sm %>%
  group_by(bigregion, religion) %>%
  summarize(N = n()) %>%
  mutate(freq = N / sum(N),
         pct = round((freq*100), 0))

## Warning: Factor `religion` contains implicit NA, consider using
## `forcats::fct_explicit_na`

rel_by_region

## # A tibble: 24 x 5
## # Groups:   bigregion [4]
##    bigregion religion       N    freq   pct
##    <fct>     <fct>      <int>   <dbl> <dbl>
##  1 Northeast Protestant   158 0.324      32
##  2 Northeast Catholic     162 0.332      33
##  3 Northeast Jewish        27 0.0553      6
##  4 Northeast None         112 0.230      23
##  5 Northeast Other         28 0.0574      6
##  6 Northeast <NA>           1 0.00205     0
##  7 Midwest   Protestant   325 0.468      47
##  8 Midwest   Catholic     172 0.247      25
##  9 Midwest   Jewish         3 0.00432     0
## 10 Midwest   None         157 0.226      23
## # … with 14 more rows

rel_by_region %>% group_by(bigregion) %>% summarize(total = sum(pct))

## # A tibble: 4 x 2
##   bigregion total
##   <fct>     <dbl>
## 1 Northeast   100
## 2 Midwest     101
## 3 South       100
## 4 West        101

p <- ggplot(rel_by_region, aes(x = bigregion, y = pct, fill = religion))
p + geom_col(position = "dodge2") +
  labs(x = "Region",y = "Percent", fill = "Religion") +
  theme(legend.position = "top")

Use coord_flip()

p <- ggplot(rel_by_region, aes(x = bigregion, y = pct, fill = religion))
p + geom_col(position = "dodge2") +
  labs(x = "Region",y = "Percent", fill = "Religion") +
  guides(fill = FALSE) + 
  coord_flip() + 
  facet_grid(~ bigregion)

p <- ggplot(rel_by_region, aes(x = religion, y = pct, fill = religion))
p + geom_col(position = "dodge2") +
  labs(x = NULL,y = "Percent", fill = "Religion") +
  guides(fill = FALSE) + 
  coord_flip() + 
  facet_grid(~ bigregion)

Continuous Variables by Group or Category

p <- ggplot(data = organdata, mapping = aes(x = year, y = donors))
p + geom_line(aes(group = country)) + facet_wrap(~country)

## Warning: Removed 34 rows containing missing values (geom_path).

p <- ggplot(data = organdata, mapping = aes(x = country, y = donors))
p + geom_boxplot() + coord_flip()

## Warning: Removed 34 rows containing non-finite values (stat_boxplot).

p <- ggplot(data = organdata, mapping = aes(x = reorder(country,
                                                        donors, na.rm = TRUE), y = donors))
p + geom_boxplot() + labs(x = NULL) + coord_flip()

## Warning: Removed 34 rows containing non-finite values (stat_boxplot).

p <- ggplot(data = organdata, mapping = aes(x = reorder(country, donors, na.rm = TRUE), 
                                            y = donors, fill = world))
p + geom_boxplot() + labs(x = NULL) + 
  coord_flip() + theme(legend.position = "top")

## Warning: Removed 34 rows containing non-finite values (stat_boxplot).

p <- ggplot(data = organdata, mapping = aes(x = reorder(country, donors, na.rm = TRUE), 
                                            y = donors, color = world))
p + geom_point() + labs(x = NULL) + 
  coord_flip() + theme(legend.position = "top")

## Warning: Removed 34 rows containing missing values (geom_point).

p <-
  ggplot(data = organdata,
         mapping = aes(
           x = reorder(country, donors, na.rm = TRUE),
           y = donors,
           color = world
         ))
p + geom_jitter() + labs(x = NULL) +
  coord_flip() + theme(legend.position = "top")

## Warning: Removed 34 rows containing missing values (geom_point).

p <-
  ggplot(data = organdata,
         mapping = aes(
           x = reorder(country, donors, na.rm = TRUE),
           y = donors,
           color = world
         ))
p + geom_jitter(position = position_jitter(width = 0.15)) + labs(x = NULL) +
  coord_flip() + theme(legend.position = "top")

## Warning: Removed 34 rows containing missing values (geom_point).

by_country <-
  organdata %>% group_by(consent_law, country) %>% summarize(
    donors_mean = mean(donors, na.rm = TRUE),
    donors_sd = sd(donors, na.rm = TRUE),
    gdp_mean = mean(gdp, na.rm = TRUE),
    health_mean = mean(health, na.rm = TRUE),
    roads_mean = mean(roads, na.rm = TRUE),
    cerebvas_mean = mean(cerebvas, na.rm = TRUE)
  )

by_country

## # A tibble: 17 x 8
## # Groups:   consent_law [2]
##    consent_law country donors_mean donors_sd gdp_mean health_mean
##    <chr>       <chr>         <dbl>     <dbl>    <dbl>       <dbl>
##  1 Informed    Austra…        10.6     1.14    22179.       1958.
##  2 Informed    Canada         14.0     0.751   23711.       2272.
##  3 Informed    Denmark        13.1     1.47    23722.       2054.
##  4 Informed    Germany        13.0     0.611   22163.       2349.
##  5 Informed    Ireland        19.8     2.48    20824.       1480.
##  6 Informed    Nether…        13.7     1.55    23013.       1993.
##  7 Informed    United…        13.5     0.775   21359.       1561.
##  8 Informed    United…        20.0     1.33    29212.       3988.
##  9 Presumed    Austria        23.5     2.42    23876.       1875.
## 10 Presumed    Belgium        21.9     1.94    22500.       1958.
## 11 Presumed    Finland        18.4     1.53    21019.       1615.
## 12 Presumed    France         16.8     1.60    22603.       2160.
## 13 Presumed    Italy          11.1     4.28    21554.       1757 
## 14 Presumed    Norway         15.4     1.11    26448.       2217.
## 15 Presumed    Spain          28.1     4.96    16933        1289.
## 16 Presumed    Sweden         13.1     1.75    22415.       1951.
## 17 Presumed    Switze…        14.2     1.71    27233        2776.
## # … with 2 more variables: roads_mean <dbl>, cerebvas_mean <dbl>

by_country <- organdata %>% group_by(consent_law, country) %>%
  summarize_if(is.numeric, lst(mean, sd), na.rm = TRUE) %>%
  ungroup()
by_country

## # A tibble: 17 x 28
##    consent_law country donors_mean pop_mean pop_dens_mean gdp_mean
##    <chr>       <chr>         <dbl>    <dbl>         <dbl>    <dbl>
##  1 Informed    Austra…        10.6   18318.         0.237   22179.
##  2 Informed    Canada         14.0   29608.         0.297   23711.
##  3 Informed    Denmark        13.1    5257.        12.2     23722.
##  4 Informed    Germany        13.0   80255.        22.5     22163.
##  5 Informed    Ireland        19.8    3674.         5.23    20824.
##  6 Informed    Nether…        13.7   15548.        37.4     23013.
##  7 Informed    United…        13.5   58187.        24.0     21359.
##  8 Informed    United…        20.0  269330.         2.80    29212.
##  9 Presumed    Austria        23.5    7927.         9.45    23876.
## 10 Presumed    Belgium        21.9   10153.        30.7     22500.
## 11 Presumed    Finland        18.4    5112.         1.51    21019.
## 12 Presumed    France         16.8   58056.        10.5     22603.
## 13 Presumed    Italy          11.1   57360.        19.0     21554.
## 14 Presumed    Norway         15.4    4386.         1.35    26448.
## 15 Presumed    Spain          28.1   39666.         7.84    16933 
## 16 Presumed    Sweden         13.1    8789.         1.95    22415.
## 17 Presumed    Switze…        14.2    7037.        17.0     27233 
## # … with 22 more variables: gdp_lag_mean <dbl>, health_mean <dbl>,
## #   health_lag_mean <dbl>, pubhealth_mean <dbl>, roads_mean <dbl>,
## #   cerebvas_mean <dbl>, assault_mean <dbl>, external_mean <dbl>,
## #   txp_pop_mean <dbl>, donors_sd <dbl>, pop_sd <dbl>, pop_dens_sd <dbl>,
## #   gdp_sd <dbl>, gdp_lag_sd <dbl>, health_sd <dbl>, health_lag_sd <dbl>,
## #   pubhealth_sd <dbl>, roads_sd <dbl>, cerebvas_sd <dbl>,
## #   assault_sd <dbl>, external_sd <dbl>, txp_pop_sd <dbl>

p <- ggplot(data = by_country,
            mapping = aes(
              x = donors_mean,
              y = reorder(country, donors_mean),
              color = consent_law
            ))
p + geom_point(size = 3) +
  labs(x = "Donor Procurement Rate",
       y = "", color = "Consent Law") +
  theme(legend.position = "top")

p <- ggplot(data = by_country,
            mapping = aes(x = donors_mean,
                          y = reorder(country, donors_mean)))

p + geom_point(size = 3) +
  facet_wrap( ~ consent_law, scales = "free_y", ncol = 1) +
  labs(x = "Donor Procurement Rate",
       y = "")

p <- ggplot(data = by_country,
            mapping = aes(x = reorder(country,
                                      donors_mean), y = donors_mean))

p + geom_pointrange(mapping = aes(ymin = donors_mean - donors_sd,
                                  ymax = donors_mean + donors_sd)) +
  labs(x = "", y = "Donor Procurement Rate") + coord_flip()

### Plot Text Directly

p <- ggplot(data = by_country,
            mapping = aes(x = roads_mean,
                          y = donors_mean))
p + geom_point() + geom_text(mapping = aes(label = country))

p <- ggplot(data = by_country,
            mapping = aes(x = roads_mean,
                          y = donors_mean))
p + geom_point() + geom_text(mapping = aes(label = country), hjust = 0)

ggrepel is better than geom_text()

library(ggrepel)

p_title <-
  "Presidential Elections: Popular & Electoral College Margins"
p_subtitle <- "1824-2016"
p_caption <- "Data for 2016 are provisional."
x_label <- "Winner's share of Popular Vote"
y_label <- "Winner's share of Electoral College Votes"

p <- ggplot(elections_historic,
            aes(x = popular_pct, y = ec_pct,
                label = winner_label))

p + geom_hline(yintercept = 0.5,
               size = 1.4,
               color = "gray80") +
  geom_vline(xintercept = 0.5,
             size = 1.4,
             color = "gray80") +
  geom_point() +
  geom_text_repel() +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = x_label,
    y = y_label,
    title = p_title,
    subtitle = p_subtitle,
    caption = p_caption
  )

### Label Outliers

p <- ggplot(data = by_country,
            mapping = aes(x = gdp_mean, y = health_mean))

p + geom_point() +
  geom_text_repel(data = subset(by_country, gdp_mean > 25000),
                  mapping = aes(label = country))

p <- ggplot(data = by_country,
            mapping = aes(x = gdp_mean, y = health_mean))

p + geom_point() +
  geom_text_repel(
    data = subset(
      by_country,
      gdp_mean > 25000 | health_mean < 1500 |
        country %in% "Belgium"
    ),
    mapping = aes(label = country)
  )

organdata$ind <- organdata$ccode %in% c("Ita", "Spa") &
  organdata$year > 1998

p <- ggplot(data = organdata,
            mapping = aes(x = roads,
                          y = donors, color = ind))
p + geom_point() +
  geom_text_repel(data = subset(organdata, ind),
                  mapping = aes(label = ccode)) +
  guides(label = FALSE, color = FALSE)

## Warning: Removed 34 rows containing missing values (geom_point).

Write and Draw in the Plot Area

p <- ggplot(data = organdata, mapping = aes(x = roads, y = donors))
p + geom_point() + annotate(
  geom = "text",
  x = 91,
  y = 33,
  label = "A surprisingly high \n recovery rate.",
  hjust = 0
)

## Warning: Removed 34 rows containing missing values (geom_point).

p <- ggplot(data = organdata,
            mapping = aes(x = roads, y = donors))
p + geom_point() +
  annotate(
    geom = "rect",
    xmin = 125,
    xmax = 155,
    ymin = 30,
    ymax = 35,
    fill = "red",
    alpha = 0.2
  ) +
  annotate(
    geom = "text",
    x = 157,
    y = 33,
    label = "A surprisingly high \n recovery rate.",
    hjust = 0
  )

## Warning: Removed 34 rows containing missing values (geom_point).

Scales, Guides, and Themes

p <- ggplot(data = organdata,
            mapping = aes(x = roads,
                          y = donors,
                          color = world))
p + geom_point()

## Warning: Removed 34 rows containing missing values (geom_point).

p <- ggplot(data = organdata,
            mapping = aes(x = roads,
                          y = donors,
                          color = world))
p + geom_point() + scale_x_log10() + scale_y_continuous(breaks = c(5,
                                                                   15, 25),
                                                        labels = c("Five", "Fifteen", "Twenty Five"))

## Warning: Removed 34 rows containing missing values (geom_point).

p <- ggplot(data = organdata,
            mapping = aes(x = roads, y = donors,
                          color = world))
p + geom_point() + scale_color_discrete(labels = c("Corporatist",
                                                   "Liberal", "Social Democratic", "Unclassified")) + 
  labs(x = "Road Deaths",
       y = "Donor Procurement", color = "Welfare State")

## Warning: Removed 34 rows containing missing values (geom_point).

p <- ggplot(data = organdata,
            mapping = aes(x = roads, y = donors,
                          color = world))
p + geom_point() + labs(x = "Road Deaths", y = "Donor Procurement") +
  guides(color = FALSE)

## Warning: Removed 34 rows containing missing values (geom_point).

socviz

Yihong WANG

Wayfaring Stranger

Data Visualization Chapter 5

Chapter 5

Use Pipes to Summerize Data

Continuous Variables by Group or Category

Write and Draw in the Plot Area

Scales, Guides, and Themes

Yihong WANG

Related