Hypothetical example of spurious fold changes in proportions in the vaginal microbiome

This doc develops a hypothetical example in which bias leads to a spurious differential abundance result. It is based on a real case of bias measured among three species of vaginally-associated bacteria.

Michael R. McLaren
2021-05-10

Changelog

Setup

set.seed(42)

knitr::opts_chunk$set(
  comment = "#>",
  collapse = TRUE,
  cache = FALSE,
  echo = TRUE
)

library(tidyverse)
library(here)
library(cowplot)
library(patchwork)
theme_set(theme_cowplot(12))

close_elts <- function(x) x / sum(x)

Simulate data

bias <- tibble(taxon = c("Lactobacillus", "Gardnerella", "Atopobium", "Spike-in"),
  efficiency = c(4.68, 0.16, 0.285, 1)) %>%
  mutate(across(efficiency, ~. / min(.)))
# efficiency associated with the targeted measurement of Lactobacillus
targeted_efficiency <- 2
a0 <- tribble(
  ~taxon, ~timepoint, ~abundance,
  "Lactobacillus", "T1", 5,
  "Lactobacillus", "T2", 0.5,
  "Gardnerella", "T1", 2,
  "Gardnerella", "T2", 8,
  "Atopobium", "T1", 3,
  "Atopobium", "T2", 1.5,
) %>%
  # Shrink abundance to have similar scale as proportions
  mutate(across(abundance, ~ . / max(.))) %>%
  left_join(bias, by = "taxon") %>%
  with_groups(timepoint, mutate,
    total_abundance = sum(abundance),
    proportion = close_elts(abundance),
    biased_abundance = efficiency * abundance,
    biased_proportion = close_elts(biased_abundance),
    biased_count = biased_proportion * 1e3,
    abundance_estimate_bulk = biased_proportion * total_abundance,
    #> abundance_estimate_spikein = biased_count / biased_count[taxon == "Spike-in"],
    abundance_estimate_targeted = biased_count * targeted_efficiency *
      abundance[taxon == "Lactobacillus"] / biased_count[taxon == "Lactobacillus"],
  )
a1 <- a0 %>%
  select(-total_abundance) %>%
  pivot_longer(-c(taxon, efficiency, timepoint), names_to = "type")
# data frame for plots
ptb <- a1 %>%
  filter(
    type %in% c("proportion", "biased_proportion"),
  ) %>%
  mutate(
    across(type, fct_relevel, "proportion", "biased_proportion"),
    across(type, fct_recode,
      "Actual" = "proportion",
      "Observed" = "biased_proportion"),
  )

Plots

Panels showing the error in measurement and in differential abundance

shared_layers <- list(
  geom_path(aes(group = taxon),
    arrow = grid::arrow(length = unit(0.15, "inches"))),
  geom_point(size = 2),
  scale_color_brewer(type = "qual", palette = 1, guide = "none"),
  labs(y = "Proportion", color = "Taxon"),
  scale_y_log10(),
  coord_cartesian(clip = "off"),
  # scale_y_log10(breaks = c(1e-2, 3e-2, 1e-1, 3e-1, 1)) +
  expand_limits(y = 1e-2),
  theme(plot.margin = unit(c(0, 1.3, 0, 0), units = "in"))
)
# How much to nudge the taxon labels and proportions
nudge.taxon <- 0.48
nudge.prop <- 0.24
# In future iterations, consider labelling the taxa in both facets
p.meas <- ptb %>%
  ggplot(aes(type, value, color = taxon)) +
  facet_wrap(~timepoint, nrow = 1, scales = "fixed",
    labeller = as_labeller(function(x) str_c("Time point ", x))
  ) +
  shared_layers +
  geom_text(data = ~filter(., type == "Actual"),
    aes(label = round(value, 2)), nudge_x = -nudge.prop) +
  geom_text(data = ~filter(., type == "Observed"),
    aes(label = round(value, 2)), nudge_x = nudge.prop) +
  geom_text(data = ~filter(., timepoint == "T2", type == "Observed"),
    aes(label = taxon), nudge_x = nudge.taxon, hjust = 0) +
  labs(
    x = "Type",
    title = "Measurement error at each time point"
  )
p.fc <- ptb %>%
  ggplot(aes(timepoint, value, color = taxon)) +
  facet_wrap(~type, nrow = 1, scales = "fixed") +
  shared_layers +
  geom_text(data = ~filter(., timepoint == "T1"),
    aes(label = round(value, 2)), nudge_x = -nudge.prop) +
  geom_text(data = ~filter(., timepoint == "T2"),
    aes(label = round(value, 2)), nudge_x = nudge.prop) +
  geom_text(data = ~filter(., timepoint == "T2", type == "Observed"),
    aes(label = taxon), nudge_x = nudge.taxon, hjust = 0) +
  labs(
    x = "Time point",
    title = "Actual and observed fold changes"
  )

Panel showing the efficiencies of individual taxa and the sample means

# First, compute the mean efficiency, then join with the taxon efficiencies in
# a table for plotting.
sme <- a0 %>%
  with_groups(timepoint, summarize, mean_efficiency = sum(proportion * efficiency))
sme0 <- sme %>%
  mutate(label = str_glue("mean ({timepoint})")) %>%
  select(label, efficiency = mean_efficiency)
sme1 <- sme %>%
  mutate(taxon = "Mean", type = "Mean") %>%
  select(taxon, efficiency = mean_efficiency, timepoint, type)
bias1 <- bias %>%
  filter(taxon != "Spike-in") %>%
  expand(nesting(taxon, efficiency), timepoint = c("T1", "T2")) %>%
  mutate(type = "Taxon")
lvls = c("Atopobium", "Gardnerella", "Lactobacillus", "Mean")
etb1 <- bind_rows(bias1, sme1) %>%
  mutate(across(taxon, factor, levels = lvls))

lvls = c("Gardnerella", "Atopobium", "Lactobacillus")
bias2 <- bias %>%
  filter(taxon != "Spike-in") %>%
  expand(nesting(taxon, efficiency), timepoint = c("T1", "T2")) %>%
  mutate(
    type = "Taxon",
    across(taxon, factor, levels = lvls),
    x = as.integer(taxon)
  ) %>%
  left_join(sme, by = "timepoint")

clrs <- c(RColorBrewer::brewer.pal(n = 3, "Accent"), rep("#585858", 2))
p.eff <- bias2 %>%
  ggplot(aes(x = x, y = efficiency, color = taxon)) +
  geom_point(size = 2) +
  geom_text(data = ~filter(., timepoint == "T2"),
    aes(label = taxon), x = 3.3, hjust = 0) +
  geom_segment(aes(xend = x, yend  = efficiency, y = mean_efficiency),
    arrow = grid::arrow(length = unit(0.15, "inches"))) +
  geom_segment(data = sme,
    aes(x = 1, xend = 3, y = mean_efficiency, yend = mean_efficiency),
    color = "#585858",
    inherit.aes = FALSE) +
  geom_text(data = sme0 %>% mutate(timepoint = "T2"),
    aes(label = label, y = efficiency), x = 3.3, hjust = 0,
    color = "#585858",
    inherit.aes = FALSE) +
  scale_color_manual(values = clrs) +
  labs(y = "Relative efficiency", x = NULL, color = "Taxon",
    title = "Taxonomic bias of protocol"
  ) +
  # Set the vertical span to match the other plots
  scale_y_log10(limits = c(1, 100) / 2,
    breaks = etb1$efficiency,
    labels = signif(etb1$efficiency, 2)
  ) +
  xlim(c(0.0, 4)) +
  facet_wrap(~timepoint, nrow = 1, scales = "fixed",
    labeller = as_labeller(function(x) str_c("Time point ", x))
  ) +
  coord_cartesian(clip = "off") +
  theme(
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "none",
    plot.margin = unit(c(0, 1.0, 0, 0), units = "in")
  )

Plot as a multi-panel figure

p.meas + p.eff + p.fc + plot_spacer() + 
  plot_layout(byrow = TRUE, ncol = 2, widths = c(1, 0.6)) +
  plot_annotation(tag_levels = "A")

Caption (copied from main article version 47ce39c): Taxonomic bias can distort differential abundance results even when it is consistent for each taxon across samples. Panel A shows the actual and observed proportions for hypothetical community samples from two time points, which differ in their relative abundance of three taxa. Panel B shows taxonomic bias in terms of the relative efficiencies of the three taxa against the mean efficiency of each sample; the difference between the taxon’s efficiency and the sample’s mean (vertical arrows) determines the fold error seen in Panel A. Panel C rearranges the plot from Panel A to show the actual and observed fold changes between time points. The efficiencies of individual taxa were estimated by @mclaren2019cons from mock community data from @brooks2015thet. The abundances are hypothetical but inspired by observations from the human vaginal microbiome; see main text.

Some ideas for improvement:

Session info

Click for session info
sessioninfo::session_info()
#> ─ Session info ──────────────────────────────────────────────────────────────────
#>  setting  value                       
#>  version  R version 4.1.0 (2021-05-18)
#>  os       Arch Linux                  
#>  system   x86_64, linux-gnu           
#>  ui       X11                         
#>  language (EN)                        
#>  collate  en_US.UTF-8                 
#>  ctype    en_US.UTF-8                 
#>  tz       America/New_York            
#>  date     2021-07-17                  
#> 
#> ─ Packages ──────────────────────────────────────────────────────────────────────
#>  package      * version date       lib source        
#>  assertthat     0.2.1   2019-03-21 [1] CRAN (R 4.0.0)
#>  backports      1.2.1   2020-12-09 [1] CRAN (R 4.0.3)
#>  base64enc      0.1-3   2015-07-28 [1] CRAN (R 4.0.0)
#>  bookdown       0.22    2021-04-22 [1] CRAN (R 4.1.0)
#>  broom          0.7.6   2021-04-05 [1] CRAN (R 4.0.5)
#>  bslib          0.2.5.1 2021-05-18 [1] CRAN (R 4.1.0)
#>  cellranger     1.1.0   2016-07-27 [1] CRAN (R 4.0.0)
#>  cli            3.0.0   2021-06-30 [1] CRAN (R 4.1.0)
#>  colorspace     2.0-2   2021-06-24 [1] CRAN (R 4.1.0)
#>  cowplot      * 1.1.1   2020-12-30 [1] CRAN (R 4.0.4)
#>  crayon         1.4.1   2021-02-08 [1] CRAN (R 4.0.4)
#>  DBI            1.1.1   2021-01-15 [1] CRAN (R 4.0.4)
#>  dbplyr         2.1.1   2021-04-06 [1] CRAN (R 4.0.5)
#>  digest         0.6.27  2020-10-24 [1] CRAN (R 4.0.3)
#>  distill        1.2     2021-01-13 [1] CRAN (R 4.1.0)
#>  downlit        0.2.1   2020-11-04 [1] CRAN (R 4.0.3)
#>  dplyr        * 1.0.7   2021-06-18 [1] CRAN (R 4.1.0)
#>  ellipsis       0.3.2   2021-04-29 [1] CRAN (R 4.1.0)
#>  evaluate       0.14    2019-05-28 [1] CRAN (R 4.0.0)
#>  fansi          0.5.0   2021-05-25 [1] CRAN (R 4.1.0)
#>  farver         2.1.0   2021-02-28 [1] CRAN (R 4.0.4)
#>  forcats      * 0.5.1   2021-01-27 [1] CRAN (R 4.0.4)
#>  fs             1.5.0   2020-07-31 [1] CRAN (R 4.0.2)
#>  generics       0.1.0   2020-10-31 [1] CRAN (R 4.0.3)
#>  ggplot2      * 3.3.5   2021-06-25 [1] CRAN (R 4.1.0)
#>  glue           1.4.2   2020-08-27 [1] CRAN (R 4.0.2)
#>  gtable         0.3.0   2019-03-25 [1] CRAN (R 4.0.0)
#>  haven          2.4.1   2021-04-23 [1] CRAN (R 4.1.0)
#>  here         * 1.0.1   2020-12-13 [1] CRAN (R 4.0.5)
#>  highr          0.9     2021-04-16 [1] CRAN (R 4.1.0)
#>  hms            1.1.0   2021-05-17 [1] CRAN (R 4.1.0)
#>  htmltools      0.5.1.1 2021-01-22 [1] CRAN (R 4.0.3)
#>  httr           1.4.2   2020-07-20 [1] CRAN (R 4.0.2)
#>  jquerylib      0.1.4   2021-04-26 [1] CRAN (R 4.1.0)
#>  jsonlite       1.7.2   2020-12-09 [1] CRAN (R 4.0.3)
#>  knitr          1.33    2021-04-24 [1] CRAN (R 4.1.0)
#>  labeling       0.4.2   2020-10-20 [1] CRAN (R 4.0.3)
#>  lifecycle      1.0.0   2021-02-15 [1] CRAN (R 4.0.4)
#>  lubridate      1.7.10  2021-02-26 [1] CRAN (R 4.0.4)
#>  magrittr       2.0.1   2020-11-17 [1] CRAN (R 4.0.3)
#>  modelr         0.1.8   2020-05-19 [1] CRAN (R 4.0.0)
#>  munsell        0.5.0   2018-06-12 [1] CRAN (R 4.0.0)
#>  nvimcom      * 0.9-102 2021-07-17 [1] local         
#>  patchwork    * 1.1.1   2020-12-17 [1] CRAN (R 4.0.3)
#>  pillar         1.6.1   2021-05-16 [1] CRAN (R 4.1.0)
#>  pkgconfig      2.0.3   2019-09-22 [1] CRAN (R 4.0.0)
#>  purrr        * 0.3.4   2020-04-17 [1] CRAN (R 4.0.0)
#>  R6             2.5.0   2020-10-28 [1] CRAN (R 4.0.3)
#>  RColorBrewer   1.1-2   2014-12-07 [1] CRAN (R 4.0.0)
#>  Rcpp           1.0.7   2021-07-07 [1] CRAN (R 4.1.0)
#>  readr        * 1.4.0   2020-10-05 [1] CRAN (R 4.0.3)
#>  readxl         1.3.1   2019-03-13 [1] CRAN (R 4.0.0)
#>  reprex         2.0.0   2021-04-02 [1] CRAN (R 4.0.5)
#>  rlang          0.4.11  2021-04-30 [1] CRAN (R 4.1.0)
#>  rmarkdown    * 2.8     2021-05-07 [1] CRAN (R 4.1.0)
#>  rprojroot      2.0.2   2020-11-15 [1] CRAN (R 4.0.3)
#>  rstudioapi     0.13    2020-11-12 [1] CRAN (R 4.0.3)
#>  rvest          1.0.0   2021-03-09 [1] CRAN (R 4.0.5)
#>  sass           0.4.0   2021-05-12 [1] CRAN (R 4.1.0)
#>  scales         1.1.1   2020-05-11 [1] CRAN (R 4.0.0)
#>  sessioninfo    1.1.1   2018-11-05 [1] CRAN (R 4.0.0)
#>  stringi        1.6.2   2021-05-17 [1] CRAN (R 4.1.0)
#>  stringr      * 1.4.0   2019-02-10 [1] CRAN (R 4.0.0)
#>  tibble       * 3.1.2   2021-05-16 [1] CRAN (R 4.1.0)
#>  tidyr        * 1.1.3   2021-03-03 [1] CRAN (R 4.0.4)
#>  tidyselect     1.1.1   2021-04-30 [1] CRAN (R 4.1.0)
#>  tidyverse    * 1.3.1   2021-04-15 [1] CRAN (R 4.1.0)
#>  utf8           1.2.1   2021-03-12 [1] CRAN (R 4.0.5)
#>  vctrs          0.3.8   2021-04-29 [1] CRAN (R 4.1.0)
#>  withr          2.4.2   2021-04-18 [1] CRAN (R 4.0.5)
#>  xfun           0.23    2021-05-15 [1] CRAN (R 4.1.0)
#>  xml2           1.3.2   2020-04-23 [1] CRAN (R 4.0.0)
#>  yaml           2.2.1   2020-02-01 [1] CRAN (R 4.0.0)
#> 
#> [1] /home/michael/.local/lib/R/library
#> [2] /usr/lib/R/library