Setup

set.seed(42)

knitr::opts_chunk$set(
  comment = "#>",
  collapse = TRUE,
  cache = FALSE,
  echo = TRUE
)

library(tidyverse)
library(here)
library(cowplot)
library(patchwork)
theme_set(theme_cowplot(12))

close_elts <- function(x) x / sum(x)

Simulate data

bias <- tibble(taxon = c("Lactobacillus", "Gardnerella", "Atopobium", "Spike-in"),
  efficiency = c(4.68, 0.16, 0.285, 1)) %>%
  mutate(across(efficiency, ~. / min(.)))
# efficiency associated with the targeted measurement of Lactobacillus
targeted_efficiency <- 2
a0 <- tribble(
  ~taxon, ~timepoint, ~abundance,
  "Lactobacillus", "T1", 5,
  "Lactobacillus", "T2", 0.5,
  "Gardnerella", "T1", 2,
  "Gardnerella", "T2", 8,
  "Atopobium", "T1", 3,
  "Atopobium", "T2", 1.5,
) %>%
  # Shrink abundance to have similar scale as proportions
  mutate(across(abundance, ~ . / max(.))) %>%
  left_join(bias, by = "taxon") %>%
  with_groups(timepoint, mutate,
    total_abundance = sum(abundance),
    proportion = close_elts(abundance),
    biased_abundance = efficiency * abundance,
    biased_proportion = close_elts(biased_abundance),
    biased_count = biased_proportion * 1e3,
    abundance_estimate_bulk = biased_proportion * total_abundance,
    #> abundance_estimate_spikein = biased_count / biased_count[taxon == "Spike-in"],
    abundance_estimate_targeted = biased_count * targeted_efficiency *
      abundance[taxon == "Lactobacillus"] / biased_count[taxon == "Lactobacillus"],
  )
a1 <- a0 %>%
  select(-total_abundance) %>%
  pivot_longer(-c(taxon, efficiency, timepoint), names_to = "type")
# data frame for plots
ptb <- a1 %>%
  filter(
    type %in% c("proportion", "biased_proportion"),
  ) %>%
  mutate(
    across(type, fct_relevel, "proportion", "biased_proportion"),
    across(type, fct_recode,
      "Actual" = "proportion",
      "Observed" = "biased_proportion"),
  )

Plots

Panels showing the error in measurement and in differential abundance

shared_layers <- list(
  geom_path(aes(group = taxon),
    arrow = grid::arrow(length = unit(0.15, "inches"))),
  geom_point(size = 2),
  scale_color_brewer(type = "qual", palette = 1, guide = "none"),
  labs(y = "Proportion", color = "Taxon"),
  scale_y_log10(),
  coord_cartesian(clip = "off"),
  # scale_y_log10(breaks = c(1e-2, 3e-2, 1e-1, 3e-1, 1)) +
  expand_limits(y = 1e-2),
  theme(plot.margin = unit(c(0, 1.3, 0, 0), units = "in"))
)
# How much to nudge the taxon labels and proportions
nudge.taxon <- 0.48
nudge.prop <- 0.24
# In future iterations, consider labelling the taxa in both facets
p.meas <- ptb %>%
  ggplot(aes(type, value, color = taxon)) +
  facet_wrap(~timepoint, nrow = 1, scales = "fixed",
    labeller = as_labeller(function(x) str_c("Time point ", x))
  ) +
  shared_layers +
  geom_text(data = ~filter(., type == "Actual"),
    aes(label = round(value, 2)), nudge_x = -nudge.prop) +
  geom_text(data = ~filter(., type == "Observed"),
    aes(label = round(value, 2)), nudge_x = nudge.prop) +
  geom_text(data = ~filter(., timepoint == "T2", type == "Observed"),
    aes(label = taxon), nudge_x = nudge.taxon, hjust = 0) +
  labs(
    x = "Type",
    title = "Measurement error at each time point"
  )
p.fc <- ptb %>%
  ggplot(aes(timepoint, value, color = taxon)) +
  facet_wrap(~type, nrow = 1, scales = "fixed") +
  shared_layers +
  geom_text(data = ~filter(., timepoint == "T1"),
    aes(label = round(value, 2)), nudge_x = -nudge.prop) +
  geom_text(data = ~filter(., timepoint == "T2"),
    aes(label = round(value, 2)), nudge_x = nudge.prop) +
  geom_text(data = ~filter(., timepoint == "T2", type == "Observed"),
    aes(label = taxon), nudge_x = nudge.taxon, hjust = 0) +
  labs(
    x = "Time point",
    title = "Actual and observed fold changes"
  )

Panel showing the efficiencies of individual taxa and the sample means

# First, compute the mean efficiency, then join with the taxon efficiencies in
# a table for plotting.
sme <- a0 %>%
  with_groups(timepoint, summarize, mean_efficiency = sum(proportion * efficiency))
sme0 <- sme %>%
  mutate(label = str_glue("mean ({timepoint})")) %>%
  select(label, efficiency = mean_efficiency)
sme1 <- sme %>%
  mutate(taxon = "Mean", type = "Mean") %>%
  select(taxon, efficiency = mean_efficiency, timepoint, type)
bias1 <- bias %>%
  filter(taxon != "Spike-in") %>%
  expand(nesting(taxon, efficiency), timepoint = c("T1", "T2")) %>%
  mutate(type = "Taxon")
lvls = c("Atopobium", "Gardnerella", "Lactobacillus", "Mean")
etb1 <- bind_rows(bias1, sme1) %>%
  mutate(across(taxon, factor, levels = lvls))

lvls = c("Gardnerella", "Atopobium", "Lactobacillus")
bias2 <- bias %>%
  filter(taxon != "Spike-in") %>%
  expand(nesting(taxon, efficiency), timepoint = c("T1", "T2")) %>%
  mutate(
    type = "Taxon",
    across(taxon, factor, levels = lvls),
    x = as.integer(taxon)
  ) %>%
  left_join(sme, by = "timepoint")

clrs <- c(RColorBrewer::brewer.pal(n = 3, "Accent"), rep("#585858", 2))
p.eff <- bias2 %>%
  ggplot(aes(x = x, y = efficiency, color = taxon)) +
  geom_point(size = 2) +
  geom_text(data = ~filter(., timepoint == "T2"),
    aes(label = taxon), x = 3.3, hjust = 0) +
  geom_segment(aes(xend = x, yend  = efficiency, y = mean_efficiency),
    arrow = grid::arrow(length = unit(0.15, "inches"))) +
  geom_segment(data = sme,
    aes(x = 1, xend = 3, y = mean_efficiency, yend = mean_efficiency),
    color = "#585858",
    inherit.aes = FALSE) +
  geom_text(data = sme0 %>% mutate(timepoint = "T2"),
    aes(label = label, y = efficiency), x = 3.3, hjust = 0,
    color = "#585858",
    inherit.aes = FALSE) +
  scale_color_manual(values = clrs) +
  labs(y = "Relative efficiency", x = NULL, color = "Taxon",
    title = "Taxonomic bias of protocol"
  ) +
  # Set the vertical span to match the other plots
  scale_y_log10(limits = c(1, 100) / 2,
    breaks = etb1$efficiency,
    labels = signif(etb1$efficiency, 2)
  ) +
  xlim(c(0.0, 4)) +
  facet_wrap(~timepoint, nrow = 1, scales = "fixed",
    labeller = as_labeller(function(x) str_c("Time point ", x))
  ) +
  coord_cartesian(clip = "off") +
  theme(
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "none",
    plot.margin = unit(c(0, 1.0, 0, 0), units = "in")
  )

Plot as a multi-panel figure

p.meas + p.eff + p.fc + plot_spacer() + 
  plot_layout(byrow = TRUE, ncol = 2, widths = c(1, 0.6)) +
  plot_annotation(tag_levels = "A")

Caption (copied from main article version 47ce39c): Taxonomic bias can distort differential abundance results even when it is consistent for each taxon across samples. Panel A shows the actual and observed proportions for hypothetical community samples from two time points, which differ in their relative abundance of three taxa. Panel B shows taxonomic bias in terms of the relative efficiencies of the three taxa against the mean efficiency of each sample; the difference between the taxon’s efficiency and the sample’s mean (vertical arrows) determines the fold error seen in Panel A. Panel C rearranges the plot from Panel A to show the actual and observed fold changes between time points. The efficiencies of individual taxa were estimated by @mclaren2019cons from mock community data from @brooks2015thet. The abundances are hypothetical but inspired by observations from the human vaginal microbiome; see main text.

Some ideas for improvement:

Experiment with ways to make the connections even clearer between the efficiency differences and the error.
Try making the arrows vertical, so that they are directly connected to the fold changes in proportion and in efficiency.
Can also split the third plot into T1 and T2.
Might consider changing “Time point T1” to “Sample S1”, to fit the primary language of the text.
Can mention in the text that these could be samples from different women, or the same woman at different times.
also use a shape scale for the taxa, to give a second way besides color to distinguish them

Session info

Click for session info

sessioninfo::session_info()

#> ─ Session info ──────────────────────────────────────────────────────────────────
#>  setting  value                       
#>  version  R version 4.1.0 (2021-05-18)
#>  os       Arch Linux                  
#>  system   x86_64, linux-gnu           
#>  ui       X11                         
#>  language (EN)                        
#>  collate  en_US.UTF-8                 
#>  ctype    en_US.UTF-8                 
#>  tz       America/New_York            
#>  date     2021-07-17                  
#> 
#> ─ Packages ──────────────────────────────────────────────────────────────────────
#>  package      * version date       lib source        
#>  assertthat     0.2.1   2019-03-21 [1] CRAN (R 4.0.0)
#>  backports      1.2.1   2020-12-09 [1] CRAN (R 4.0.3)
#>  base64enc      0.1-3   2015-07-28 [1] CRAN (R 4.0.0)
#>  bookdown       0.22    2021-04-22 [1] CRAN (R 4.1.0)
#>  broom          0.7.6   2021-04-05 [1] CRAN (R 4.0.5)
#>  bslib          0.2.5.1 2021-05-18 [1] CRAN (R 4.1.0)
#>  cellranger     1.1.0   2016-07-27 [1] CRAN (R 4.0.0)
#>  cli            3.0.0   2021-06-30 [1] CRAN (R 4.1.0)
#>  colorspace     2.0-2   2021-06-24 [1] CRAN (R 4.1.0)
#>  cowplot      * 1.1.1   2020-12-30 [1] CRAN (R 4.0.4)
#>  crayon         1.4.1   2021-02-08 [1] CRAN (R 4.0.4)
#>  DBI            1.1.1   2021-01-15 [1] CRAN (R 4.0.4)
#>  dbplyr         2.1.1   2021-04-06 [1] CRAN (R 4.0.5)
#>  digest         0.6.27  2020-10-24 [1] CRAN (R 4.0.3)
#>  distill        1.2     2021-01-13 [1] CRAN (R 4.1.0)
#>  downlit        0.2.1   2020-11-04 [1] CRAN (R 4.0.3)
#>  dplyr        * 1.0.7   2021-06-18 [1] CRAN (R 4.1.0)
#>  ellipsis       0.3.2   2021-04-29 [1] CRAN (R 4.1.0)
#>  evaluate       0.14    2019-05-28 [1] CRAN (R 4.0.0)
#>  fansi          0.5.0   2021-05-25 [1] CRAN (R 4.1.0)
#>  farver         2.1.0   2021-02-28 [1] CRAN (R 4.0.4)
#>  forcats      * 0.5.1   2021-01-27 [1] CRAN (R 4.0.4)
#>  fs             1.5.0   2020-07-31 [1] CRAN (R 4.0.2)
#>  generics       0.1.0   2020-10-31 [1] CRAN (R 4.0.3)
#>  ggplot2      * 3.3.5   2021-06-25 [1] CRAN (R 4.1.0)
#>  glue           1.4.2   2020-08-27 [1] CRAN (R 4.0.2)
#>  gtable         0.3.0   2019-03-25 [1] CRAN (R 4.0.0)
#>  haven          2.4.1   2021-04-23 [1] CRAN (R 4.1.0)
#>  here         * 1.0.1   2020-12-13 [1] CRAN (R 4.0.5)
#>  highr          0.9     2021-04-16 [1] CRAN (R 4.1.0)
#>  hms            1.1.0   2021-05-17 [1] CRAN (R 4.1.0)
#>  htmltools      0.5.1.1 2021-01-22 [1] CRAN (R 4.0.3)
#>  httr           1.4.2   2020-07-20 [1] CRAN (R 4.0.2)
#>  jquerylib      0.1.4   2021-04-26 [1] CRAN (R 4.1.0)
#>  jsonlite       1.7.2   2020-12-09 [1] CRAN (R 4.0.3)
#>  knitr          1.33    2021-04-24 [1] CRAN (R 4.1.0)
#>  labeling       0.4.2   2020-10-20 [1] CRAN (R 4.0.3)
#>  lifecycle      1.0.0   2021-02-15 [1] CRAN (R 4.0.4)
#>  lubridate      1.7.10  2021-02-26 [1] CRAN (R 4.0.4)
#>  magrittr       2.0.1   2020-11-17 [1] CRAN (R 4.0.3)
#>  modelr         0.1.8   2020-05-19 [1] CRAN (R 4.0.0)
#>  munsell        0.5.0   2018-06-12 [1] CRAN (R 4.0.0)
#>  nvimcom      * 0.9-102 2021-07-17 [1] local         
#>  patchwork    * 1.1.1   2020-12-17 [1] CRAN (R 4.0.3)
#>  pillar         1.6.1   2021-05-16 [1] CRAN (R 4.1.0)
#>  pkgconfig      2.0.3   2019-09-22 [1] CRAN (R 4.0.0)
#>  purrr        * 0.3.4   2020-04-17 [1] CRAN (R 4.0.0)
#>  R6             2.5.0   2020-10-28 [1] CRAN (R 4.0.3)
#>  RColorBrewer   1.1-2   2014-12-07 [1] CRAN (R 4.0.0)
#>  Rcpp           1.0.7   2021-07-07 [1] CRAN (R 4.1.0)
#>  readr        * 1.4.0   2020-10-05 [1] CRAN (R 4.0.3)
#>  readxl         1.3.1   2019-03-13 [1] CRAN (R 4.0.0)
#>  reprex         2.0.0   2021-04-02 [1] CRAN (R 4.0.5)
#>  rlang          0.4.11  2021-04-30 [1] CRAN (R 4.1.0)
#>  rmarkdown    * 2.8     2021-05-07 [1] CRAN (R 4.1.0)
#>  rprojroot      2.0.2   2020-11-15 [1] CRAN (R 4.0.3)
#>  rstudioapi     0.13    2020-11-12 [1] CRAN (R 4.0.3)
#>  rvest          1.0.0   2021-03-09 [1] CRAN (R 4.0.5)
#>  sass           0.4.0   2021-05-12 [1] CRAN (R 4.1.0)
#>  scales         1.1.1   2020-05-11 [1] CRAN (R 4.0.0)
#>  sessioninfo    1.1.1   2018-11-05 [1] CRAN (R 4.0.0)
#>  stringi        1.6.2   2021-05-17 [1] CRAN (R 4.1.0)
#>  stringr      * 1.4.0   2019-02-10 [1] CRAN (R 4.0.0)
#>  tibble       * 3.1.2   2021-05-16 [1] CRAN (R 4.1.0)
#>  tidyr        * 1.1.3   2021-03-03 [1] CRAN (R 4.0.4)
#>  tidyselect     1.1.1   2021-04-30 [1] CRAN (R 4.1.0)
#>  tidyverse    * 1.3.1   2021-04-15 [1] CRAN (R 4.1.0)
#>  utf8           1.2.1   2021-03-12 [1] CRAN (R 4.0.5)
#>  vctrs          0.3.8   2021-04-29 [1] CRAN (R 4.1.0)
#>  withr          2.4.2   2021-04-18 [1] CRAN (R 4.0.5)
#>  xfun           0.23    2021-05-15 [1] CRAN (R 4.1.0)
#>  xml2           1.3.2   2020-04-23 [1] CRAN (R 4.0.0)
#>  yaml           2.2.1   2020-02-01 [1] CRAN (R 4.0.0)
#> 
#> [1] /home/michael/.local/lib/R/library
#> [2] /usr/lib/R/library

Hypothetical example of spurious fold changes in proportions in the vaginal microbiome

Setup

Simulate data

Plots

Session info