This doc develops a hypothetical example in which bias leads to a spurious differential abundance result. It is based on a real case of bias measured among three species of vaginally-associated bacteria.
Changelog
bias <- tibble(taxon = c("Lactobacillus", "Gardnerella", "Atopobium", "Spike-in"),
efficiency = c(4.68, 0.16, 0.285, 1)) %>%
mutate(across(efficiency, ~. / min(.)))
# efficiency associated with the targeted measurement of Lactobacillus
targeted_efficiency <- 2
a0 <- tribble(
~taxon, ~timepoint, ~abundance,
"Lactobacillus", "T1", 5,
"Lactobacillus", "T2", 0.5,
"Gardnerella", "T1", 2,
"Gardnerella", "T2", 8,
"Atopobium", "T1", 3,
"Atopobium", "T2", 1.5,
) %>%
# Shrink abundance to have similar scale as proportions
mutate(across(abundance, ~ . / max(.))) %>%
left_join(bias, by = "taxon") %>%
with_groups(timepoint, mutate,
total_abundance = sum(abundance),
proportion = close_elts(abundance),
biased_abundance = efficiency * abundance,
biased_proportion = close_elts(biased_abundance),
biased_count = biased_proportion * 1e3,
abundance_estimate_bulk = biased_proportion * total_abundance,
#> abundance_estimate_spikein = biased_count / biased_count[taxon == "Spike-in"],
abundance_estimate_targeted = biased_count * targeted_efficiency *
abundance[taxon == "Lactobacillus"] / biased_count[taxon == "Lactobacillus"],
)
a1 <- a0 %>%
select(-total_abundance) %>%
pivot_longer(-c(taxon, efficiency, timepoint), names_to = "type")
# data frame for plots
ptb <- a1 %>%
filter(
type %in% c("proportion", "biased_proportion"),
) %>%
mutate(
across(type, fct_relevel, "proportion", "biased_proportion"),
across(type, fct_recode,
"Actual" = "proportion",
"Observed" = "biased_proportion"),
)
Panels showing the error in measurement and in differential abundance
shared_layers <- list(
geom_path(aes(group = taxon),
arrow = grid::arrow(length = unit(0.15, "inches"))),
geom_point(size = 2),
scale_color_brewer(type = "qual", palette = 1, guide = "none"),
labs(y = "Proportion", color = "Taxon"),
scale_y_log10(),
coord_cartesian(clip = "off"),
# scale_y_log10(breaks = c(1e-2, 3e-2, 1e-1, 3e-1, 1)) +
expand_limits(y = 1e-2),
theme(plot.margin = unit(c(0, 1.3, 0, 0), units = "in"))
)
# How much to nudge the taxon labels and proportions
nudge.taxon <- 0.48
nudge.prop <- 0.24
# In future iterations, consider labelling the taxa in both facets
p.meas <- ptb %>%
ggplot(aes(type, value, color = taxon)) +
facet_wrap(~timepoint, nrow = 1, scales = "fixed",
labeller = as_labeller(function(x) str_c("Time point ", x))
) +
shared_layers +
geom_text(data = ~filter(., type == "Actual"),
aes(label = round(value, 2)), nudge_x = -nudge.prop) +
geom_text(data = ~filter(., type == "Observed"),
aes(label = round(value, 2)), nudge_x = nudge.prop) +
geom_text(data = ~filter(., timepoint == "T2", type == "Observed"),
aes(label = taxon), nudge_x = nudge.taxon, hjust = 0) +
labs(
x = "Type",
title = "Measurement error at each time point"
)
p.fc <- ptb %>%
ggplot(aes(timepoint, value, color = taxon)) +
facet_wrap(~type, nrow = 1, scales = "fixed") +
shared_layers +
geom_text(data = ~filter(., timepoint == "T1"),
aes(label = round(value, 2)), nudge_x = -nudge.prop) +
geom_text(data = ~filter(., timepoint == "T2"),
aes(label = round(value, 2)), nudge_x = nudge.prop) +
geom_text(data = ~filter(., timepoint == "T2", type == "Observed"),
aes(label = taxon), nudge_x = nudge.taxon, hjust = 0) +
labs(
x = "Time point",
title = "Actual and observed fold changes"
)
Panel showing the efficiencies of individual taxa and the sample means
# First, compute the mean efficiency, then join with the taxon efficiencies in
# a table for plotting.
sme <- a0 %>%
with_groups(timepoint, summarize, mean_efficiency = sum(proportion * efficiency))
sme0 <- sme %>%
mutate(label = str_glue("mean ({timepoint})")) %>%
select(label, efficiency = mean_efficiency)
sme1 <- sme %>%
mutate(taxon = "Mean", type = "Mean") %>%
select(taxon, efficiency = mean_efficiency, timepoint, type)
bias1 <- bias %>%
filter(taxon != "Spike-in") %>%
expand(nesting(taxon, efficiency), timepoint = c("T1", "T2")) %>%
mutate(type = "Taxon")
lvls = c("Atopobium", "Gardnerella", "Lactobacillus", "Mean")
etb1 <- bind_rows(bias1, sme1) %>%
mutate(across(taxon, factor, levels = lvls))
lvls = c("Gardnerella", "Atopobium", "Lactobacillus")
bias2 <- bias %>%
filter(taxon != "Spike-in") %>%
expand(nesting(taxon, efficiency), timepoint = c("T1", "T2")) %>%
mutate(
type = "Taxon",
across(taxon, factor, levels = lvls),
x = as.integer(taxon)
) %>%
left_join(sme, by = "timepoint")
clrs <- c(RColorBrewer::brewer.pal(n = 3, "Accent"), rep("#585858", 2))
p.eff <- bias2 %>%
ggplot(aes(x = x, y = efficiency, color = taxon)) +
geom_point(size = 2) +
geom_text(data = ~filter(., timepoint == "T2"),
aes(label = taxon), x = 3.3, hjust = 0) +
geom_segment(aes(xend = x, yend = efficiency, y = mean_efficiency),
arrow = grid::arrow(length = unit(0.15, "inches"))) +
geom_segment(data = sme,
aes(x = 1, xend = 3, y = mean_efficiency, yend = mean_efficiency),
color = "#585858",
inherit.aes = FALSE) +
geom_text(data = sme0 %>% mutate(timepoint = "T2"),
aes(label = label, y = efficiency), x = 3.3, hjust = 0,
color = "#585858",
inherit.aes = FALSE) +
scale_color_manual(values = clrs) +
labs(y = "Relative efficiency", x = NULL, color = "Taxon",
title = "Taxonomic bias of protocol"
) +
# Set the vertical span to match the other plots
scale_y_log10(limits = c(1, 100) / 2,
breaks = etb1$efficiency,
labels = signif(etb1$efficiency, 2)
) +
xlim(c(0.0, 4)) +
facet_wrap(~timepoint, nrow = 1, scales = "fixed",
labeller = as_labeller(function(x) str_c("Time point ", x))
) +
coord_cartesian(clip = "off") +
theme(
axis.line.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "none",
plot.margin = unit(c(0, 1.0, 0, 0), units = "in")
)
Plot as a multi-panel figure
p.meas + p.eff + p.fc + plot_spacer() +
plot_layout(byrow = TRUE, ncol = 2, widths = c(1, 0.6)) +
plot_annotation(tag_levels = "A")
Caption (copied from main article version 47ce39c): Taxonomic bias can distort differential abundance results even when it is consistent for each taxon across samples. Panel A shows the actual and observed proportions for hypothetical community samples from two time points, which differ in their relative abundance of three taxa. Panel B shows taxonomic bias in terms of the relative efficiencies of the three taxa against the mean efficiency of each sample; the difference between the taxon’s efficiency and the sample’s mean (vertical arrows) determines the fold error seen in Panel A. Panel C rearranges the plot from Panel A to show the actual and observed fold changes between time points. The efficiencies of individual taxa were estimated by @mclaren2019cons from mock community data from @brooks2015thet. The abundances are hypothetical but inspired by observations from the human vaginal microbiome; see main text.
Some ideas for improvement:
sessioninfo::session_info()
#> ─ Session info ──────────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.1.0 (2021-05-18)
#> os Arch Linux
#> system x86_64, linux-gnu
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz America/New_York
#> date 2021-07-17
#>
#> ─ Packages ──────────────────────────────────────────────────────────────────────
#> package * version date lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.0)
#> backports 1.2.1 2020-12-09 [1] CRAN (R 4.0.3)
#> base64enc 0.1-3 2015-07-28 [1] CRAN (R 4.0.0)
#> bookdown 0.22 2021-04-22 [1] CRAN (R 4.1.0)
#> broom 0.7.6 2021-04-05 [1] CRAN (R 4.0.5)
#> bslib 0.2.5.1 2021-05-18 [1] CRAN (R 4.1.0)
#> cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.0.0)
#> cli 3.0.0 2021-06-30 [1] CRAN (R 4.1.0)
#> colorspace 2.0-2 2021-06-24 [1] CRAN (R 4.1.0)
#> cowplot * 1.1.1 2020-12-30 [1] CRAN (R 4.0.4)
#> crayon 1.4.1 2021-02-08 [1] CRAN (R 4.0.4)
#> DBI 1.1.1 2021-01-15 [1] CRAN (R 4.0.4)
#> dbplyr 2.1.1 2021-04-06 [1] CRAN (R 4.0.5)
#> digest 0.6.27 2020-10-24 [1] CRAN (R 4.0.3)
#> distill 1.2 2021-01-13 [1] CRAN (R 4.1.0)
#> downlit 0.2.1 2020-11-04 [1] CRAN (R 4.0.3)
#> dplyr * 1.0.7 2021-06-18 [1] CRAN (R 4.1.0)
#> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.1.0)
#> evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.0)
#> fansi 0.5.0 2021-05-25 [1] CRAN (R 4.1.0)
#> farver 2.1.0 2021-02-28 [1] CRAN (R 4.0.4)
#> forcats * 0.5.1 2021-01-27 [1] CRAN (R 4.0.4)
#> fs 1.5.0 2020-07-31 [1] CRAN (R 4.0.2)
#> generics 0.1.0 2020-10-31 [1] CRAN (R 4.0.3)
#> ggplot2 * 3.3.5 2021-06-25 [1] CRAN (R 4.1.0)
#> glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.2)
#> gtable 0.3.0 2019-03-25 [1] CRAN (R 4.0.0)
#> haven 2.4.1 2021-04-23 [1] CRAN (R 4.1.0)
#> here * 1.0.1 2020-12-13 [1] CRAN (R 4.0.5)
#> highr 0.9 2021-04-16 [1] CRAN (R 4.1.0)
#> hms 1.1.0 2021-05-17 [1] CRAN (R 4.1.0)
#> htmltools 0.5.1.1 2021-01-22 [1] CRAN (R 4.0.3)
#> httr 1.4.2 2020-07-20 [1] CRAN (R 4.0.2)
#> jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.1.0)
#> jsonlite 1.7.2 2020-12-09 [1] CRAN (R 4.0.3)
#> knitr 1.33 2021-04-24 [1] CRAN (R 4.1.0)
#> labeling 0.4.2 2020-10-20 [1] CRAN (R 4.0.3)
#> lifecycle 1.0.0 2021-02-15 [1] CRAN (R 4.0.4)
#> lubridate 1.7.10 2021-02-26 [1] CRAN (R 4.0.4)
#> magrittr 2.0.1 2020-11-17 [1] CRAN (R 4.0.3)
#> modelr 0.1.8 2020-05-19 [1] CRAN (R 4.0.0)
#> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.0.0)
#> nvimcom * 0.9-102 2021-07-17 [1] local
#> patchwork * 1.1.1 2020-12-17 [1] CRAN (R 4.0.3)
#> pillar 1.6.1 2021-05-16 [1] CRAN (R 4.1.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.0)
#> purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.0.0)
#> R6 2.5.0 2020-10-28 [1] CRAN (R 4.0.3)
#> RColorBrewer 1.1-2 2014-12-07 [1] CRAN (R 4.0.0)
#> Rcpp 1.0.7 2021-07-07 [1] CRAN (R 4.1.0)
#> readr * 1.4.0 2020-10-05 [1] CRAN (R 4.0.3)
#> readxl 1.3.1 2019-03-13 [1] CRAN (R 4.0.0)
#> reprex 2.0.0 2021-04-02 [1] CRAN (R 4.0.5)
#> rlang 0.4.11 2021-04-30 [1] CRAN (R 4.1.0)
#> rmarkdown * 2.8 2021-05-07 [1] CRAN (R 4.1.0)
#> rprojroot 2.0.2 2020-11-15 [1] CRAN (R 4.0.3)
#> rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.0.3)
#> rvest 1.0.0 2021-03-09 [1] CRAN (R 4.0.5)
#> sass 0.4.0 2021-05-12 [1] CRAN (R 4.1.0)
#> scales 1.1.1 2020-05-11 [1] CRAN (R 4.0.0)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.0)
#> stringi 1.6.2 2021-05-17 [1] CRAN (R 4.1.0)
#> stringr * 1.4.0 2019-02-10 [1] CRAN (R 4.0.0)
#> tibble * 3.1.2 2021-05-16 [1] CRAN (R 4.1.0)
#> tidyr * 1.1.3 2021-03-03 [1] CRAN (R 4.0.4)
#> tidyselect 1.1.1 2021-04-30 [1] CRAN (R 4.1.0)
#> tidyverse * 1.3.1 2021-04-15 [1] CRAN (R 4.1.0)
#> utf8 1.2.1 2021-03-12 [1] CRAN (R 4.0.5)
#> vctrs 0.3.8 2021-04-29 [1] CRAN (R 4.1.0)
#> withr 2.4.2 2021-04-18 [1] CRAN (R 4.0.5)
#> xfun 0.23 2021-05-15 [1] CRAN (R 4.1.0)
#> xml2 1.3.2 2020-04-23 [1] CRAN (R 4.0.0)
#> yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.0)
#>
#> [1] /home/michael/.local/lib/R/library
#> [2] /usr/lib/R/library