Illustrate the basic problem

ref:brooks2015thet ref:leopold2020host
Michael R. McLaren true
2021-10-27

R setup

# Tools for microbiome data
library(speedyseq)
# Tools for general purpose data manipulation and plotting
library(tidyverse)
library(fs)
# ggplot helpers
library(ggbeeswarm)
library(cowplot)
library(patchwork)
library(scales)
theme_set(theme_cowplot())
# stats helpers
# library(broom)

library(metacal); packageVersion("metacal")
[1] '0.2.0.9005'

Brooks et al

Setup

colors_brooks <- c(
  "Atopobium_vaginae" = "#009E73",
  "Gardnerella_vaginalis" = "#56B4E9",
  "Lactobacillus_crispatus" = "#D55E00",
  "Lactobacillus_iners" = "#505050",
  "Prevotella_bivia" = "#0072B2",
  "Sneathia_amnii" = "#CC79A7",
  "Streptococcus_agalactiae" = "#E69F00")

scale_y_custom <- scale_y_continuous(
    trans = 'log10',
    breaks = trans_breaks('log10', function(x) 10^x),
    labels = trans_format('log10', math_format(10^.x))
  )

Load data from the cellular mock communities of Brooks et al 2015 from metacal,

dr <- system.file("extdata", package = "metacal")
list.files(dr)
[1] "brooks2015-actual.csv"      "brooks2015-observed.csv"   
[3] "brooks2015-sample-data.csv"
actual <- file.path(dr, "brooks2015-actual.csv") |>
  read.csv(row.names = "Sample") |>
  otu_table(taxa_are_rows = FALSE)
observed <- file.path(dr, "brooks2015-observed.csv") |>
  read.csv(row.names = "Sample") |>
  subset(select = - Other) |>
  otu_table(taxa_are_rows = FALSE)

Note that the single-species samples present.

actual %>% 
  as_tibble %>%
  filter(.abundance == 1) %>%
  count(.otu)
# A tibble: 7 × 2
  .otu                         n
  <chr>                    <int>
1 Atopobium_vaginae            2
2 Gardnerella_vaginalis        1
3 Lactobacillus_crispatus      1
4 Lactobacillus_iners          1
5 Prevotella_bivia             1
6 Sneathia_amnii               1
7 Streptococcus_agalactiae     2

Plots

Idea is to focus on a single species and show that the error in log proportions is inconsistent, and therefore there is error in the fold changes. L. crispatus is a species MM and BC commonly use to illustrate the inconsistent error, as since it has an intermediate efficiency, we see its error varying in sign/direction. However, the errors in fold changes don’t depend on this aspect, but only the variation in mean efficiency, and is therefore the same for all species.

Data frame for comparing measured and actual proportions:

brooks_prop <- list(
  Actual = actual, 
  Measured = observed
) %>%
  map(transform_sample_counts, close_elts) %>%
  map_dfr(as_tibble, .id = 'type') %>%
  pivot_wider(names_from = type, values_from = .abundance)

brooks_fc <- list(
  Actual = actual, 
  Measured = observed
) %>%
  map(transform_sample_counts, close_elts) %>%
  map(pairwise_ratios, margin = 'samples', filter = FALSE) %>%
  map_dfr(as_tibble, .id = 'type') %>%
  pivot_wider(names_from = type, values_from = .abundance) %>%
  separate(.sample, c('sample1', 'sample2'), sep = ':') %>%
  # Filter to remove cases where Actual is 0 in one or both samples, or where
  # the numerator and denominator sample are equal
  filter(is.finite(Actual), Actual != 0, sample1 != sample2)
set.seed(42)
species <- 'Lactobacillus_crispatus'

samples <- actual %>%
  as_tibble %>%
  filter(.otu == species, .abundance > 0) %>%
  pull(.sample)

p1 <- brooks_prop %>%
  filter(.otu == species, Actual > 0) %>%
  ggplot(aes(Actual, Measured, color = .otu)) +
  scale_color_manual(values = colors_brooks) +
  scale_x_log10() +
  scale_y_log10() +
  geom_abline(color = 'grey', size = 1) +
  geom_point(position = position_jitter(width = 0.02, height = 0.02), alpha = 0.5) +
  # geom_quasirandom() +
  coord_fixed() +
  expand_limits(x = 0.1, y = 0.1) +
  labs(x = 'Actual proportion', y = 'Measured proportion', color = 'Species') +
  theme(legend.position = 'none')
p2 <- brooks_fc %>%
  filter(.otu == species, sample1 %in% samples, sample2 %in% samples) %>%
  ggplot(aes(Actual, Measured, color = .otu)) +
  scale_color_manual(values = colors_brooks) +
  scale_x_log10() +
  # scale_y_custom +
  scale_y_log10() +
  geom_abline(color = 'grey', size = 1) +
  geom_point(position = position_jitter(width = 0.02, height = 0.02), alpha = 0.5) +
  # geom_quasirandom() +
  coord_fixed() +
  labs(x = 'Actual fold change', y = 'Measured fold change', color = 'Species') +
  theme(legend.position = 'none')
# p2
(p1 + labs(x = 'Actual', y = 'Measured', title = 'Proportion')) + 
  (p2 + labs(x = 'Actual', y = 'Measured', title = 'Fold change')) + 
  plot_annotation(tag_levels = 'A')

Leopold and Busby 2020

This dataset is for DNA mocks rather than cellular mocks; however the magnitude of bias is similar. This dataset has a couple advantages for illustration purposes:

Setup

## From the metacal 2.0 tutorial
# Download data from https://zenodo.org/record/3872145
data_path <- here::here("notebook/_data", "leopold2020host")
# To use a temporary directory:
# data_path <- file.path(tempdir(), "leopold2020")
if (!dir.exists(data_path)) {
  dir.create(data_path, recursive = TRUE)
  download.file(
    "https://zenodo.org/record/3872145/files/dleopold/Populus_priorityEffects-v1.2.zip",
    file.path(data_path, "Populus_priorityEffects-v1.2.zip")
  )
  unzip(
    file.path(data_path, "Populus_priorityEffects-v1.2.zip"), 
    exdir = data_path
  )
}
#> The microbiome data is stored in a phyloseq object,
ps <- file.path(data_path, 
  "dleopold-Populus_priorityEffects-8594f7c/output/compiled/phy.rds") %>%
  readRDS

sample_data(ps) <- sample_data(ps) %>%
  transform(
    Timepoint = factor(Timepoint)
  )

mock_actual <- file.path(data_path, 
  "dleopold-Populus_priorityEffects-8594f7c/data/MockCommunities.csv") %>%
  read.csv(row.names = 1) %>%
  select(-Sym4) %>%
  as("matrix") %>%
  otu_table(taxa_are_rows = FALSE) %>%
  transform_sample_counts(function(x) close_elts(1 / x))
mock_taxa <- taxa_names(mock_actual)
sam <- sample_data(ps) %>% as("data.frame") %>% as_tibble(rownames = "Sample")
tax <- tax_table(ps) %>% as("matrix") %>% as_tibble(rownames = "Taxon")
ps.mock <- ps %>% 
  subset_samples(Samp_type == "Mock") %>%
  prune_taxa(mock_taxa, .)
leopold_actual <- mock_actual
leopold_observed <- ps.mock %>% otu_table
rm(mock_actual, mock_taxa, sam, tax, ps.mock)

Plots

leopold_prop <- list(
  Actual = leopold_actual, 
  Measured = leopold_observed
) %>%
  map(transform_sample_counts, close_elts) %>%
  map_dfr(as_tibble, .id = 'type') %>%
  pivot_wider(names_from = type, values_from = .abundance)

leopold_fc <- list(
  Actual = leopold_actual, 
  Measured = leopold_observed
) %>%
  map(transform_sample_counts, close_elts) %>%
  map(pairwise_ratios, margin = 'samples', filter = FALSE) %>%
  map_dfr(as_tibble, .id = 'type') %>%
  pivot_wider(names_from = type, values_from = .abundance) %>%
  separate(.sample, c('sample1', 'sample2'), sep = ':') %>%
  # Filter to remove cases where Actual is 0 in one or both samples, or where
  # the numerator and denominator sample are equal
  filter(is.finite(Actual), Actual != 0, sample1 != sample2)

Note, in this case all species are (nominally) in all samples, but that Epicoccum is observed in 0 reads in one sample.

species <- 'Cladosporium'
# species <- 'Alternaria'
# species <- 'Trichoderma'

p1 <- leopold_prop %>%
  filter(.otu == species, Actual > 0) %>%
  ggplot(aes(Actual, Measured, color = .otu)) +
  # scale_color_manual(values = colors_brooks) +
  scale_x_log10() +
  scale_y_log10() +
  geom_abline(color = 'grey', size = 1) +
  geom_point(position = position_jitter(width = 0.02, height = 0.02), alpha = 0.5) +
  # geom_quasirandom() +
  coord_fixed() +
  expand_limits(x = 0.1, y = 0.1) +
  labs(x = 'Actual proportion', y = 'Measured proportion', color = 'Species') +
  theme(legend.position = 'none')
p2 <- leopold_fc %>%
  filter(.otu == species) %>%
  ggplot(aes(Actual, Measured, color = .otu)) +
  # scale_color_manual(values = colors_brooks) +
  scale_x_log10() +
  # scale_y_custom +
  scale_y_log10() +
  geom_abline(color = 'grey', size = 1) +
  geom_point(position = position_jitter(width = 0.02, height = 0.02), alpha = 0.5) +
  # geom_quasirandom() +
  coord_fixed() +
  labs(x = 'Actual fold change', y = 'Measured fold change', color = 'Species') +
  theme(legend.position = 'none')
p1 + p2

Same but for the ratios between two species:

x <- list(
  Actual = leopold_actual, 
  Measured = leopold_observed
) %>%
  map(transform_sample_counts, close_elts) %>%
  map(pairwise_ratios, margin = 'taxa', filter = FALSE)

leopold_ratio <- x %>%
  map_dfr(as_tibble, .id = 'type') %>%
  pivot_wider(names_from = type, values_from = .abundance) %>%
  separate(.otu, c('otu1', 'otu2'), sep = ':') %>%
  filter(otu1 != otu2)

leopold_ratio_fc <- x %>%
  map(pairwise_ratios, margin = 'samples', filter = FALSE) %>%
  map_dfr(as_tibble, .id = 'type') %>%
  pivot_wider(names_from = type, values_from = .abundance) %>%
  separate(.otu, c('otu1', 'otu2'), sep = ':') %>%
  separate(.sample, c('sample1', 'sample2'), sep = ':') %>%
  # Filter to remove cases where Actual is 0 in one or both samples, or where
  # the numerator and denominator sample are equal
  filter(otu1 != otu2, sample1 != sample2)
species1 <- 'Cladosporium'
species2 <- 'Fusarium'
# species2 <- 'Melampsora'
# species <- 'Trichoderma'

p3 <- leopold_ratio %>%
  filter(otu1 == species1, otu2 == species2) %>%
  ggplot(aes(Actual, Measured)) +
  # scale_color_manual(values = colors_brooks) +
  scale_x_log10() +
  scale_y_log10() +
  geom_abline(color = 'grey', size = 1) +
  geom_point(position = position_jitter(width = 0.02, height = 0.02), alpha = 0.5) +
  # geom_quasirandom() +
  coord_fixed() +
  expand_limits(x = 0.1, y = 0.1) +
  labs(x = 'Actual ratio', y = 'Measured ratio', color = 'Species') +
  theme(legend.position = 'none')
p4 <- leopold_ratio_fc %>%
  filter(otu1 == species1, otu2 == species2) %>%
  ggplot(aes(Actual, Measured)) +
  # scale_color_manual(values = colors_brooks) +
  scale_x_log10() +
  # scale_y_custom +
  scale_y_log10() +
  geom_abline(color = 'grey', size = 1) +
  geom_point(position = position_jitter(width = 0.02, height = 0.02), alpha = 0.5) +
  # geom_quasirandom() +
  coord_fixed() +
  labs(x = 'Actual fold change', y = 'Measured fold change', color = 'Species') +
  theme(legend.position = 'none')
(p1 + p2) / (p3 + p4)

Note, the ratio-view isn’t always obviously better; e.g. when Mel is involved; perhaps because of noise?

To make things more directly comparable, it might be best to use faceting and to fix the axes to have the same span.

Session info

Click for session info
sessioninfo::session_info()
─ Session info ──────────────────────────────────────────────────────────────────
 setting  value                       
 version  R version 4.1.1 (2021-08-10)
 os       Arch Linux                  
 system   x86_64, linux-gnu           
 ui       X11                         
 language (EN)                        
 collate  en_US.UTF-8                 
 ctype    en_US.UTF-8                 
 tz       America/New_York            
 date     2021-10-27                  

─ Packages ──────────────────────────────────────────────────────────────────────
 package          * version    date       lib source                           
 ade4               1.7-18     2021-09-16 [1] CRAN (R 4.1.1)                   
 ape                5.5        2021-04-25 [1] CRAN (R 4.1.0)                   
 assertthat         0.2.1      2019-03-21 [1] CRAN (R 4.0.0)                   
 backports          1.2.1      2020-12-09 [1] CRAN (R 4.0.3)                   
 beeswarm           0.4.0      2021-06-01 [1] CRAN (R 4.1.0)                   
 Biobase            2.52.0     2021-05-19 [1] Bioconductor                     
 BiocGenerics       0.38.0     2021-05-19 [1] Bioconductor                     
 biomformat         1.20.0     2021-05-19 [1] Bioconductor                     
 Biostrings         2.60.1     2021-06-06 [1] Bioconductor                     
 bitops             1.0-7      2021-04-24 [1] CRAN (R 4.1.0)                   
 bookdown           0.24       2021-09-02 [1] CRAN (R 4.1.1)                   
 broom              0.7.9      2021-07-27 [1] CRAN (R 4.1.0)                   
 bslib              0.3.1      2021-10-06 [1] CRAN (R 4.1.1)                   
 cellranger         1.1.0      2016-07-27 [1] CRAN (R 4.0.0)                   
 cli                3.0.1      2021-07-17 [1] CRAN (R 4.1.0)                   
 cluster            2.1.2      2021-04-17 [2] CRAN (R 4.1.1)                   
 codetools          0.2-18     2020-11-04 [2] CRAN (R 4.1.1)                   
 colorspace         2.0-2      2021-08-11 [1] R-Forge (R 4.1.1)                
 cowplot          * 1.1.1      2021-08-27 [1] Github (wilkelab/cowplot@555c9ae)
 crayon             1.4.1      2021-02-08 [1] CRAN (R 4.0.4)                   
 data.table         1.14.2     2021-09-27 [1] CRAN (R 4.1.1)                   
 DBI                1.1.1      2021-01-15 [1] CRAN (R 4.0.4)                   
 dbplyr             2.1.1      2021-04-06 [1] CRAN (R 4.0.5)                   
 digest             0.6.28     2021-09-23 [1] CRAN (R 4.1.1)                   
 distill            1.3        2021-10-13 [1] CRAN (R 4.1.1)                   
 downlit            0.2.1      2020-11-04 [1] CRAN (R 4.0.3)                   
 dplyr            * 1.0.7      2021-06-18 [1] CRAN (R 4.1.0)                   
 ellipsis           0.3.2      2021-04-29 [1] CRAN (R 4.1.0)                   
 evaluate           0.14       2019-05-28 [1] CRAN (R 4.0.0)                   
 fansi              0.5.0      2021-05-25 [1] CRAN (R 4.1.0)                   
 farver             2.1.0      2021-02-28 [1] CRAN (R 4.0.4)                   
 fastmap            1.1.0      2021-01-25 [1] CRAN (R 4.0.4)                   
 forcats          * 0.5.1      2021-01-27 [1] CRAN (R 4.0.4)                   
 foreach            1.5.1      2020-10-15 [1] CRAN (R 4.0.3)                   
 fs               * 1.5.0      2020-07-31 [1] CRAN (R 4.0.2)                   
 generics           0.1.1      2021-10-25 [1] CRAN (R 4.1.1)                   
 GenomeInfoDb       1.28.1     2021-07-01 [1] Bioconductor                     
 GenomeInfoDbData   1.2.6      2021-05-31 [1] Bioconductor                     
 ggbeeswarm       * 0.6.0      2017-08-07 [1] CRAN (R 4.0.0)                   
 ggplot2          * 3.3.5      2021-06-25 [1] CRAN (R 4.1.0)                   
 glue               1.4.2      2020-08-27 [1] CRAN (R 4.0.2)                   
 gtable             0.3.0      2019-03-25 [1] CRAN (R 4.0.0)                   
 haven              2.4.3      2021-08-04 [1] CRAN (R 4.1.1)                   
 here               1.0.1      2020-12-13 [1] CRAN (R 4.0.5)                   
 highr              0.9        2021-04-16 [1] CRAN (R 4.1.0)                   
 hms                1.1.1      2021-09-26 [1] CRAN (R 4.1.1)                   
 htmltools          0.5.2      2021-08-25 [1] CRAN (R 4.1.1)                   
 httr               1.4.2      2020-07-20 [1] CRAN (R 4.0.2)                   
 igraph             1.2.7      2021-10-15 [1] CRAN (R 4.1.1)                   
 IRanges            2.26.0     2021-05-19 [1] Bioconductor                     
 iterators          1.0.13     2020-10-15 [1] CRAN (R 4.0.3)                   
 jquerylib          0.1.4      2021-04-26 [1] CRAN (R 4.1.0)                   
 jsonlite           1.7.2      2020-12-09 [1] CRAN (R 4.0.3)                   
 knitr              1.36       2021-09-29 [1] CRAN (R 4.1.1)                   
 lattice            0.20-44    2021-05-02 [2] CRAN (R 4.1.1)                   
 lifecycle          1.0.1      2021-09-24 [1] CRAN (R 4.1.1)                   
 lubridate          1.8.0      2021-10-07 [1] CRAN (R 4.1.1)                   
 magrittr           2.0.1      2020-11-17 [1] CRAN (R 4.0.3)                   
 MASS               7.3-54     2021-05-03 [2] CRAN (R 4.1.1)                   
 Matrix             1.3-4      2021-06-01 [2] CRAN (R 4.1.1)                   
 metacal          * 0.2.0.9005 2021-10-04 [1] Github (mikemc/metacal@773cbf3)  
 mgcv               1.8-36     2021-06-01 [2] CRAN (R 4.1.1)                   
 modelr             0.1.8      2020-05-19 [1] CRAN (R 4.0.0)                   
 multtest           2.48.0     2021-05-19 [1] Bioconductor                     
 munsell            0.5.0      2018-06-12 [1] CRAN (R 4.0.0)                   
 nlme               3.1-152    2021-02-04 [2] CRAN (R 4.1.1)                   
 nvimcom          * 0.9-102    2021-10-25 [1] local                            
 patchwork        * 1.1.1      2020-12-17 [1] CRAN (R 4.0.3)                   
 permute            0.9-5      2019-03-12 [1] CRAN (R 4.0.0)                   
 phyloseq         * 1.36.0     2021-05-19 [1] Bioconductor                     
 pillar             1.6.4      2021-10-18 [1] CRAN (R 4.1.1)                   
 pkgconfig          2.0.3      2019-09-22 [1] CRAN (R 4.0.0)                   
 plyr               1.8.6      2020-03-03 [1] CRAN (R 4.0.0)                   
 purrr            * 0.3.4      2020-04-17 [1] CRAN (R 4.0.0)                   
 R6                 2.5.1      2021-08-19 [1] CRAN (R 4.1.1)                   
 Rcpp               1.0.7      2021-07-07 [1] CRAN (R 4.1.0)                   
 RCurl              1.98-1.5   2021-09-17 [1] CRAN (R 4.1.1)                   
 readr            * 2.0.2      2021-09-27 [1] CRAN (R 4.1.1)                   
 readxl             1.3.1      2019-03-13 [1] CRAN (R 4.0.0)                   
 reprex             2.0.1      2021-08-05 [1] CRAN (R 4.1.1)                   
 reshape2           1.4.4      2020-04-09 [1] CRAN (R 4.0.0)                   
 rhdf5              2.36.0     2021-05-19 [1] Bioconductor                     
 rhdf5filters       1.4.0      2021-05-19 [1] Bioconductor                     
 Rhdf5lib           1.14.2     2021-07-06 [1] Bioconductor                     
 rlang              0.4.12     2021-10-18 [1] CRAN (R 4.1.1)                   
 rmarkdown        * 2.11       2021-09-14 [1] CRAN (R 4.1.1)                   
 rprojroot          2.0.2      2020-11-15 [1] CRAN (R 4.0.3)                   
 rstudioapi         0.13       2020-11-12 [1] CRAN (R 4.0.3)                   
 rvest              1.0.2      2021-10-16 [1] CRAN (R 4.1.1)                   
 S4Vectors          0.30.0     2021-05-19 [1] Bioconductor                     
 sass               0.4.0      2021-05-12 [1] CRAN (R 4.1.0)                   
 scales           * 1.1.1      2020-05-11 [1] CRAN (R 4.0.0)                   
 sessioninfo        1.1.1      2018-11-05 [1] CRAN (R 4.0.0)                   
 speedyseq        * 0.5.3.9018 2021-06-29 [1] Github (mikemc/speedyseq@ceb941f)
 stringi            1.7.5      2021-10-04 [1] CRAN (R 4.1.1)                   
 stringr          * 1.4.0      2019-02-10 [1] CRAN (R 4.0.0)                   
 survival           3.2-11     2021-04-26 [2] CRAN (R 4.1.1)                   
 tibble           * 3.1.5      2021-09-30 [1] CRAN (R 4.1.1)                   
 tidyr            * 1.1.4      2021-09-27 [1] CRAN (R 4.1.1)                   
 tidyselect         1.1.1      2021-04-30 [1] CRAN (R 4.1.0)                   
 tidyverse        * 1.3.1      2021-04-15 [1] CRAN (R 4.1.0)                   
 tzdb               0.1.2      2021-07-20 [1] CRAN (R 4.1.0)                   
 useful             1.2.6      2018-10-08 [1] CRAN (R 4.0.0)                   
 utf8               1.2.2      2021-07-24 [1] CRAN (R 4.1.0)                   
 vctrs              0.3.8      2021-04-29 [1] CRAN (R 4.1.0)                   
 vegan              2.5-7      2020-11-28 [1] CRAN (R 4.0.3)                   
 vipor              0.4.5      2017-03-22 [1] CRAN (R 4.0.0)                   
 withr              2.4.2      2021-04-18 [1] CRAN (R 4.0.5)                   
 xfun               0.27       2021-10-18 [1] CRAN (R 4.1.1)                   
 xml2               1.3.2      2020-04-23 [1] CRAN (R 4.0.0)                   
 XVector            0.32.0     2021-05-19 [1] Bioconductor                     
 yaml               2.2.1      2020-02-01 [1] CRAN (R 4.0.0)                   
 zlibbioc           1.38.0     2021-05-19 [1] Bioconductor                     

[1] /home/michael/.local/lib/R/library
[2] /usr/lib/R/library