Illustrate calibration types in the mock communities from Brooks et al (2015)

ref:brooks2015thet
Michael R. McLaren true
2021-10-25

Setup

# Tools for microbiome data
library(speedyseq)
# Tools for general purpose data manipulation and plotting
library(tidyverse)
library(fs)
# ggplot helpers
library(ggbeeswarm)
library(cowplot)
library(patchwork)
library(scales)
theme_set(theme_cowplot())
# stats helpers
# library(broom)

library(metacal); packageVersion("metacal")
[1] '0.2.0.9005'
colors_brooks <- c(
  "Atopobium_vaginae" = "#009E73",
  "Gardnerella_vaginalis" = "#56B4E9",
  "Lactobacillus_crispatus" = "#D55E00",
  "Lactobacillus_iners" = "#505050",
  "Prevotella_bivia" = "#0072B2",
  "Sneathia_amnii" = "#CC79A7",
  "Streptococcus_agalactiae" = "#E69F00")

scale_y_custom <- scale_y_continuous(
    trans = 'log10',
    breaks = trans_breaks('log10', function(x) 10^x),
    labels = trans_format('log10', math_format(10^.x))
  )
# Load data from the cellular mock communities of Brooks et al 2015
dr <- system.file("extdata", package = "metacal")
list.files(dr)
[1] "brooks2015-actual.csv"      "brooks2015-observed.csv"   
[3] "brooks2015-sample-data.csv"
actual <- file.path(dr, "brooks2015-actual.csv") |>
  read.csv(row.names = "Sample") |>
  otu_table(taxa_are_rows = FALSE)
observed <- file.path(dr, "brooks2015-observed.csv") |>
  read.csv(row.names = "Sample") |>
  subset(select = - Other) |>
  otu_table(taxa_are_rows = FALSE)

# Estimate bias with bootstrapping for error estimation
mc_fit <- estimate_bias(observed, actual, boot = TRUE)
control_species <- mc_fit %>% coef %>% names
summary(mc_fit)
Summary of a metacal bias fit.

Estimated relative efficiencies:
# A tibble: 7 × 4
  taxon                    estimate gm_mean gm_se
  <chr>                       <dbl>   <dbl> <dbl>
1 Atopobium_vaginae           0.285   0.285  1.04
2 Gardnerella_vaginalis       0.160   0.160  1.05
3 Lactobacillus_crispatus     2.29    2.29   1.03
4 Lactobacillus_iners         4.68    4.69   1.02
5 Prevotella_bivia            1.79    1.78   1.04
6 Sneathia_amnii              4.59    4.59   1.04
7 Streptococcus_agalactiae    0.250   0.250  1.03

Geometric standard error estimated from 1000 bootstrap replicates.

are the single-species samples present?

actual %>% 
  as_tibble %>%
  filter(.abundance == 1) %>%
  count(.otu)
# A tibble: 7 × 2
  .otu                         n
  <chr>                    <int>
1 Atopobium_vaginae            2
2 Gardnerella_vaginalis        1
3 Lactobacillus_crispatus      1
4 Lactobacillus_iners          1
5 Prevotella_bivia             1
6 Sneathia_amnii               1
7 Streptococcus_agalactiae     2

Yes.

Calibration

Calibration from community control

Demonstrate that calibration using a single mock community can create a substantial improvement.

smpls <- actual %>%
  as_tibble %>%
  with_groups(.sample, summarize, n_species = sum(.abundance > 0)) %>%
  filter(n_species == 7) %>%
  pull(.sample)
control_samples <- smpls[1]
bias_from_control <- estimate_bias(
  observed %>% prune_samples(control_samples, .),
  actual %>% prune_samples(control_samples, .),
  ) %>%
  coef
calibrated <- calibrate(observed, bias_from_control)

Calibration using a reference species

One way to do this is to get the correction for each pair of samples, then join it with the table from previous. Another is to make the calibrated OTU table, then redo the pairwise ratio stuff.

Basic Idea: Multiply the ratio of focal to ref species by the true proportion of the ref. One way to achieve this with phyloseq objects is to construct the correction matrix, then multiply this by the observed proportions

ref_species <- 'Lactobacillus_crispatus'

ref_actual <- actual %>% prune_taxa(ref_species, .) %>% c
ref_observed <- observed %>% prune_taxa(ref_species, .) %>% c

correction_matrix <- matrix(
  ref_actual / ref_observed, 
  nrow = nsamples(observed), ncol= ntaxa(observed),
  byrow = FALSE
)

calibrated_ref <- otu_table(observed * correction_matrix, taxa_are_rows = FALSE)

Plot comparing calibrated and uncalibrated fold changes

x <- list(
  Actual = actual, 
  Uncalibrated = observed %>% transform_sample_counts(close_elts),
  Calibrated_community = calibrated,
  Calibrated_reference = calibrated_ref
  ) %>%
  map(pairwise_ratios, margin = 'samples', filter = FALSE) %>%
  map_dfr(as_tibble, .id = 'type')
lvls <- mc_fit %>% coef %>% sort %>% names

brooks2015_fcs <- x %>%
  pivot_wider(names_from = type, values_from = .abundance) %>%
  pivot_longer(contains('calibrated'), 
    names_to = 'type', values_to = 'Measured'
  ) %>%
  separate(.sample, c('sample1', 'sample2'), sep = ':') %>%
  # Filter to remove cases where Actual is 0 in one or both samples, or where
  # the numerator and denominator sample are equal
  filter(is.finite(Actual), Actual != 0, sample1 != sample2) %>%
  mutate(
    across(type, fct_relevel, 'Uncalibrated'),
    across(
      type, fct_recode,
      'Calibrated (community)' = 'Calibrated_community',
      'Calibrated (reference)' = 'Calibrated_reference'
      ),
    across(.otu, factor, levels = rev(lvls)),
    # across(.otu, fct_relabel, adjust_species_names)
  )

Check on number of rows in these data frames,

# expected in 'x'
n1 <- observed %>% nsamples %>% print
[1] 80
n2 <- observed %>% ntaxa
all.equal(n1 * n1 * n2 * 4, x %>% nrow)
[1] TRUE
x %>% nrow
[1] 179200
brooks2015_fcs %>% nrow
[1] 17586

Let’s plot just the sample-pairs with L. crispatus present, so that the same samples are shown in all panels.

# Find subset of pairs to plot
sample_pairs <- brooks2015_fcs %>%
  filter(is.finite(Actual), Actual > 0, is.finite(Measured), Measured > 0) %>%
  select(sample1, sample2) %>%
  unite(pair, sample1, sample2) %>%
  distinct %>%
  pull(pair)

# Species labels
species_label <- function(species) {
  species %>%
    str_replace('(?<=^[A-Z])[a-z]+_', '. ')
}
species_label(lvls)
[1] "G. vaginalis"  "S. agalactiae" "A. vaginae"    "P. bivia"     
[5] "L. crispatus"  "S. amnii"      "L. iners"     
p <- brooks2015_fcs %>%
  unite(pair, sample1, sample2) %>%
  filter(pair %in% sample_pairs, is.finite(Actual), Actual > 0, is.finite(Measured), Measured > 0) %>%
  ggplot(aes(Actual, Measured, color = .otu)) +
  scale_color_manual(
    values = colors_brooks, 
    labels = species_label(colors_brooks %>% names)
  ) +
  facet_wrap(~type) +
  scale_x_log10() +
  scale_y_custom +
  geom_abline(color = 'grey', size = 1) +
  geom_point(position = position_jitter(), alpha = 0.5) +
  coord_fixed() +
  labs(x = 'Actual fold change', y = 'Measured fold change', color = 'Species') +
  theme(legend.position = 'bottom')
p