R setup

Run with SAVE_FIGURES = TRUE to save figures in figures/.

SAVE_FIGURES = TRUE

Libraries and paths

library(here)
library(tidyverse)
library(ggthemes)
library(cowplot); theme_set(theme_grey()) 
library(ggbeeswarm)
# Functions for bias estimation and calibration
library(metacal); packageVersion("metacal")

## [1] '0.1.0'

# This package
devtools::load_all(here())

Options for ggplot:

base_theme <- theme_tufte() + 
    theme(
        text = element_text(size=9, family = ""),
        strip.text = element_text(size=9, family = ""),
        # axis.text = element_text(size=8, family = ""),
        legend.position = "none"
        )
tax_theme <- theme(
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), 
        axis.title.x = element_blank())

update_geom_defaults("point", list(size = 1))
update_geom_defaults("text", list(size = 2.5))
update_geom_defaults("hline", list(color = "grey"))
update_geom_defaults("abline", list(color = "grey"))
# Colors for color scales
colors.OkabeIto <-  tribble(
    ~Name, ~Hex,
    "orange", "#E69F00",
    "sky blue", "#56B4E9",
    "bluish green", "#009E73",
    # "yellow", "#F0E442",
    "blue", "#0072B2",
    "vermilion", "#D55E00",
    "reddish purple", "#CC79A7",
    "black", "#000000",
    # "grey", "#999999",
    "yellow", "#F0E442",
    )
colors.taxon <- c(
    "Atopobium_vaginae" = "#009E73",
    "Gardnerella_vaginalis" = "#56B4E9",
    "Lactobacillus_crispatus" = "#D55E00",
    # "Lactobacillus_iners" = "#000000",
    "Lactobacillus_iners" = "#505050",
    "Prevotella_bivia" = "#0072B2",
    "Sneathia_amnii" = "#CC79A7",
    "Streptococcus_agalactiae" = "#E69F00") 
colors.num_species <- c(
    "2" = "#E69F00",
    "3" = "#56B4E9",
    "4" = "#CC79A7",
    # "7" = "#000000"
    "7" = "#505050"
    )
# Width of figures in elife latex template pdf when width=\fullwidth is used
# elife_width = 5.626

Load and inspect the data and prep for analysis

Data from the Brooks2015 supplement; for details and code used to obtain this data, see data-raw/brooks2015.R.

data("brooks2015_sample_data")
data("brooks2015_counts")
data("brooks2015_species_info")

Sample data

The key sample metadata variables are the Mixture type (or experiment), the number of species in the mixture, and the species in each mixture.

sam <- brooks2015_sample_data
print(sam, n = 10)

## # A tibble: 240 x 6
##    Sample Plate Barcode Mixture_type Num_species Species_list              
##    <chr>  <dbl>   <dbl> <chr>              <int> <chr>                     
##  1 s1-1       1       1 Cells                  3 Lactobacillus_crispatus;P…
##  2 s1-2       1       2 Cells                  3 Gardnerella_vaginalis;Ato…
##  3 s1-3       1       3 Cells                  3 Gardnerella_vaginalis;Ato…
##  4 s1-4       1       4 Cells                  3 Atopobium_vaginae;Lactoba…
##  5 s1-5       1       5 Cells                  2 Gardnerella_vaginalis;Pre…
##  6 s1-6       1       6 Cells                  3 Atopobium_vaginae;Lactoba…
##  7 s1-7       1       7 Cells                  2 Atopobium_vaginae;Strepto…
##  8 s1-8       1       8 Cells                  2 Gardnerella_vaginalis;Str…
##  9 s1-9       1       9 Cells                  3 Atopobium_vaginae;Sneathi…
## 10 s1-10      1      10 Cells                  3 Atopobium_vaginae;Lactoba…
## # … with 230 more rows

There are 80 samples for each of the three mixture types:

sam %>%
    group_by(Mixture_type) %>%
    summarize(Num_samples = n())

## # A tibble: 3 x 2
##   Mixture_type Num_samples
##   <chr>              <int>
## 1 Cells                 80
## 2 DNA                   80
## 3 PCR_product           80

We’ll ultimately just be using the 71 of 80 that contain two or more species,

sam %>%
    group_by(Mixture_type) %>%
    filter(Num_species > 1) %>%
    summarize(Num_samples = n())

## # A tibble: 3 x 2
##   Mixture_type Num_samples
##   <chr>              <int>
## 1 Cells                 71
## 2 DNA                   71
## 3 PCR_product           71

Distinct compositions:

sam %>%
    select(Mixture_type, Num_species, Species_list) %>%
    distinct() %>%
    group_by(Mixture_type) %>%
    filter(Num_species > 1) %>%
    summarize(Num_samples = n())

## # A tibble: 3 x 2
##   Mixture_type Num_samples
##   <chr>              <int>
## 1 Cells                 58
## 2 DNA                   58
## 3 PCR_product           58

Count data

The count data includes reads from Brooks2015’s above- and below-threshold tables, specifying reads that were classified above or below a 97% sequence identity threshold to 16S sequences in their database.

print(brooks2015_counts, n = 10)

## # A tibble: 3,840 x 4
##    Sample Taxon                   Table Count
##    <chr>  <chr>                   <chr> <dbl>
##  1 s1-1   Atopobium_vaginae       above     1
##  2 s1-1   Atopobium_vaginae       below     2
##  3 s1-1   Gardnerella_vaginalis   above     1
##  4 s1-1   Gardnerella_vaginalis   below     0
##  5 s1-1   Lactobacillus_crispatus above 13670
##  6 s1-1   Lactobacillus_crispatus below   540
##  7 s1-1   Lactobacillus_iners     above     0
##  8 s1-1   Lactobacillus_iners     below     0
##  9 s1-1   Other                   above     7
## 10 s1-1   Other                   below    12
## # … with 3,830 more rows

The vast majority of reads were classified above threshold,

brooks2015_counts %>%
    group_by(Table) %>%
    summarize(Sum = sum(Count)) %>%
    mutate(Proportion = Sum / sum(Sum))

## # A tibble: 2 x 3
##   Table     Sum Proportion
##   <chr>   <dbl>      <dbl>
## 1 above 3668922     0.935 
## 2 below  253078     0.0645

In preparing this dataframe, we have grouped all reads classified to non-mock taxa as “Other”. The vast majority of above and below threshold reads were classified to the mock taxa,

brooks2015_counts %>%
    group_by(Table, Taxon != "Other") %>%
    summarize(Sum = sum(Count)) %>%
    mutate(Proportion = Sum / sum(Sum))

## # A tibble: 4 x 4
## # Groups:   Table [2]
##   Table `Taxon != "Other"`     Sum Proportion
##   <chr> <lgl>                <dbl>      <dbl>
## 1 above FALSE                  733   0.000200
## 2 above TRUE               3668189   1.000   
## 3 below FALSE                 2279   0.00901 
## 4 below TRUE                250799   0.991

We will follow Brooks2015 in using the above threshold reads only, and will restrict to just the mock taxa. We will also create a new dataframe that will be our main data frame for our analysis going forward.

main <- brooks2015_counts %>%
    filter(Table == "above", Taxon != "Other") %>%
    select(Sample, Taxon, Count)

Let’s add the sample data,

main <- main %>%
    left_join(sam, by = "Sample")

and a column for whether the species is expected to be in the sample.

main <- main %>%
    mutate(Expected = str_detect(Species_list, Taxon))

Species info

Estimated genome size and 16S copy number, obtained in analysis/brooks2015-species-info.Rmd.

species_info <- brooks2015_species_info
taxa <- species_info$Taxon
species_info

## # A tibble: 7 x 3
##   Taxon                    Genome_size Copy_number
##   <chr>                          <dbl>       <dbl>
## 1 Atopobium_vaginae           1440070.           2
## 2 Gardnerella_vaginalis       1642448.           2
## 3 Lactobacillus_crispatus     2043161            4
## 4 Lactobacillus_iners         1277649            5
## 5 Prevotella_bivia            2521238            4
## 6 Sneathia_amnii              1330224            3
## 7 Streptococcus_agalactiae    2160267            7

Expected copy-number bias for the three experiments:

cnbias <- species_info %>%
    expand(Mixture_type = sam$Mixture_type, 
        nesting(Taxon, Genome_size, Copy_number)) %>%
    mutate(CN_bias = case_when(
            Mixture_type == "Cells" ~ Copy_number,
            Mixture_type == "DNA" ~ Copy_number / Genome_size,
            Mixture_type == "PCR_product" ~ 1,
        )
    ) %>%
    mutate_by(Mixture_type, CN_bias = center_elts(CN_bias))
print(cnbias, n = Inf)

## # A tibble: 21 x 5
##    Mixture_type Taxon                    Genome_size Copy_number CN_bias
##    <chr>        <chr>                          <dbl>       <dbl>   <dbl>
##  1 Cells        Atopobium_vaginae           1440070.           2   0.568
##  2 Cells        Gardnerella_vaginalis       1642448.           2   0.568
##  3 Cells        Lactobacillus_crispatus     2043161            4   1.14 
##  4 Cells        Lactobacillus_iners         1277649            5   1.42 
##  5 Cells        Prevotella_bivia            2521238            4   1.14 
##  6 Cells        Sneathia_amnii              1330224            3   0.852
##  7 Cells        Streptococcus_agalactiae    2160267            7   1.99 
##  8 DNA          Atopobium_vaginae           1440070.           2   0.679
##  9 DNA          Gardnerella_vaginalis       1642448.           2   0.595
## 10 DNA          Lactobacillus_crispatus     2043161            4   0.957
## 11 DNA          Lactobacillus_iners         1277649            5   1.91 
## 12 DNA          Prevotella_bivia            2521238            4   0.775
## 13 DNA          Sneathia_amnii              1330224            3   1.10 
## 14 DNA          Streptococcus_agalactiae    2160267            7   1.58 
## 15 PCR_product  Atopobium_vaginae           1440070.           2   1    
## 16 PCR_product  Gardnerella_vaginalis       1642448.           2   1    
## 17 PCR_product  Lactobacillus_crispatus     2043161            4   1    
## 18 PCR_product  Lactobacillus_iners         1277649            5   1    
## 19 PCR_product  Prevotella_bivia            2521238            4   1    
## 20 PCR_product  Sneathia_amnii              1330224            3   1    
## 21 PCR_product  Streptococcus_agalactiae    2160267            7   1

Inspecting and removing cross-sample contaminant reads

Many samples contain reads that are assigned to taxa not supposed to be in the mixture.

main %>%
    group_by(Sample, Expected) %>%
    summarize(Sum = sum(Count)) %>%
    ungroup %>%
    filter(!Expected) %>%
    summarize(Frac = mean(Sum > 0)) %>%
    {paste("The fraction of samples with out-of-sample reads is", .$Frac)}

## [1] "The fraction of samples with out-of-sample reads is 0.97008547008547"

main %>%
    group_by(Mixture_type, Num_species, Sample, Expected) %>%
    summarize(Sum = sum(Count)) %>%
    mutate_by(Sample, Prop = Sum / sum(Sum)) %>%
    filter(!Expected) %>%
    ggplot(aes(Prop, fill = as.factor(Num_species))) +
    geom_dotplot() +
    scale_x_log10() +
    facet_grid(Mixture_type~.) +
    labs(x = "Fraction of out-of-sample reads", fill = "# species")

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Removed 7 rows containing non-finite values (stat_bindot).

Note, no 7-species samples appear as their fraction is always zero. The fraction of out-of-sample reads can be quite high in the DNA mixtures and a few PCR mixtures. But the fraction is still generally quite low, and we suspect most to all cases are due to cross-sample contamination during handling and sequencing (e.g., index/barcode hopping), and not sample mislabeling or mis-construction.

Prep for main analysis

Our main analysis will use Observed and Actual variables, with the following simplifications. Our downstream analysis requires that the samples only have reads from taxa actually in the samples. We will therefore remove the out-of-sample reads and treat the actual compositions as an even mixture of the expected taxa. We will also ignore the extra precision information in the observed read counts, and treat Observed as well as Actual as compositional vectors. We will further filter to samples with more than one species. We will generally store relative abundances as proportions for plotting purposes (we use the close_elts function within samples to convert compositional vectors to vectors of proportions).

main0 <- main %>%
    filter(Expected, Num_species > 1) %>%
    mutate_by(Sample, 
        Observed = close_elts(Count),
        Actual = close_elts(Expected),
        Error = Observed / Actual) %>%
    select(-Count, -Expected) %>%
    select(Sample, Taxon, Observed, Actual, everything())

The Observed and Actual compositions are normalized to proportions for plotting in the next section.

Add copy-number bias for each taxon + experiment pair:

main0 <- main0 %>%
    left_join(cnbias %>% select(Mixture_type, Taxon, CN_bias),
        by = c("Mixture_type", "Taxon"))
print(main0, n = 10)

## # A tibble: 591 x 11
##    Sample Taxon Observed Actual Plate Barcode Mixture_type Num_species
##    <chr>  <chr>    <dbl>  <dbl> <dbl>   <dbl> <chr>              <int>
##  1 s1-1   Lact…   0.602   0.333     1       1 Cells                  3
##  2 s1-1   Prev…   0.332   0.333     1       1 Cells                  3
##  3 s1-1   Stre…   0.0663  0.333     1       1 Cells                  3
##  4 s1-10  Atop…   0.0391  0.333     1      10 Cells                  3
##  5 s1-10  Lact…   0.392   0.333     1      10 Cells                  3
##  6 s1-10  Snea…   0.569   0.333     1      10 Cells                  3
##  7 s1-11  Gard…   0.0154  0.333     1      11 Cells                  3
##  8 s1-11  Lact…   0.337   0.333     1      11 Cells                  3
##  9 s1-11  Lact…   0.647   0.333     1      11 Cells                  3
## 10 s1-12  Atop…   0.0616  0.5       1      12 Cells                  2
## # … with 581 more rows, and 3 more variables: Species_list <chr>,
## #   Error <dbl>, CN_bias <dbl>

View the errors

Let’s take a look at the observed vs. actual proportions, taking a log-odds transform to avoid compressing the variation near p=0 and p=1.

main0 %>%
    mutate_at(vars(Actual, Observed), logit) %>%
    ggplot(aes(Actual, Observed, color = Taxon)) + 
    geom_quasirandom() +
    facet_grid(Mixture_type ~ .) +
    geom_rangeframe(sides = "bl", color= "black") +
    scale_color_manual(values = colors.taxon, labels = tax_labeller) + 
    labs(title = "Observed vs. Actual (log-odds)")

There is significant error. Moreover, it is not consistent even for a particular species and actual abundance (except for the 7-species mixtures), as our model predicts:

main0 %>%
    mutate_at(vars(Actual, Observed), logit) %>%
    ggplot(aes(Actual, Observed, color = Taxon)) + 
    geom_quasirandom() +
    facet_grid(Mixture_type ~ Taxon) +
    geom_rangeframe(sides = "bl", color= "black") +
    labs(title = "Observed vs. Actual (log-odds)") +
    scale_color_manual(values = colors.taxon, labels = tax_labeller) + 
    theme(legend.position = "none")

What about after copy-number correction?

main0 <- main0 %>%
    mutate_by(Sample, CN_corrected = close_elts(Observed / CN_bias))

main0 %>%
    mutate_at(vars(Actual, CN_corrected), logit) %>%
    ggplot(aes(Actual, CN_corrected, color = Taxon)) + 
    geom_quasirandom() +
    facet_grid(Mixture_type ~ .) +
    geom_rangeframe(sides = "bl", color= "black") +
    scale_color_manual(values = colors.taxon, labels = tax_labeller) + 
    labs(title = "Observed with CN correction vs. Actual (log-odds)")

The error in the DNA mixtures looks to be reduced. We return to the efficacy of CN correction later on.

Now let’s look at the taxon ratios, which we predict should show a consistent error. To do so, we first need a data frame with all the pairwise ratios for each sample,

gvs <- c("Mixture_type", "Sample", "Num_species")
ratios <- main0 %>%
    select(-Plate, -Barcode) %>%
    compute_ratios(group_vars = gvs) %>%
    filter(!is.na(Actual)) %>%
    mutate(Pair = paste(Taxon.x, Taxon.y, sep = ":"))

Pick a non-redundant set of pairs for plotting,

cmb <- combn(taxa, 2, simplify = FALSE) %>% 
    map_chr( ~paste(.[1], .[2], sep = ":"))

and a reduced table for plotting the copy-number prediction (which only depends on the taxa pair and not the sample)

cn_ratios <- ratios %>%
    select(Mixture_type, Pair, CN_bias) %>%
    distinct %>%
    filter(Pair %in% cmb)

Plot the error ratios in along with the error predicted by 16S CN (dark red cross),

p.pw <- ratios %>% 
    filter(Pair %in% cmb) %>%
    ggplot(aes(Pair, Error, color = as.factor(Num_species))) +
    geom_hline(yintercept = 1) +
    facet_grid(Mixture_type ~ .) +
    geom_rangeframe(sides = "l", color= "black") + 
    scale_y_log10() +
    scale_x_discrete(labels = tax_labeller) + 
    scale_color_manual(values = colors.num_species) +
    tax_theme +
    theme(legend.position = "bottom")
p.pw +
    geom_point(data = cn_ratios, aes(Pair, CN_bias), 
        color = "darkred", shape = 3, size = 3) +
    geom_quasirandom()

The error in ratios is consistent across samples, as our model predicts.

Bias estimation

Estimate bias w/ standard error,

set.seed(20190517)
temp <- main0 %>%
    select(Mixture_type, Sample, Taxon, Observed, Actual) %>%
    mutate(Error = Observed / Actual) %>%
    group_by(Mixture_type) %>%
    nest %>%
    mutate(
        Error_matrix = map(data, build_matrix, 
            Sample, Taxon, Error, fill = NaN),
        Estimate = map(Error_matrix, center, enframe = TRUE),
        Bootreps = map(Error_matrix, bootrep_center, R = 4e3)
    )
est <- temp %>%
    unnest(Estimate) %>%
    rename(Bias = Center)
bootreps <- temp %>%
    unnest(Bootreps)
stats <- bootreps %>%
    group_by(Mixture_type, Taxon) %>%
    summarize(gm_mean = gm_mean(Center), gm_se = gm_sd(Center)) %>%
    ungroup
bias <- left_join(est, stats, by = c("Mixture_type", "Taxon"))
rm(temp)

Check the results.

bias %>%
    print(n = Inf)

## # A tibble: 21 x 5
##    Mixture_type Taxon                     Bias gm_mean gm_se
##    <chr>        <chr>                    <dbl>   <dbl> <dbl>
##  1 Cells        Atopobium_vaginae        0.285   0.285  1.04
##  2 Cells        Gardnerella_vaginalis    0.160   0.160  1.05
##  3 Cells        Lactobacillus_crispatus  2.29    2.28   1.03
##  4 Cells        Lactobacillus_iners      4.68    4.68   1.02
##  5 Cells        Prevotella_bivia         1.79    1.79   1.04
##  6 Cells        Sneathia_amnii           4.59    4.59   1.04
##  7 Cells        Streptococcus_agalactiae 0.250   0.250  1.03
##  8 DNA          Atopobium_vaginae        1.06    1.06   1.03
##  9 DNA          Gardnerella_vaginalis    0.403   0.403  1.05
## 10 DNA          Lactobacillus_crispatus  0.536   0.536  1.04
## 11 DNA          Lactobacillus_iners      2.31    2.31   1.02
## 12 DNA          Prevotella_bivia         0.392   0.393  1.03
## 13 DNA          Sneathia_amnii           2.39    2.39   1.04
## 14 DNA          Streptococcus_agalactiae 2.01    2.01   1.02
## 15 PCR_product  Atopobium_vaginae        1.03    1.03   1.04
## 16 PCR_product  Gardnerella_vaginalis    0.817   0.818  1.05
## 17 PCR_product  Lactobacillus_crispatus  0.907   0.907  1.03
## 18 PCR_product  Lactobacillus_iners      1.19    1.19   1.05
## 19 PCR_product  Prevotella_bivia         0.927   0.926  1.05
## 20 PCR_product  Sneathia_amnii           1.30    1.30   1.03
## 21 PCR_product  Streptococcus_agalactiae 0.911   0.911  1.04

bias <- select(bias, -gm_mean)

Add the bias estimate, predicted proportions, and residual compositional error

main0 <- main0 %>%
    left_join(bias, by = c("Mixture_type", "Taxon")) %>%
    mutate_by(Sample, 
        Predicted = close_elts(Actual * Bias),
        Residual = Observed / Predicted
    )

And also get the predicted ratios,

ratios.pred <- bias %>%
    select(Mixture_type, Taxon, Bias) %>%
    compute_ratios(group_vars = c("Mixture_type")) %>%
    rename(Predicted = Bias) %>%
    mutate(Pair = paste(Taxon.x, Taxon.y, sep = ":"))

Ratio errors are explained by the bias (Supplemental Figure SBrooksRatios)

# Put predictions underneath the geom layer showing the observed errors
p.pw +
    geom_point(data = ratios.pred %>% filter(Pair %in% cmb),
        aes(Pair, Predicted), inherit.aes = FALSE,
        shape = 3, size = 4, color = "black") +
    geom_quasirandom() +
    labs(color = "# species\nin mixture", y = "Observed / Actual") +
    base_theme +
    tax_theme +
    theme(legend.position = "right",
        plot.margin = margin(l = 0.2, unit = "in")
    )

Get the prediction of our measurements given the estimated bias. For comparison, also get the prediction under no bias and under copy-number bias,

pred <- main0 %>%
    mutate(`No bias` = 1) %>%
    rename(`Copy-number bias` = CN_bias, `Estimated bias` = Bias) %>%
    gather("Bias_type", "Bias", 
        `No bias`, `Copy-number bias`, `Estimated bias`) %>%
    mutate_by(c(Sample, Bias_type), Predicted = close_elts(Actual * Bias)) %>%
    mutate(Bias_type = factor(Bias_type, 
            c("No bias", "Copy-number bias", "Estimated bias")))

Check that the predictions accounting for bias reduce the error by various proportion-based (non-compositional) measures.

error <- pred %>%
    group_by(Mixture_type, Bias_type) %>%
    summarize(
        MSE.prop = mean((Observed - Predicted)^2),
        MSE.logit = mean((logit(Observed) - logit(Predicted))^2),
        RMSE.logit = sqrt(mean( (logit(Observed) - logit(Predicted))^2 )),
        )
error

## # A tibble: 9 x 5
## # Groups:   Mixture_type [3]
##   Mixture_type Bias_type        MSE.prop MSE.logit RMSE.logit
##   <chr>        <fct>               <dbl>     <dbl>      <dbl>
## 1 Cells        No bias           0.0853     3.49        1.87 
## 2 Cells        Copy-number bias  0.0859     3.25        1.80 
## 3 Cells        Estimated bias    0.00106    0.0506      0.225
## 4 DNA          No bias           0.0419     1.11        1.06 
## 5 DNA          Copy-number bias  0.0198     0.546       0.739
## 6 DNA          Estimated bias    0.00136    0.0423      0.206
## 7 PCR_product  No bias           0.00585    0.120       0.346
## 8 PCR_product  Copy-number bias  0.00585    0.120       0.346
## 9 PCR_product  Estimated bias    0.00323    0.0696      0.264

Summary for the manuscript: The proportions predicted from the fitted model reduced the mean squared error by 98.8%.

Also check the reduction in compositional error using the Aitchison distance,

pred %>%
    group_by(Mixture_type, Bias_type, Sample) %>%
    summarize(Adist = anorm(Observed / Predicted)) %>%
    summarize(Adist = mean(Adist))

## # A tibble: 9 x 3
## # Groups:   Mixture_type [3]
##   Mixture_type Bias_type        Adist
##   <chr>        <fct>            <dbl>
## 1 Cells        No bias          1.77 
## 2 Cells        Copy-number bias 1.70 
## 3 Cells        Estimated bias   0.178
## 4 DNA          No bias          0.995
## 5 DNA          Copy-number bias 0.702
## 6 DNA          Estimated bias   0.179
## 7 PCR_product  No bias          0.297
## 8 PCR_product  Copy-number bias 0.297
## 9 PCR_product  Estimated bias   0.235

Plot the log-odds with mean-squared errors of the log-odds proportion,

p <- ggplot(pred, aes(logit(Predicted), logit(Observed), color = Taxon)) +
    geom_abline(intercept = 0, slope = 1, color = "grey") +
    geom_jitter(width = 0.1, height = 0) +
    geom_rangeframe(color = "black") + 
    facet_grid(Mixture_type ~ Bias_type) +
    base_theme +
    labs(x = "log-odds(Predicted proportion)", 
        y = "log-odds(Observed proportion)") +
    coord_fixed() +
    scale_color_manual(values = colors.taxon, labels = tax_labeller) + 
    theme(
        panel.spacing.x = unit(1, "lines"),
        legend.position = "bottom",
    ) + 
    geom_text(data = error, 
        aes(x = 2.5, y = -4, 
            label = paste("MSE:", round(MSE.logit, 2))),
        color = "black")
p

Main text 4-panel figure

These are the taxa used in the second row of the main figure.

plot_taxa <- c("Gardnerella_vaginalis", "Lactobacillus_crispatus",
    "Lactobacillus_iners", "Sneathia_amnii", "Streptococcus_agalactiae")

Observed proportion (odds) vs. expected (Upper left)
Observed proportion vs. model prediction (upper right)
Fold error in proportion (lower left)
Fold error in ratios (lower right)

Top row: Observed vs. expected proportions before and after model fitting

# Tibble for plotting
pred0 <- pred %>%
    filter(Mixture_type == "Cells", Bias_type == "Estimated bias")
# Pick a fixed xy range to use in the top row
range.top <- c(0, 1)
# for y=x ref line
ref_line = tibble(x = range.top[1], y = range.top[1], 
        xend = range.top[2], yend = range.top[2])
# for (original) formatting the x-axis labels in left panel
xtb <- tibble(labels = c("0", "1/7", "1/4", "1/3", "1/2", "1")) %>%
    rowwise() %>%
    mutate(breaks = eval(parse(text = labels)))
# For setting the rangeframe axes to run from 0 to 1 instead of giving the data
# range
rftb <- tibble(Actual = c(0, 1), Observed = c(0, 1), Predicted = c(0, 1))
# Panel A
p.A <- ggplot(pred0, aes(Actual, Observed, color = Taxon)) +
    geom_segment(data=ref_line, aes(x=x, xend=xend, y=y, yend=yend), 
        colour="grey") +
    # geom_abline(slope = 1, intercept = 0, color = "grey") + 
    geom_quasirandom() +
    geom_rangeframe(data = rftb, color = "black") +
    scale_color_manual(values = colors.taxon, labels = tax_labeller) + 
    scale_x_continuous(limits = range.top,
        breaks = xtb$breaks, labels = xtb$labels) +
    # scale_x_continuous(limits = range.top, 
    #     breaks = c(0, 0.5, 1), labels = c("0", "0.5", "1")) +
    scale_y_continuous(limits = range.top, 
        breaks = c(0, 0.5, 1), labels = c("0", "0.5", "1")) +
    labs(x = "Actual", y = "Observed",
        title = "Observed proportion vs. actual") +
    base_theme
# Panel B
p.B <- ggplot(pred0, aes(Predicted, Observed, 
        color = fct_reorder(Taxon, -Observed))) +
    geom_segment(data=ref_line, aes(x=x, xend=xend, y=y, yend=yend), 
        colour="grey") +
    geom_point() +
    geom_rangeframe(data = rftb, color = "black") +
    scale_color_manual(values = colors.taxon, labels = tax_labeller) + 
    scale_x_continuous(limits = range.top, 
        breaks = c(0, 0.5, 1), labels = c("0", "0.5", "1")) +
    scale_y_continuous(limits = range.top, 
        breaks = c(0, 0.5, 1), labels = c("0", "0.5", "1")) +
    labs(x = "Model prediction", y = "Observed", color = "Taxon",
        title = "Observed proportion vs. model prediction") +
    base_theme

Bottom row: Error in taxon proportions and error in taxon ratios. To give both plots the same error (y-axis) scale, we odds transform the proportions.

# Ratios, for just five of the seven taxa to simplify the plot:
plot_pairs <- combn(plot_taxa, 2, simplify = FALSE) %>% 
    map_chr( ~paste(.[1], .[2], sep = ":"))
ratios0 <- ratios %>% 
    filter(Mixture_type == "Cells", Pair %in% plot_pairs)
ratios0.pred <- ratios.pred %>%
    filter(Mixture_type == "Cells", Pair %in% plot_pairs)
# Add shape keys for figure legend
ratios0 <- ratios0 %>%
    mutate(Shape = "Observed")
ratios0.pred <- ratios0.pred %>%
    mutate(Shape = "Predicted")
# Odds transformed proportions
pred0.odds <- pred0 %>%
    mutate_at(vars(Observed, Actual, Predicted), odds)
# Choose y-axis range
pred0.odds %>%
    {range(.$Observed / .$Actual)}

## [1]  0.02874036 32.32727273

range.bot <- c(0.02,33)
# Panel C: Error in proportions (odds transformed)
p.C <- ggplot(pred0.odds, aes(x = Taxon, y = Observed / Actual,
            color = as.factor(Num_species))) +
    geom_hline(yintercept = 1) +
    geom_quasirandom() +
    geom_rangeframe(color = "black", sides = "l") +
    labs(title = "Fold error in taxon proportions (odds)",
        y = "Observed / Actual", 
        color = "# species\nin mixture") +
    scale_y_log10(limits = range.bot, breaks = c(0.02, 0.1, 1, 10, 30), 
        labels = log_formatter) +
    scale_x_discrete(labels = tax_labeller) + 
    scale_color_manual(values = colors.num_species) +
    base_theme +
    tax_theme
p.D <- ggplot(ratios0, aes(Pair, Error, color = as.factor(Num_species), 
        shape = Shape)) +
    # Reference line
    geom_hline(yintercept = 1) +
    # Prediction from bias estimate
    geom_point(data = ratios0.pred,
        aes(Pair, Predicted),
        size = 4, color = "black") +
    # Observed
    # TODO: figure out why the points are smaller in just this panel
    geom_quasirandom(size = 1.4) +
    # Other stuff
    geom_rangeframe(sides = "l", color= "black") + 
    # scale_y_log10(breaks = c(0.02, 0.1, 1, 10, 20),
    #     labels = log_formatter) +
    scale_y_log10(limits = range.bot, breaks = c(0.02, 0.1, 1, 10, 30), 
        labels = log_formatter) +
    scale_x_discrete(labels = tax_labeller) + 
    scale_shape_manual(name = "", values = c(16, 3)) +
    scale_color_manual(values = colors.num_species) +
    labs(title = "Fold error in taxon ratios with model prediction",
        y = "Observed / Actual", 
        color = "# species\nin mixture") +
    base_theme +
    tax_theme

Make the plot!

l.A <- get_legend(p.B + theme(legend.position = "right"))
l.D <- get_legend(p.D + theme(legend.position = "right"))

row1 <- plot_grid(p.A, p.B,
    labels = c("A", "B"), align = "hv") %>%
    plot_grid(l.A, rel_widths = c(1, 0.14))
row2 <- plot_grid(p.C, p.D, 
    labels = c("C", "D"), align = "hv") %>%
    plot_grid(l.D, rel_widths = c(1, 0.14))
plot_grid(row1, row2, ncol=1, rel_heights = c(1, 1.1))

Fraction of squared Aitchison error due to bias

err <- main0 %>%
    group_by(Mixture_type, Sample) %>%
    summarize(
        Error2 = anorm(Observed / Actual)^2,
        Bias2 = anorm(Bias)^2,
        Resid2 = anorm(Observed / (Actual * Bias))^2
        ) %>%
    summarize_at(vars(-Sample), sum)
err <- err %>% 
    mutate(Frac_Bias = Bias2 / Error2, Frac_Resid = Resid2 / Error2)
err

## # A tibble: 3 x 6
##   Mixture_type Error2  Bias2 Resid2 Frac_Bias Frac_Resid
##   <chr>         <dbl>  <dbl>  <dbl>     <dbl>      <dbl>
## 1 Cells        267.   263.     3.70     0.986     0.0139
## 2 DNA           87.0   83.4    3.65     0.958     0.0419
## 3 PCR_product    9.18   3.47   5.71     0.378     0.622

Bias of each step in the workflow

Compute Bias for each step:

bias_steps <- bias %>% 
    select(Mixture_type, Taxon, Bias) %>%
    spread(Mixture_type, Bias) %>%
    mutate(Extraction = Cells / DNA,
        PCR = DNA / PCR_product,
        Sequencing = PCR_product
    )
bias_steps

## # A tibble: 7 x 7
##   Taxon                 Cells   DNA PCR_product Extraction   PCR Sequencing
##   <chr>                 <dbl> <dbl>       <dbl>      <dbl> <dbl>      <dbl>
## 1 Atopobium_vaginae     0.285 1.06        1.03       0.268 1.03       1.03 
## 2 Gardnerella_vaginalis 0.160 0.403       0.817      0.396 0.493      0.817
## 3 Lactobacillus_crispa… 2.29  0.536       0.907      4.27  0.591      0.907
## 4 Lactobacillus_iners   4.68  2.31        1.19       2.03  1.94       1.19 
## 5 Prevotella_bivia      1.79  0.392       0.927      4.55  0.423      0.927
## 6 Sneathia_amnii        4.59  2.39        1.30       1.92  1.85       1.30 
## 7 Streptococcus_agalac… 0.250 2.01        0.911      0.124 2.20       0.911

Add copy-number correction as a hypothetical final step in the bioinformatics pipeline. CN correction involves dividing by the CN for each taxon, and thus has a bias given by the reciprocal of the CNs.

cn_corr <- species_info %>% 
    transmute(Taxon, CN_correction = center_elts(1 / Copy_number))
bias_steps <- left_join(bias_steps, cn_corr, by = c("Taxon")) %>%
    mutate(Total = Extraction * PCR * Sequencing * CN_correction)

Gather back into tidy form

bias_steps <- bias_steps %>%
    gather("Type", "Bias", -Taxon)

Get standard errors from the bootreps,

bootreps0 <- bootreps %>%
    spread(Mixture_type, Center) %>%
    mutate(
        Extraction = Cells / DNA,
        PCR = DNA / PCR_product,
        Sequencing = PCR_product
    )
ses <- bootreps0 %>%
    gather("Type", "Bias", -.id, -Taxon) %>%
    group_by(Taxon, Type) %>%
    summarize(gm_se = gm_sd(Bias)) %>%
    ungroup

Join Bias w/ the standard errors:

bias_steps <- left_join(bias_steps, ses, by = c("Type", "Taxon"))

Because CN correction is deterministic, the geometric standard error is 1.

bias_steps <- bias_steps %>%
    mutate(gm_se = ifelse(Type == "CN_correction", 1, gm_se))

Create the 2-panel figure for the main text, showing the bias estimates in the left panel and the predicted composition through the workflow in the right panel.

# We will order the taxa by the in the legend by which are observed as most
# abundant in an actual even mixture; this will be the taxa with the largest
# "Total" bias.
lvls.taxon <- bias_steps %>%
    filter(Type == "Total") %>%
    arrange(Bias) %>%
    .$Taxon
# Order for the steps (left panel)
lvls.step <- c("Extraction", "PCR", "Sequencing", "CN_correction")
labs.step <- c("Extraction", "PCR", "Seq. + Inf.", "CN correction")
names(labs.step) <- c("Extraction", "PCR", "Sequencing", "CN_correction")
# Labels for the positions (right panel)
# labs.position <- c("Actual", "Post extraction", "Post PCR", "Reads", 
#             "CN corrected")
labs.position <- c("Cells", "DNA", "PCR product", "Reads", "Reads / CN")
names(labs.position) <- paste0("T", 0:4)
# Tibble of the relative abundances through the workflow (right panel)
ra_steps <- bias_steps %>%
    select(Taxon, Type, Bias) %>%
    spread(Type, Bias) %>%
    transmute(Taxon,
        T0 = 1,
        T1 = T0 * Extraction, 
        T2 = T1 * PCR, 
        T3 = T2 * Sequencing,
        T4 = T3 * CN_correction,
        ) %>%
    mutate(Taxon = fct_reorder(Taxon, T4)) %>%
    gather("Position", "Abundance", -Taxon)
# shared params
# ylimits <- ra_steps$Abundance %>% range
ylimits <- c(0.1, 6)
xexpand <- c(0.1, 0.1)
## Left panel
# Tibble for reference line
ref.left <- tibble(Step = factor(lvls.step, lvls.step), Bias = 1)
# make plot
p.left <- bias_steps %>% 
    filter(Type %in% lvls.step) %>%
    mutate(
        Step = factor(Type, lvls.step),
        Taxon = factor(Taxon, levels(ra_steps$Taxon))
        ) %>%
    ggplot(aes(Step, Bias, color = Taxon)) +
    geom_path(data = ref.left, aes(group = Bias), color = "grey") +
    # geom_quasirandom(width = 0.2) +
    geom_pointrange(aes(ymin = Bias / gm_se^2, ymax = Bias * gm_se^2),
        fatten = 0.5,
        position = position_dodge(width = 0.3)) +
    scale_y_log10(limits = ylimits, labels = log_formatter) +
    scale_x_discrete(expand = xexpand,labels = labs.step) +
    geom_rangeframe(color = "black") +
    scale_color_manual(values = colors.taxon, labels = tax_labeller) + 
    labs(x = "Step", y = "Efficiency / gm. mean", 
        title = "Bias of each step") +
    guides(color = guide_legend(reverse = TRUE)) +
    base_theme +
    theme(axis.text.x = element_text(size=8, family = ""))
## Right panel
# Tibble for reference line
ref.right <- tibble(Position = names(labs.position), Abundance = 1)
# make plot
p.right <- ra_steps %>%
    ggplot(aes(Position, Abundance, color = Taxon)) +
    geom_path(data = ref.right, aes(group = Abundance), color = "grey") +
    geom_path(aes(group = Taxon)) +
    geom_point() +
    scale_y_log10(limits = ylimits, breaks = c(0.1, 0.3, 1, 3, 10), 
        labels = log_formatter) +
    scale_x_discrete(labels = labs.position, expand = xexpand) +
    geom_rangeframe(color = "black") +
    scale_color_manual(values = colors.taxon, labels = tax_labeller) + 
    # labs(x = "Position in the workflow", y = "Abundance / gm. mean", 
    labs(x = "Biological substance", y = "Abundance / gm. mean", 
        title = "Composition through the workflow") +
    guides(color = guide_legend(reverse = TRUE)) +
    base_theme +
    theme(axis.text.x = element_text(size=8, family = ""))
l <- get_legend(p.right + theme(legend.position = c(0.45, 0.55), 
        plot.margin = margin(t=5, b=5)))

plot_grid(
    # p.left + theme(plot.margin = margin(l=5, t=5, b=5)),
    # p.right + theme(plot.margin = margin(t=5, b=5)),
    p.left,
    p.right,
    l,
    labels = c("A", "B", NULL),
    nrow = 1, rel_widths = c(1, 1, 0.3))

Bias versus 16S CN

How much PCR bias and total bias is explained by 16S CN variation?

Note: The “Total” variable in bias_steps includes CN correction; here we are interested in the total bias observed in the cell mixtures without CN correction being applied.

For this analysis I’m going to ignore the uncertainty in the estimate of the total bias and the PCR bias, since the standard errors are small relative to the magnitude of the bias,

bias_steps %>%
    filter(Type == "Cells")

## # A tibble: 7 x 4
##   Taxon                    Type   Bias gm_se
##   <chr>                    <chr> <dbl> <dbl>
## 1 Atopobium_vaginae        Cells 0.285  1.04
## 2 Gardnerella_vaginalis    Cells 0.160  1.05
## 3 Lactobacillus_crispatus  Cells 2.29   1.03
## 4 Lactobacillus_iners      Cells 4.68   1.02
## 5 Prevotella_bivia         Cells 1.79   1.04
## 6 Sneathia_amnii           Cells 4.59   1.04
## 7 Streptococcus_agalactiae Cells 0.250  1.03

bias_steps %>%
    filter(Type == "PCR")

## # A tibble: 7 x 4
##   Taxon                    Type   Bias gm_se
##   <chr>                    <chr> <dbl> <dbl>
## 1 Atopobium_vaginae        PCR   1.03   1.05
## 2 Gardnerella_vaginalis    PCR   0.493  1.08
## 3 Lactobacillus_crispatus  PCR   0.591  1.05
## 4 Lactobacillus_iners      PCR   1.94   1.06
## 5 Prevotella_bivia         PCR   0.423  1.06
## 6 Sneathia_amnii           PCR   1.85   1.05
## 7 Streptococcus_agalactiae PCR   2.20   1.04

First let’s get a table with all the various bias estimates as well as CN and genome size, with elements centered (divided by the geometric mean over taxa).

tb <- bias_steps %>%
    select(-gm_se) %>%
    spread(Type, Bias) %>%
    left_join(species_info, by = "Taxon") %>%
    select(Taxon, Cells, PCR, Copy_number, Genome_size) %>%
    mutate_if(is.numeric, center_elts)
tb

## # A tibble: 7 x 5
##   Taxon                    Cells   PCR Copy_number Genome_size
##   <chr>                    <dbl> <dbl>       <dbl>       <dbl>
## 1 Atopobium_vaginae        0.285 1.03        0.568       0.837
## 2 Gardnerella_vaginalis    0.160 0.493       0.568       0.954
## 3 Lactobacillus_crispatus  2.29  0.591       1.14        1.19 
## 4 Lactobacillus_iners      4.68  1.94        1.42        0.742
## 5 Prevotella_bivia         1.79  0.423       1.14        1.46 
## 6 Sneathia_amnii           4.59  1.85        0.852       0.773
## 7 Streptococcus_agalactiae 0.250 2.20        1.99        1.26

Next, we compute the predicted bias based on copy-number variation. The expected PCR bias is given by the CN per bp for the PCR step, and by the CN per genome for the total bias measured in the cell mixtures.

tb <- tb %>%
    mutate(
        Cells.pred = Copy_number, 
        PCR.pred = Copy_number / Genome_size,
        Cells.resid = Cells / Cells.pred,
        PCR.resid = PCR / PCR.pred
    )

How much of the bias is explained by CN variation? We will quantify the bias and the residual bias by the squared error as measured by the Aitchison norm, and compute a coefficient of determination quantifying how much of the bias is explained by the predicted bias from CN.

an2 <- tb %>%
    summarize_if(is.numeric, ~anorm(.)^2) %>% 
    unlist %>%
    as.list

The coefficient of determination for CN variation as an explanation of PCR bias is

1 - an2$PCR.resid / an2$PCR

## [1] 0.5998252

and of total bias is

1 - an2$Cells.resid / an2$Cells

## [1] 0.0991634

These numbers are equivalant to computing the standard R^2 measure on the log-transformed bias vectors. E.g.,

tb0 <- tb %>%
    mutate_if(is.numeric, log)
mu <- mean(tb0$PCR)
1 - sum((tb0$PCR - tb0$PCR.pred)^2) / sum((tb0$PCR - mu)^2)

## [1] 0.5998252

Supplemental figure plotting bias against copy number:

p.pcr <- tb %>%
    mutate(Taxon.abbrev = tax_abbrev(Taxon)) %>%
    ggplot(aes(Copy_number / Genome_size, PCR, label = Taxon.abbrev)) +
    geom_abline() +
    geom_text() +
    scale_x_log10(expand = expand_scale(mult = 0.1)) +
    scale_y_log10() +
    geom_rangeframe() +
    coord_fixed() +
    labs(title = "PCR bias vs. copy number", y = "Efficiency / gm. mean", 
        x = "16S copies per bp / gm. mean") +
    base_theme
p.tot <- tb %>%
    mutate(Taxon.abbrev = tax_abbrev(Taxon)) %>%
    ggplot(aes(Copy_number, Cells, label = Taxon.abbrev)) +
    geom_abline() +
    geom_text() +
    scale_x_log10(expand = expand_scale(mult = 0.25)) +
    scale_y_log10() +
    geom_rangeframe() +
    coord_fixed() +
    labs(title = "Total bias vs. copy number", y = "Efficiency / gm. mean", 
        x = "16S copies per genome / gm. mean") +
    base_theme
plot_grid(p.pcr, p.tot, nrow = 1, labels = c("A", "B"), rel_widths = c(1, 1))

Permutation tests

I use a permutation test to see whether the fraction of the bias explained by copy number could be due to chance, for PCR bias and the total (Cells) bias. In particular, I compute the p-value as the fraction of permutations of the predicted bias that gives a squared error (residual bias, measured by the Aitchison norm) less than or equal to the observed error,

K <- nrow(tb)
perms <- gtools::permutations(K, K)
# the residual bias we observe
pcr_obs <- anorm(tb$PCR / tb$PCR.pred)^2
cells_obs <- anorm(tb$Cells / tb$Cells.pred)^2
# the residual bias for each permutation
pcr_perms  <- apply(perms, 1, 
    function (x) { anorm(tb$PCR / tb$PCR.pred[x])^2 })
cells_perms  <- apply(perms, 1, 
    function (x) { anorm(tb$Cells / tb$Cells.pred[x])^2 })
# the p values
pcr_pval <- mean(pcr_perms <= pcr_obs) %>% round(3)
cells_pval <- mean(cells_perms <= cells_obs) %>% round(3)

p.pcr <- qplot(pcr_perms) + 
    geom_vline(xintercept = pcr_obs) +
    labs(title = paste0("PCR bias vs. copy number; p = ", pcr_pval), 
        x = "Squared Aitchison error")
p.cells <- qplot(cells_perms) + 
    geom_vline(xintercept = cells_obs) +
    labs(title = paste0("Total bias vs. copy number; p = ", cells_pval), 
        x = "Squared Aitchison error")
plot_grid(p.pcr, p.cells)

Table with estimated bias and summary statistics

Compute pairwise-error summary statistics

Mixture experiments: Bias and noise

Get the ratios with the estimated bias and the residuals,

ratios1 <- main0 %>%
    select(Sample, Taxon, Mixture_type, Observed, Actual, Error, Bias,
        Residual) %>%
    mutate(Taxon = tax_abbrev(Taxon)) %>%
    compute_ratios(group_vars = c("Mixture_type", "Sample")) %>%
    filter(!is.na(Actual)) %>%
    mutate(Pair = paste(Taxon.x, Taxon.y, sep = ":"))

Compute pairwise error summary statistics:

avg_errors <- ratios1 %>%
    # Take the geometric absolute values of the errors
    mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
    # Average the errors for each pair of different taxa
    filter(Taxon.x != Taxon.y) %>%
    group_by(Mixture_type, Taxon.x, Taxon.y) %>%
    summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
    # Average over pairs of taxa for each mixture type
    group_by(Mixture_type) %>%
    summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
    rename_at(vars(Error, Bias, Residual), paste0, ".avg")
max_errors <- ratios1 %>%
    # Take the geometric absolute values of the errors
    mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
    # Find the max fold error in a ratio for each mixture type
    group_by(Mixture_type) %>%
    summarize_at(vars(Error, Bias, Residual), gm_range) %>%
    rename_at(vars(Error, Bias, Residual), paste0, ".max")
pw_summary <- bind_cols(avg_errors, max_errors) %>%
    select(Type = Mixture_type, Bias.max, Bias.avg, Residual.avg)
pw_summary

## # A tibble: 3 x 4
##   Type        Bias.max Bias.avg Residual.avg
##   <chr>          <dbl>    <dbl>        <dbl>
## 1 Cells          29.3      5.57         1.21
## 2 DNA             6.10     2.65         1.19
## 3 PCR_product     1.59     1.22         1.27

Workflow steps: Bias only

Similarly, compute pairwise bias summary statistics for each protocol step

bias_steps.ratios <- bias_steps %>%
    filter(Type %in% c("Extraction", "PCR", "Sequencing")) %>%
    select(Type, Taxon, Bias) %>%
    compute_ratios(group_vars = c("Type"))
avg_bias_steps <- bias_steps.ratios %>%
    mutate_at(vars(Bias), gm_abs) %>%
    filter(Taxon.x != Taxon.y) %>%
    group_by(Type, Taxon.x, Taxon.y) %>%
    summarize_at(vars(Bias), gm_mean) %>%
    group_by(Type) %>%
    summarize_at(vars(Bias), gm_mean) %>%
    rename_at(vars(Bias), paste0, ".avg")
max_bias_steps <- bias_steps.ratios %>%
    mutate_at(vars(Bias), gm_abs) %>%
    group_by(Type) %>%
    summarize_at(vars(Bias), gm_range) %>%
    rename_at(vars(Bias), paste0, ".max")
pw_summary_steps <- bind_cols(avg_bias_steps, max_bias_steps) %>%
    select(Type, Bias.max, Bias.avg)
pw_summary_steps

## # A tibble: 3 x 3
##   Type       Bias.max Bias.avg
##   <chr>         <dbl>    <dbl>
## 1 Extraction    36.6      5.53
## 2 PCR            5.21     2.32
## 3 Sequencing     1.59     1.22

Combine with bias estimates in a single table

# Function to create string for formatting numbers with glue::glue()
fmt_string <- function(var, digits) {
    paste0("{format(", var, ", digits = ", digits, ")}")
}
bias_tab <- bias_steps %>%
    select(-gm_se) %>%
    mutate(Bias = glue::glue(fmt_string("Bias", 1))) %>%
    mutate_all(as.character) %>%
    spread(Type, Bias) %>%
    select(Taxon, Cells, DNA, PCR_product, Extraction, PCR, Sequencing)
pw_summary_tab <- bind_rows(pw_summary, pw_summary_steps) %>%
    mutate(
        Bias.max = glue::glue(fmt_string("Bias.max", 2)),
        Bias.avg = glue::glue(fmt_string("Bias.avg", 2)),
        Residual.avg = glue::glue(fmt_string("Residual.avg", 2))
        ) %>%
    mutate_all(as.character) %>%
    gather("Taxon", "Value", -Type) %>%
    spread(Type, Value) %>%
    mutate_all(~ifelse(str_detect(., "NA"), "---", .))
# Order taxa from highest to lowest efficiency in the cell mixtures
taxa.ranked <- bias %>%
    filter(Mixture_type == "Cells") %>%
    arrange(-Bias) %>%
    {.$Taxon}
lvls.taxon <- c(taxa.ranked, "Bias.max", "Bias.avg", "Residual.avg")
lbls.taxon <- c(taxa.ranked, "Max pairwise bias", 
    "Avg. pairwise bias", "Avg. pairwise noise")
bias_tab0 <- bind_rows(bias_tab, pw_summary_tab) %>%
    mutate(
        Taxon = factor(Taxon, lvls.taxon, lbls.taxon),
        ) %>%
    arrange(Taxon)
bias_tab0

## # A tibble: 10 x 7
##    Taxon               Cells DNA    PCR_product Extraction PCR   Sequencing
##    <fct>               <chr> <chr>  <chr>       <chr>      <chr> <chr>     
##  1 Lactobacillus_iners 4.7   2.3    1.2         2.0        1.9   1.2       
##  2 Sneathia_amnii      4.6   2.4    1.3         1.9        1.8   1.3       
##  3 Lactobacillus_cris… 2.3   0.5    0.9         4.3        0.6   0.9       
##  4 Prevotella_bivia    1.8   0.4    0.9         4.6        0.4   0.9       
##  5 Atopobium_vaginae   0.3   1.1    1.0         0.3        1.0   1.0       
##  6 Streptococcus_agal… 0.2   2.0    0.9         0.1        2.2   0.9       
##  7 Gardnerella_vagina… 0.2   0.4    0.8         0.4        0.5   0.8       
##  8 Max pairwise bias   29.3  " 6.1" " 1.6"      36.6       " 5.… " 1.6"    
##  9 Avg. pairwise bias  5.6   2.7    1.2         5.5        2.3   1.2       
## 10 Avg. pairwise noise 1.2   1.2    1.3         ---        ---   ---

Latex version for the manuscript:

tex <- bias_tab0 %>%
    rename(`PCR prod.` = PCR_product, `Seq. + Inf.` = Sequencing) %>%
    mutate(
        Taxon = kableExtra::cell_spec(
            str_replace(Taxon, "_", " "), 
            "latex", 
            italic = Taxon %in% taxa)
        ) %>%
    knitr::kable(format="latex", booktabs = TRUE, linesep = "",
        escape = FALSE, align = c("l", rep("r", 6))) %>%
    kableExtra::add_header_above(c(" ", "Mixtures" = 3, 
            "Steps" = 3)) %>%
    kableExtra::row_spec(length(taxa), extra_latex_after = "\\midrule")
# tex
# clipr::write_clip(tex)

Calibration

To illustrate how one can calibrate samples with compositions different from those used to measure bias, we estimate bias from just the 7-species samples and use it to correct all samples.

Estimate bias as before,

bias.7sp <- main0 %>%
    filter(Num_species == 7) %>%
    select(Mixture_type, Sample, Taxon, Observed, Actual) %>%
    mutate(Error = Observed / Actual) %>%
    group_by(Mixture_type) %>%
    nest %>%
    mutate(
        Error_matrix = map(data, build_matrix, 
            Sample, Taxon, Error, fill = NaN),
        Estimate = map(Error_matrix, center, enframe = TRUE),
    ) %>%
    unnest(Estimate) %>%
    rename(Bias_7sp = Center)

Compare the two bias estimates:

left_join(bias, bias.7sp, by = c("Mixture_type", "Taxon")) %>%
    filter(Mixture_type == "Cells")

## # A tibble: 7 x 5
##   Mixture_type Taxon                     Bias gm_se Bias_7sp
##   <chr>        <chr>                    <dbl> <dbl>    <dbl>
## 1 Cells        Atopobium_vaginae        0.285  1.04    0.377
## 2 Cells        Gardnerella_vaginalis    0.160  1.05    0.223
## 3 Cells        Lactobacillus_crispatus  2.29   1.03    2.10 
## 4 Cells        Lactobacillus_iners      4.68   1.02    4.02 
## 5 Cells        Prevotella_bivia         1.79   1.04    1.38 
## 6 Cells        Sneathia_amnii           4.59   1.04    4.23 
## 7 Cells        Streptococcus_agalactiae 0.250  1.03    0.240

Get calibrated compositions from both:

cal <- main0 %>%
    left_join(bias.7sp, by = c("Mixture_type", "Taxon")) %>%
    mutate_by(Sample,
        Calibrated_all = close_elts(Observed / Bias),
        Calibrated_7sp = close_elts(Observed / Bias_7sp)
    ) %>%
    gather("Estimate_type", "Estimate", 
        Observed, Calibrated_all, Calibrated_7sp) %>%
    mutate(Estimate_type = factor(Estimate_type, 
            c("Observed", "Calibrated_all", "Calibrated_7sp")))

Check the reduction in error in proportions,

ggplot(cal, aes(Taxon, odds(Estimate) / odds(Actual), 
        color = as.factor(Num_species))) +
    geom_quasirandom() +
    scale_y_log10() + 
    facet_grid(Mixture_type ~ Estimate_type) +
    scale_color_manual(values = colors.num_species) +
    tax_theme

As expected, the improvement isn’t as great but is still substantial for the cell and DNA mixtures. In the cell mixtures, there seems to be a systematic underestimate of G. vaginalis and A. vaginae and a corresponding over-estimate of others such as P bivia. This could be due to chance over-representation of Gv and Av in the 7sp mixtures, e.g. due to sample construction error.

Check that calibration reduces the error in the proportions on the samples not used to estimate bias:

cal_error.props <- cal %>%
    filter(Num_species < 7) %>%
    group_by(Mixture_type, Estimate_type) %>%
    summarize(
        MSE.prop = mean((Estimate - Actual)^2),
        MSE.logit = mean((logit(Estimate) - logit(Actual))^2),
        RMSE.logit = sqrt(mean( (logit(Estimate) - logit(Actual))^2 )),
        )
cal_error.props

## # A tibble: 9 x 5
## # Groups:   Mixture_type [3]
##   Mixture_type Estimate_type  MSE.prop MSE.logit RMSE.logit
##   <chr>        <fct>             <dbl>     <dbl>      <dbl>
## 1 Cells        Observed        0.0906     3.59        1.90 
## 2 Cells        Calibrated_all  0.00237    0.0450      0.212
## 3 Cells        Calibrated_7sp  0.00666    0.139       0.373
## 4 DNA          Observed        0.0445     1.14        1.07 
## 5 DNA          Calibrated_all  0.00212    0.0447      0.211
## 6 DNA          Calibrated_7sp  0.00229    0.0480      0.219
## 7 PCR_product  Observed        0.00619    0.121       0.348
## 8 PCR_product  Calibrated_all  0.00345    0.0693      0.263
## 9 PCR_product  Calibrated_7sp  0.00494    0.0986      0.314

Also check the reduction in the average Bray-Curtis dissimilarity and Aitchison distance to the actual compositions after calibration:

cal_error.dist <- cal %>%
    filter(Num_species < 7) %>%
    group_by(Mixture_type, Estimate_type, Sample) %>%
    summarize(
        Dist.BC = xydist(Estimate, Actual, method = "bray"),
        Dist.Ai = xydist(Estimate, Actual, method = "aitchison", trim = TRUE)
    ) %>%
    summarize_at(vars(Dist.BC, Dist.Ai), mean)
cal_error.dist

## # A tibble: 9 x 4
## # Groups:   Mixture_type [3]
##   Mixture_type Estimate_type  Dist.BC Dist.Ai
##   <chr>        <fct>            <dbl>   <dbl>
## 1 Cells        Observed        0.348    1.73 
## 2 Cells        Calibrated_all  0.0467   0.166
## 3 Cells        Calibrated_7sp  0.0830   0.305
## 4 DNA          Observed        0.243    0.966
## 5 DNA          Calibrated_all  0.0479   0.175
## 6 DNA          Calibrated_7sp  0.0499   0.181
## 7 PCR_product  Observed        0.0792   0.285
## 8 PCR_product  Calibrated_all  0.0620   0.225
## 9 PCR_product  Calibrated_7sp  0.0752   0.272

Summary for the manuscript: Calibration (using just the 7-species samples) reduced the mean squared error of the proportions in the calibrated (non-7-species) samples by 92.6% and the average Bray-Curtis dissimilarity between the actual and observed compositions from 0.35 to 0.08.

Model comparisons

Starting point for fitting the linear models. For fitting the linear models on the proportions, keep single-species samples and rows where Actual == 0. Will still set Observed == 0 for taxa not supposed to be in the samples (i.e., remove cross-sample contamination).

main1 <- main %>%
    mutate_by(Sample, 
        Observed = close_elts(Count * Expected),
        Actual = close_elts(Expected)) %>%
    select(-Count, -Expected) %>%
    select(Sample, Taxon, Observed, Actual, everything())

Simple linear regression on the proportions

Simple linear model of Actual ~ Observed, without and with intercept term. Each taxon is fit independently.

fits.slm <- main1 %>%
    group_by(Mixture_type, Taxon) %>%
    nest %>%
    mutate(
        fit0 = map(data, ~lm(Observed ~ 0 + Actual, data = .)),
        fit1 = map(data, ~lm(Observed ~ 1 + Actual, data = .)),
        )

The fits in fit0 and fit1 are without and with an intercept term.

preds.slm <- fits.slm %>%
    gather("Model", "Fit", fit0, fit1) %>%
    unnest(data, map(Fit, broom::augment))
ggplot(preds.slm, aes(.fitted, Observed, color = Taxon)) +
    geom_quasirandom() +
    facet_grid(Mixture_type ~ Model) +
    scale_color_manual(values = colors.taxon, labels = tax_labeller) + 
    labs(c = "Model prediction")

The fits are very similar, so we’ll use the simpler model (with no intercept term) from here on.

preds.slm <- preds.slm %>%
    filter(Model == "fit0")

Brooks2015 model

The model is decribed in the Methods section “Experimental design andmixture effect models” of Brooks et al 2015. For a given taxon in a given mixture experiment, the model is specified by the formula

fmla <- as.formula(paste(
        "Observed ~ 0 + (", 
        paste(taxa, collapse = "+"),
        ")^3"))
fmla

## Observed ~ 0 + (Atopobium_vaginae + Gardnerella_vaginalis + Lactobacillus_crispatus + 
##     Lactobacillus_iners + Prevotella_bivia + Sneathia_amnii + 
##     Streptococcus_agalactiae)^3

where Observed is the observed proportion of the given taxon and the taxon names on the right-hand side (the predictors) are the actual proportions of the taxa. To fit this model, we need a data frame with each row corresponding to a given taxon’s observed abundance in a sample, and the actual abundances of all 7 taxa.

actual <- main1 %>%
    select(Sample, Taxon, Actual) %>%
    spread(Taxon, Actual)
tb <- main1 %>%
    left_join(actual, by = "Sample")
tb %>% glimpse

## Observations: 1,680
## Variables: 16
## $ Sample                   [3m[90m<chr>[39m[23m "s1-1", "s1-1", "s1-1", "s1-1", "s1-1",…
## $ Taxon                    [3m[90m<chr>[39m[23m "Atopobium_vaginae", "Gardnerella_vagin…
## $ Observed                 [3m[90m<dbl>[39m[23m 0.00000000, 0.00000000, 0.60167254, 0.0…
## $ Actual                   [3m[90m<dbl>[39m[23m 0.0000000, 0.0000000, 0.3333333, 0.0000…
## $ Plate                    [3m[90m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Barcode                  [3m[90m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 10, 10, 10, 10, 10…
## $ Mixture_type             [3m[90m<chr>[39m[23m "Cells", "Cells", "Cells", "Cells", "Ce…
## $ Num_species              [3m[90m<int>[39m[23m 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Species_list             [3m[90m<chr>[39m[23m "Lactobacillus_crispatus;Prevotella_biv…
## $ Atopobium_vaginae        [3m[90m<dbl>[39m[23m 0.0000000, 0.0000000, 0.0000000, 0.0000…
## $ Gardnerella_vaginalis    [3m[90m<dbl>[39m[23m 0.0000000, 0.0000000, 0.0000000, 0.0000…
## $ Lactobacillus_crispatus  [3m[90m<dbl>[39m[23m 0.3333333, 0.3333333, 0.3333333, 0.3333…
## $ Lactobacillus_iners      [3m[90m<dbl>[39m[23m 0.0000000, 0.0000000, 0.0000000, 0.0000…
## $ Prevotella_bivia         [3m[90m<dbl>[39m[23m 0.3333333, 0.3333333, 0.3333333, 0.3333…
## $ Sneathia_amnii           [3m[90m<dbl>[39m[23m 0.0000000, 0.0000000, 0.0000000, 0.0000…
## $ Streptococcus_agalactiae [3m[90m<dbl>[39m[23m 0.3333333, 0.3333333, 0.3333333, 0.3333…

We again fit this model with lm for each experiment and taxon, and extract the predictions for plotting.

fits.brooks <- tb %>%
    group_by(Mixture_type, Taxon) %>%
    nest %>%
    mutate( fit = map(data, ~lm(fmla, data = .)) )
preds.brooks <- fits.brooks %>%
    unnest(data, map(fit, broom::augment))

Plot with all three models

Finally, we’ll plot the fits from the simple linear model (no intercept), the Brooks model, and our model. We will restrict to just the samples with at least two species and where the Actual proportion is > 0 in order to filter most of the cases where the linear models prediction proportions that are outside of the [0, 1] interval.

preds <- left_join(
    preds.slm %>% select(Mixture_type, Taxon, Sample, Observed, Actual,
        Num_species, Species_list, SLM = .fitted),
    preds.brooks %>% select(Taxon, Sample, Brooks = .fitted),
    by = c("Sample", "Taxon")
    ) %>%
    left_join(
        main0 %>% select(Taxon, Sample, Ours = Predicted),
        by = c("Sample", "Taxon")
    ) %>%
    gather("Model", "Predicted", SLM, Brooks, Ours) %>%
    filter(Actual > 0, Num_species > 1) %>%
    mutate(
        Model = factor(Model, c("SLM", "Brooks", "Ours")),
        Model = fct_recode(Model, 
            `Simple linear model` = "SLM",
            `Brooks et al model` = "Brooks",
            `Our model` = "Ours",
        )
    )
ggplot(preds, aes(logit(Predicted), logit(Observed), color = Taxon)) +
    geom_abline(intercept = 0, slope = 1, color = "grey") +
    geom_jitter(width = 0.1, height = 0) +
    geom_rangeframe(color = "black") + 
    facet_grid(Mixture_type ~ Model) +
    base_theme +
    labs(x = "log-odds(Predicted proportion)", 
        y = "log-odds(Observed proportion)") +
    coord_fixed() +
    scale_color_manual(values = colors.taxon, labels = tax_labeller) + 
    theme(
        panel.spacing.x = unit(1, "lines"),
        legend.position = "bottom",
    )

## Warning in log(x): NaNs produced

## Warning in log(x): NaNs produced

## Warning in log(x): NaNs produced

## Warning: Removed 2 rows containing missing values (geom_point).

The few remaining NaNs arise during the logit transform of the few remaining cases where the Brooks model predicts a proportion less than zero.

preds %>%
    filter(Predicted < 0) %>%
    select(-Species_list)

## # A tibble: 2 x 8
##   Mixture_type Taxon    Sample Observed Actual Num_species Model  Predicted
##   <chr>        <chr>    <chr>     <dbl>  <dbl>       <int> <fct>      <dbl>
## 1 Cells        Gardner… s1-23    0.0141  0.143           7 Brook…  -0.00238
## 2 Cells        Gardner… s2-14    0.0222  0.143           7 Brook…  -0.00238

Session info

sessionInfo()

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Arch Linux
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas.so.3.8.0
## LAPACK: /usr/lib/liblapack.so.3.8.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] BiasManuscript_0.0.0.9000 metacal_0.1.0            
##  [3] ggbeeswarm_0.6.0          cowplot_0.9.99           
##  [5] ggthemes_4.2.0            forcats_0.4.0            
##  [7] stringr_1.4.0             dplyr_0.8.3              
##  [9] purrr_0.3.2               readr_1.3.1              
## [11] tidyr_0.8.3               tibble_2.1.3             
## [13] ggplot2_3.2.0             tidyverse_1.2.1          
## [15] here_0.1                  rmarkdown_1.13           
## [17] nvimcom_0.9-82            usethis_1.5.0            
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-140      fs_1.3.1          lubridate_1.7.4  
##  [4] devtools_2.0.2    webshot_0.5.1     httr_1.4.0       
##  [7] rprojroot_1.3-2   useful_1.2.6      tools_3.6.1      
## [10] backports_1.1.4   utf8_1.1.4        R6_2.4.0         
## [13] vipor_0.4.5       lazyeval_0.2.2    colorspace_1.4-1 
## [16] withr_2.1.2       tidyselect_0.2.5  prettyunits_1.0.2
## [19] processx_3.3.1    compiler_3.6.1    cli_1.1.0        
## [22] rvest_0.3.4       xml2_1.2.0        desc_1.2.0       
## [25] labeling_0.3      scales_1.0.0      callr_3.2.0      
## [28] digest_0.6.20     pkgconfig_2.0.2   htmltools_0.3.6  
## [31] sessioninfo_1.1.1 rlang_0.4.0       readxl_1.3.1     
## [34] rstudioapi_0.10   generics_0.0.2    jsonlite_1.6     
## [37] gtools_3.8.1      magrittr_1.5      kableExtra_1.1.0 
## [40] Rcpp_1.0.1        munsell_0.5.0     fansi_0.4.0      
## [43] stringi_1.4.3     yaml_2.2.0        MASS_7.3-51.4    
## [46] pkgbuild_1.0.3    plyr_1.8.4        grid_3.6.1       
## [49] crayon_1.3.4      lattice_0.20-38   haven_2.1.0      
## [52] hms_0.4.2         zeallot_0.1.0     knitr_1.23       
## [55] ps_1.3.0          pillar_1.4.2      igraph_1.2.4.1   
## [58] reshape2_1.4.3    codetools_0.2-16  pkgload_1.0.2    
## [61] glue_1.3.1        evaluate_0.14     remotes_2.0.4    
## [64] modelr_0.1.4      vctrs_0.2.0       testthat_2.1.1   
## [67] cellranger_1.1.0  gtable_0.3.0      assertthat_0.2.1 
## [70] xfun_0.7          broom_0.5.2       viridisLite_0.3.0
## [73] beeswarm_0.2.3    memoise_1.1.0     ellipsis_0.2.0.1

Brooks2015 analysis

Michael McLaren

2019-07-09