R setup

Run with SAVE_FIGURES = TRUE to save figures in figures/.

SAVE_FIGURES = TRUE

Libraries and paths

library(here)
library(tidyverse)
library(ggthemes)
library(cowplot); theme_set(theme_grey()) 
library(ggbeeswarm)
# Functions for bias estimation and calibration
library(metacal); packageVersion("metacal")
## [1] '0.1.0'
# This package
devtools::load_all(here())

Options for ggplot:

base_theme <- theme_tufte() + 
    theme(
        text = element_text(size=9, family = ""),
        strip.text = element_text(size=9, family = ""),
        # axis.text = element_text(size=8, family = ""),
        legend.position = "none"
        )
tax_theme <- theme(
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), 
        axis.title.x = element_blank())

update_geom_defaults("point", list(size = 1))
update_geom_defaults("text", list(size = 2.5))
update_geom_defaults("hline", list(color = "grey"))
update_geom_defaults("abline", list(color = "grey"))
# Colors for color scales
colors.OkabeIto <-  tribble(
    ~Name, ~Hex,
    "orange", "#E69F00",
    "sky blue", "#56B4E9",
    "bluish green", "#009E73",
    # "yellow", "#F0E442",
    "blue", "#0072B2",
    "vermilion", "#D55E00",
    "reddish purple", "#CC79A7",
    "black", "#000000",
    # "grey", "#999999",
    "yellow", "#F0E442",
    )
colors.taxon <- c(
    "Atopobium_vaginae" = "#009E73",
    "Gardnerella_vaginalis" = "#56B4E9",
    "Lactobacillus_crispatus" = "#D55E00",
    # "Lactobacillus_iners" = "#000000",
    "Lactobacillus_iners" = "#505050",
    "Prevotella_bivia" = "#0072B2",
    "Sneathia_amnii" = "#CC79A7",
    "Streptococcus_agalactiae" = "#E69F00") 
colors.num_species <- c(
    "2" = "#E69F00",
    "3" = "#56B4E9",
    "4" = "#CC79A7",
    # "7" = "#000000"
    "7" = "#505050"
    )
# Width of figures in elife latex template pdf when width=\fullwidth is used
# elife_width = 5.626

Load and inspect the data and prep for analysis

Data from the Brooks2015 supplement; for details and code used to obtain this data, see data-raw/brooks2015.R.

data("brooks2015_sample_data")
data("brooks2015_counts")
data("brooks2015_species_info")

Sample data

The key sample metadata variables are the Mixture type (or experiment), the number of species in the mixture, and the species in each mixture.

sam <- brooks2015_sample_data
print(sam, n = 10)
## # A tibble: 240 x 6
##    Sample Plate Barcode Mixture_type Num_species Species_list              
##    <chr>  <dbl>   <dbl> <chr>              <int> <chr>                     
##  1 s1-1       1       1 Cells                  3 Lactobacillus_crispatus;P…
##  2 s1-2       1       2 Cells                  3 Gardnerella_vaginalis;Ato…
##  3 s1-3       1       3 Cells                  3 Gardnerella_vaginalis;Ato…
##  4 s1-4       1       4 Cells                  3 Atopobium_vaginae;Lactoba…
##  5 s1-5       1       5 Cells                  2 Gardnerella_vaginalis;Pre…
##  6 s1-6       1       6 Cells                  3 Atopobium_vaginae;Lactoba…
##  7 s1-7       1       7 Cells                  2 Atopobium_vaginae;Strepto…
##  8 s1-8       1       8 Cells                  2 Gardnerella_vaginalis;Str…
##  9 s1-9       1       9 Cells                  3 Atopobium_vaginae;Sneathi…
## 10 s1-10      1      10 Cells                  3 Atopobium_vaginae;Lactoba…
## # … with 230 more rows

There are 80 samples for each of the three mixture types:

sam %>%
    group_by(Mixture_type) %>%
    summarize(Num_samples = n())
## # A tibble: 3 x 2
##   Mixture_type Num_samples
##   <chr>              <int>
## 1 Cells                 80
## 2 DNA                   80
## 3 PCR_product           80

We’ll ultimately just be using the 71 of 80 that contain two or more species,

sam %>%
    group_by(Mixture_type) %>%
    filter(Num_species > 1) %>%
    summarize(Num_samples = n())
## # A tibble: 3 x 2
##   Mixture_type Num_samples
##   <chr>              <int>
## 1 Cells                 71
## 2 DNA                   71
## 3 PCR_product           71

Distinct compositions:

sam %>%
    select(Mixture_type, Num_species, Species_list) %>%
    distinct() %>%
    group_by(Mixture_type) %>%
    filter(Num_species > 1) %>%
    summarize(Num_samples = n())
## # A tibble: 3 x 2
##   Mixture_type Num_samples
##   <chr>              <int>
## 1 Cells                 58
## 2 DNA                   58
## 3 PCR_product           58

Count data

The count data includes reads from Brooks2015’s above- and below-threshold tables, specifying reads that were classified above or below a 97% sequence identity threshold to 16S sequences in their database.

print(brooks2015_counts, n = 10)
## # A tibble: 3,840 x 4
##    Sample Taxon                   Table Count
##    <chr>  <chr>                   <chr> <dbl>
##  1 s1-1   Atopobium_vaginae       above     1
##  2 s1-1   Atopobium_vaginae       below     2
##  3 s1-1   Gardnerella_vaginalis   above     1
##  4 s1-1   Gardnerella_vaginalis   below     0
##  5 s1-1   Lactobacillus_crispatus above 13670
##  6 s1-1   Lactobacillus_crispatus below   540
##  7 s1-1   Lactobacillus_iners     above     0
##  8 s1-1   Lactobacillus_iners     below     0
##  9 s1-1   Other                   above     7
## 10 s1-1   Other                   below    12
## # … with 3,830 more rows

The vast majority of reads were classified above threshold,

brooks2015_counts %>%
    group_by(Table) %>%
    summarize(Sum = sum(Count)) %>%
    mutate(Proportion = Sum / sum(Sum))
## # A tibble: 2 x 3
##   Table     Sum Proportion
##   <chr>   <dbl>      <dbl>
## 1 above 3668922     0.935 
## 2 below  253078     0.0645

In preparing this dataframe, we have grouped all reads classified to non-mock taxa as “Other”. The vast majority of above and below threshold reads were classified to the mock taxa,

brooks2015_counts %>%
    group_by(Table, Taxon != "Other") %>%
    summarize(Sum = sum(Count)) %>%
    mutate(Proportion = Sum / sum(Sum))
## # A tibble: 4 x 4
## # Groups:   Table [2]
##   Table `Taxon != "Other"`     Sum Proportion
##   <chr> <lgl>                <dbl>      <dbl>
## 1 above FALSE                  733   0.000200
## 2 above TRUE               3668189   1.000   
## 3 below FALSE                 2279   0.00901 
## 4 below TRUE                250799   0.991

We will follow Brooks2015 in using the above threshold reads only, and will restrict to just the mock taxa. We will also create a new dataframe that will be our main data frame for our analysis going forward.

main <- brooks2015_counts %>%
    filter(Table == "above", Taxon != "Other") %>%
    select(Sample, Taxon, Count)

Let’s add the sample data,

main <- main %>%
    left_join(sam, by = "Sample")

and a column for whether the species is expected to be in the sample.

main <- main %>%
    mutate(Expected = str_detect(Species_list, Taxon))

Species info

Estimated genome size and 16S copy number, obtained in analysis/brooks2015-species-info.Rmd.

species_info <- brooks2015_species_info
taxa <- species_info$Taxon
species_info
## # A tibble: 7 x 3
##   Taxon                    Genome_size Copy_number
##   <chr>                          <dbl>       <dbl>
## 1 Atopobium_vaginae           1440070.           2
## 2 Gardnerella_vaginalis       1642448.           2
## 3 Lactobacillus_crispatus     2043161            4
## 4 Lactobacillus_iners         1277649            5
## 5 Prevotella_bivia            2521238            4
## 6 Sneathia_amnii              1330224            3
## 7 Streptococcus_agalactiae    2160267            7

Expected copy-number bias for the three experiments:

cnbias <- species_info %>%
    expand(Mixture_type = sam$Mixture_type, 
        nesting(Taxon, Genome_size, Copy_number)) %>%
    mutate(CN_bias = case_when(
            Mixture_type == "Cells" ~ Copy_number,
            Mixture_type == "DNA" ~ Copy_number / Genome_size,
            Mixture_type == "PCR_product" ~ 1,
        )
    ) %>%
    mutate_by(Mixture_type, CN_bias = center_elts(CN_bias))
print(cnbias, n = Inf)
## # A tibble: 21 x 5
##    Mixture_type Taxon                    Genome_size Copy_number CN_bias
##    <chr>        <chr>                          <dbl>       <dbl>   <dbl>
##  1 Cells        Atopobium_vaginae           1440070.           2   0.568
##  2 Cells        Gardnerella_vaginalis       1642448.           2   0.568
##  3 Cells        Lactobacillus_crispatus     2043161            4   1.14 
##  4 Cells        Lactobacillus_iners         1277649            5   1.42 
##  5 Cells        Prevotella_bivia            2521238            4   1.14 
##  6 Cells        Sneathia_amnii              1330224            3   0.852
##  7 Cells        Streptococcus_agalactiae    2160267            7   1.99 
##  8 DNA          Atopobium_vaginae           1440070.           2   0.679
##  9 DNA          Gardnerella_vaginalis       1642448.           2   0.595
## 10 DNA          Lactobacillus_crispatus     2043161            4   0.957
## 11 DNA          Lactobacillus_iners         1277649            5   1.91 
## 12 DNA          Prevotella_bivia            2521238            4   0.775
## 13 DNA          Sneathia_amnii              1330224            3   1.10 
## 14 DNA          Streptococcus_agalactiae    2160267            7   1.58 
## 15 PCR_product  Atopobium_vaginae           1440070.           2   1    
## 16 PCR_product  Gardnerella_vaginalis       1642448.           2   1    
## 17 PCR_product  Lactobacillus_crispatus     2043161            4   1    
## 18 PCR_product  Lactobacillus_iners         1277649            5   1    
## 19 PCR_product  Prevotella_bivia            2521238            4   1    
## 20 PCR_product  Sneathia_amnii              1330224            3   1    
## 21 PCR_product  Streptococcus_agalactiae    2160267            7   1

Inspecting and removing cross-sample contaminant reads

Many samples contain reads that are assigned to taxa not supposed to be in the mixture.

main %>%
    group_by(Sample, Expected) %>%
    summarize(Sum = sum(Count)) %>%
    ungroup %>%
    filter(!Expected) %>%
    summarize(Frac = mean(Sum > 0)) %>%
    {paste("The fraction of samples with out-of-sample reads is", .$Frac)}
## [1] "The fraction of samples with out-of-sample reads is 0.97008547008547"
main %>%
    group_by(Mixture_type, Num_species, Sample, Expected) %>%
    summarize(Sum = sum(Count)) %>%
    mutate_by(Sample, Prop = Sum / sum(Sum)) %>%
    filter(!Expected) %>%
    ggplot(aes(Prop, fill = as.factor(Num_species))) +
    geom_dotplot() +
    scale_x_log10() +
    facet_grid(Mixture_type~.) +
    labs(x = "Fraction of out-of-sample reads", fill = "# species")
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 7 rows containing non-finite values (stat_bindot).

Note, no 7-species samples appear as their fraction is always zero. The fraction of out-of-sample reads can be quite high in the DNA mixtures and a few PCR mixtures. But the fraction is still generally quite low, and we suspect most to all cases are due to cross-sample contamination during handling and sequencing (e.g., index/barcode hopping), and not sample mislabeling or mis-construction.

Prep for main analysis

Our main analysis will use Observed and Actual variables, with the following simplifications. Our downstream analysis requires that the samples only have reads from taxa actually in the samples. We will therefore remove the out-of-sample reads and treat the actual compositions as an even mixture of the expected taxa. We will also ignore the extra precision information in the observed read counts, and treat Observed as well as Actual as compositional vectors. We will further filter to samples with more than one species. We will generally store relative abundances as proportions for plotting purposes (we use the close_elts function within samples to convert compositional vectors to vectors of proportions).

main0 <- main %>%
    filter(Expected, Num_species > 1) %>%
    mutate_by(Sample, 
        Observed = close_elts(Count),
        Actual = close_elts(Expected),
        Error = Observed / Actual) %>%
    select(-Count, -Expected) %>%
    select(Sample, Taxon, Observed, Actual, everything())

The Observed and Actual compositions are normalized to proportions for plotting in the next section.

Add copy-number bias for each taxon + experiment pair:

main0 <- main0 %>%
    left_join(cnbias %>% select(Mixture_type, Taxon, CN_bias),
        by = c("Mixture_type", "Taxon"))
print(main0, n = 10)
## # A tibble: 591 x 11
##    Sample Taxon Observed Actual Plate Barcode Mixture_type Num_species
##    <chr>  <chr>    <dbl>  <dbl> <dbl>   <dbl> <chr>              <int>
##  1 s1-1   Lact…   0.602   0.333     1       1 Cells                  3
##  2 s1-1   Prev…   0.332   0.333     1       1 Cells                  3
##  3 s1-1   Stre…   0.0663  0.333     1       1 Cells                  3
##  4 s1-10  Atop…   0.0391  0.333     1      10 Cells                  3
##  5 s1-10  Lact…   0.392   0.333     1      10 Cells                  3
##  6 s1-10  Snea…   0.569   0.333     1      10 Cells                  3
##  7 s1-11  Gard…   0.0154  0.333     1      11 Cells                  3
##  8 s1-11  Lact…   0.337   0.333     1      11 Cells                  3
##  9 s1-11  Lact…   0.647   0.333     1      11 Cells                  3
## 10 s1-12  Atop…   0.0616  0.5       1      12 Cells                  2
## # … with 581 more rows, and 3 more variables: Species_list <chr>,
## #   Error <dbl>, CN_bias <dbl>

View the errors

Let’s take a look at the observed vs. actual proportions, taking a log-odds transform to avoid compressing the variation near p=0 and p=1.

main0 %>%
    mutate_at(vars(Actual, Observed), logit) %>%
    ggplot(aes(Actual, Observed, color = Taxon)) + 
    geom_quasirandom() +
    facet_grid(Mixture_type ~ .) +
    geom_rangeframe(sides = "bl", color= "black") +
    scale_color_manual(values = colors.taxon, labels = tax_labeller) + 
    labs(title = "Observed vs. Actual (log-od