R setup

Run with SAVE_FIGURES = TRUE to save figures in figures/.

SAVE_FIGURES = TRUE

Libraries and paths:

library(here)
library(tidyverse)
library(ggthemes)
library(cowplot)
library(ggbeeswarm)
# Functions for bias estimation and calibration
library(metacal); packageVersion("metacal")
## [1] '0.1.0'
# This package
devtools::load_all(here())

Plot setup:

# Options for `ggplot`:
base_theme <- theme_tufte() + 
    theme(
        text = element_text(size=9, family = ""),
        strip.text = element_text(size=9, family = ""),
        legend.position = "none"
        )
tax_theme <- theme(
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), 
        axis.title.x = element_blank())
update_geom_defaults("point", list(size = 1))
update_geom_defaults("text", list(size = 2.5))
update_geom_defaults("hline", list(color = "grey"))
update_geom_defaults("vline", list(color = "grey"))
# Color-blind friendly colors from ColorBrewer2
colors.protocol <- c(
    H = "#1b9e77", 
    W = "#d95f02", 
    Q = "#7570b3", 
    Actual = "black")
# Labeller for facets when faceting by protocol
labeller.protocol <- labeller(
    Protocol = function(p) paste("Protocol", p))
labeller.protocol.ref <- labeller(
    Protocol = function(p) {
        focal <- str_sub(p, 1, 1)
        ref <- str_sub(p, 2, 2)
        paste("Protocol", focal, "/", ref)
    }
)

Data setup

Sample metadata:

data("costea2017_sample_data")
sam <- costea2017_sample_data
sam
## # A tibble: 29 x 5
##    Sample Protocol Individual SI_sample Run_accession
##    <chr>  <chr>    <chr>      <chr>     <chr>        
##  1 Q1     Q        1          BYQ_AAAB  ERR1971004   
##  2 H1     H        1          BYQ_AAAM  ERR1971015   
##  3 W1     W        1          BYQ_AAAV  ERR1971024   
##  4 Q2     Q        2          BYQ_AAAF  ERR1971008   
##  5 H2     H        2          BYQ_AAAQ  ERR1971019   
##  6 W2     W        2          BYQ_AAAZ  ERR1971028   
##  7 Q3     Q        3          BYQ_AAAA  ERR1971003   
##  8 H3     H        3          BYQ_AAAL  ERR1971014   
##  9 W3     W        3          BYQ_AAAU  ERR1971023   
## 10 Q4     Q        4          BYQ_AAAE  ERR1971007   
## # … with 19 more rows

Expected mock community composition, obtained from the supplementary data (see data-raw/costea2017.R)

data("costea2017_mock_composition")
mock <- costea2017_mock_composition

The FACS measurements are the “bacterial cells” columns; the measurements without endospores were used as the actual composition, and replicate measurements were averaged (personal communication with Paul Costea);

mock <- mock %>%
    select(Taxon, FACS = "bacterial cells in spike in Mix") %>%
    group_by(Taxon) %>%
    summarize_at(vars(FACS), mean)
mock <- mock %>%
    arrange(desc(FACS))
mock
## # A tibble: 10 x 2
##    Taxon                            FACS
##    <chr>                           <dbl>
##  1 Prevotella_melaninogenica   91361186.
##  2 Clostridium_perfringens     73342343.
##  3 Salmonella_enterica         58450204.
##  4 Clostridium_difficile       42383205.
##  5 Lactobacillus_plantarum     22566221.
##  6 Vibrio_cholerae             10065143.
##  7 Clostridium_saccharolyticum  9212814.
##  8 Yersinia_pseudotuberculosis  3950160.
##  9 Blautia_hansenii              459883.
## 10 Fusobacterium_nucleatum       204756.

Note that the order of three species (V. cholerae, C. saccharolyticum, and Y. pseudotuberculosis) differ from that of Costea2017 Figure 6. We reproduce their figure below, showing that these species are likely mislabeled in their figure and not in the mock table. But a mislabeling would here not affect our main conclusions.

Load the Metaphlan2 profiles:

data("costea2017_metaphlan2_profiles")
profiles <- costea2017_metaphlan2_profiles
profiles %>% corner
## # A tibble: 5 x 5
##   Clade                         ERR1971003 ERR1971004 ERR1971005 ERR1971006
##   <chr>                              <dbl>      <dbl>      <dbl>      <dbl>
## 1 k__Archaea                       0.00101    0.00036    0.00018          0
## 2 k__Archaea|p__Euryarchaeota      0.00101    0.00036    0.00018          0
## 3 k__Archaea|p__Euryarchaeota|…    0.00101    0.00036    0.00018          0
## 4 k__Archaea|p__Euryarchaeota|…    0.00101    0.00036    0.00018          0
## 5 k__Archaea|p__Euryarchaeota|…    0.00101    0.00036    0.00018          0

The samples are named here by their ENA run accession; let’s rename to the new sample names.

new_names <- c("Clade", 
    sam$Sample[match(colnames(profiles)[-1], sam$Run_accession)])
colnames(profiles) <- new_names

Two samples, QA and QB, are two extra fecal samples that were only sequenced by protocol Q and so we won’t be using for our analysis. We also drop unneeded sample variables.

sam <- sam %>%
    filter(!(Sample %in% c("QA", "QB"))) %>%
    select(Sample, Protocol, Individual)
profiles <- profiles[, c("Clade", sam$Sample)]

Taxonomy from the metaphlan2 clade names:

tax_ranks <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus",
    "Species", "Strain")
rank_letters <- c("k", "p", "c", "o", "f", "g", "s", "t")
tax_pattern  <- paste0("(?:", rank_letters, "__(\\w+))?") %>%
    paste(collapse = "\\|?")
tax <- profiles %>%
    select(Clade) %>%
    extract(Clade, tax_ranks, regex = tax_pattern, remove = FALSE)
tax
## # A tibble: 1,868 x 9
##    Clade      Kingdom Phylum  Class  Order  Family  Genus Species  Strain  
##    <chr>      <chr>   <chr>   <chr>  <chr>  <chr>   <chr> <chr>    <chr>   
##  1 k__Archaea Archaea <NA>    <NA>   <NA>   <NA>    <NA>  <NA>     <NA>    
##  2 k__Archae… Archaea Euryar… <NA>   <NA>   <NA>    <NA>  <NA>     <NA>    
##  3 k__Archae… Archaea Euryar… Metha… <NA>   <NA>    <NA>  <NA>     <NA>    
##  4 k__Archae… Archaea Euryar… Metha… Metha… <NA>    <NA>  <NA>     <NA>    
##  5 k__Archae… Archaea Euryar… Metha… Metha… Methan… <NA>  <NA>     <NA>    
##  6 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… <NA>     <NA>    
##  7 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… Methano… <NA>    
##  8 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… Methano… GCF_000…
##  9 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… Methano… <NA>    
## 10 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… Methano… Methano…
## # … with 1,858 more rows

Species table with just the Metaphlan species identifiers

st <- profiles %>%
    left_join(tax, by = "Clade") %>%
    filter(!is.na(Species), is.na(Strain)) %>%
    select(-Strain)
st
## # A tibble: 718 x 35
##    Clade      Q1      H1      W1      Q2      H2      W2      Q3      H3
##    <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 k__A… 0.      0       0.      0       0       0       0       0      
##  2 k__A… 3.60e-4 0       0.      0.0830  0.0935  0.616   0.0009  0.00012
##  3 k__A… 0.      0       0.      0.0161  0       0       0       0      
##  4 k__A… 0.      0       0.      0       0       0       0.00011 0      
##  5 k__B… 0.      0       0.      0       0       0       0.00095 0      
##  6 k__B… 0.      0       0.      0.00795 0.00008 0.00088 0.0011  0      
##  7 k__B… 8.30e-4 0       0.      0       0       0       0       0      
##  8 k__B… 1.50e-4 0.00088 2.80e-4 0.00044 0.00008 0       0.00534 0.0018 
##  9 k__B… 2.30e-4 0       0.      0.00361 0       0.00036 0.00016 0.00008
## 10 k__B… 6.27e-3 0.00093 4.30e-4 0.00173 0       0.00018 0       0      
## # … with 708 more rows, and 26 more variables: W3 <dbl>, Q4 <dbl>,
## #   H4 <dbl>, W4 <dbl>, Q5 <dbl>, H5 <dbl>, W5 <dbl>, Q6 <dbl>, H6 <dbl>,
## #   W6 <dbl>, Q7 <dbl>, H7 <dbl>, W7 <dbl>, Q8 <dbl>, H8 <dbl>, W8 <dbl>,
## #   QM <dbl>, HM <dbl>, WM <dbl>, Kingdom <chr>, Phylum <chr>,
## #   Class <chr>, Order <chr>, Family <chr>, Genus <chr>, Species <chr>

Species names have the form Genus_species and are unique,

st$Species %>% head
## [1] "Methanobrevibacter_ruminantium"  "Methanobrevibacter_smithii"     
## [3] "Methanobrevibacter_unclassified" "Methanosphaera_stadtmanae"      
## [5] "Actinomyces_cardiffensis"        "Actinomyces_europaeus"
st$Species %>% anyDuplicated
## [1] 0

The Genus_unclassified catches cases where more reads were mapped to the genus’s markers than could be accounted for by species within the genus. We will include these for measuring the family-level compositions but not the spike-in compositions From here on, we will identify the Clade / Taxon with this species identifier,

st <- st %>%
    mutate(Clade = Species) %>%
    rename(Taxon = Clade)

Next, join the metadata and tidy it into a tall format (one sample-taxon observation per row)

st <- st %>%
    gather("Sample", "Abundance", sam$Sample) %>%
    left_join(sam, by = "Sample") %>%
    select(Taxon, Sample, Abundance, colnames(sam), everything())
st
## # A tibble: 19,386 x 12
##    Taxon Sample Abundance Protocol Individual Kingdom Phylum Class Order
##    <chr> <chr>      <dbl> <chr>    <chr>      <chr>   <chr>  <chr> <chr>
##  1 Meth… Q1      0        Q        1          Archaea Eurya… Meth… Meth…
##  2 Meth… Q1      0.00036  Q        1          Archaea Eurya… Meth… Meth…
##  3 Meth… Q1      0        Q        1          Archaea Eurya… Meth… Meth…
##  4 Meth… Q1      0        Q        1          Archaea Eurya… Meth… Meth…
##  5 Acti… Q1      0        Q        1          Bacter… Actin… Acti… Acti…
##  6 Acti… Q1      0        Q        1          Bacter… Actin… Acti… Acti…
##  7 Acti… Q1      0.00083  Q        1          Bacter… Actin… Acti… Acti…
##  8 Acti… Q1      0.000150 Q        1          Bacter… Actin… Acti… Acti…
##  9 Acti… Q1      0.00023  Q        1          Bacter… Actin… Acti… Acti…
## 10 Acti… Q1      0.00627  Q        1          Bacter… Actin… Acti… Acti…
## # … with 19,376 more rows, and 3 more variables: Family <chr>,
## #   Genus <chr>, Species <chr>

Metaphlan outputs estimates of the proportions * 100. We will renormalize to proportions summing to 1 instead.

st <- st %>%
    mutate_by(Sample, Abundance = close_elts(Abundance))
st %>%
    group_by(Sample) %>%
    summarize(sum(Abundance)) %>%
    {summary(.[,2])}
##  sum(Abundance)
##  Min.   :1     
##  1st Qu.:1     
##  Median :1     
##  Mean   :1     
##  3rd Qu.:1     
##  Max.   :1

Output from the various kingdoms:

st %>%
    group_by(Kingdom, Sample) %>%
    summarize(Abundance = sum(Abundance)) %>%
    spread(Kingdom, Abundance) %>%
    print(n=Inf)
## # A tibble: 27 x 5
##    Sample     Archaea Bacteria Eukaryota Viruses
##    <chr>        <dbl>    <dbl>     <dbl>   <dbl>
##  1 H1     0              0.931 0.0000845  0.0689
##  2 H2     0.000936       0.942 0.0000650  0.0569
##  3 H3     0.00000120     0.944 0.0000756  0.0564
##  4 H4     0              0.956 0.000103   0.0435
##  5 H5     0.000759       0.933 0.0000763  0.0664
##  6 H6     0.00000100     0.947 0.0000541  0.0526
##  7 H7     0              0.916 0.0000971  0.0842
##  8 H8     0.00000110     0.928 0.0000712  0.0715
##  9 HM     0              0.846 0.0000424  0.154 
## 10 Q1     0.00000360     0.932 0.0000544  0.0679
## 11 Q2     0.000992       0.978 0.0000190  0.0207
## 12 Q3     0.0000101      0.987 0.0000785  0.0124
## 13 Q4     0              0.956 0.0000910  0.0434
## 14 Q5     0.00102        0.945 0.0000424  0.0543
## 15 Q6     0              0.964 0.0000539  0.0355
## 16 Q7     0.00000180     0.953 0.0000460  0.0473
## 17 Q8     0              0.952 0.0000775  0.0478
## 18 QM     0.00000260     0.782 0.0000238  0.218 
## 19 W1     0              0.880 0.0000436  0.120 
## 20 W2     0.00616        0.915 0.0000884  0.0784
## 21 W3     0              0.933 0.0000661  0.0669
## 22 W4     0.00000230     0.927 0.0000828  0.0733
## 23 W5     0.00159        0.903 0.0000666  0.0958
## 24 W6     0.000000900    0.944 0.0000386  0.0561
## 25 W7     0.00000340     0.871 0.0000424  0.129 
## 26 W8     0              0.879 0.0000502  0.121 
## 27 WM     0              0.857 0.0000441  0.143

There is very little Archaea and Eukaryote abundance, and the vast majority of the viral abundance comes from a Salmonella phage that I suspect is integrated into the genome of the Salmonella enterica strain in the spike-in.

st %>%
    filter(Kingdom == "Viruses") %>%
    group_by(Taxon) %>% 
    summarize(Abundance = mean(Abundance)) %>%
    arrange(desc(Abundance))
## # A tibble: 5 x 2
##   Taxon                     Abundance
##   <chr>                         <dbl>
## 1 Salmonella_phage_Fels_1  0.0772    
## 2 Lactococcus_phage_P680   0.0000241 
## 3 C2likevirus_unclassified 0.0000128 
## 4 Lactobacillus_phage_PL_1 0.00000478
## 5 Dasheen_mosaic_virus     0.00000311

Bacteria form the vast majority of estimated non-viral abundance in each sample,

st %>%
    group_by(Kingdom, Sample) %>%
    summarize(Abundance = sum(Abundance)) %>%
    filter(Kingdom != "Viruses") %>%
    mutate_by(Sample, Abundance = close_elts(Abundance)) %>%
    filter(Kingdom == "Bacteria") %>%
    summarize_at("Abundance", list(Min = min, Mean = mean))
## # A tibble: 1 x 2
##     Min  Mean
##   <dbl> <dbl>
## 1 0.993 0.999

We will therefore restrict our analysis of sample composition to just Bacteria, renormalizing the abundances to sum to 1 after filtering.

st <- st %>%
    filter(Kingdom == "Bacteria") %>%
    mutate_by(Sample, Abundance = close_elts(Abundance))

Composition of the samples

Identifying the lab contaminant

Costea et al. report E. coli as a likely lab contaminant of the spike-in based on its appearance in the profiles of the mock-only samples (which they derived using mOTUs rather than Metaphlan2). So let’s check what is at appreciable frequency in the mock-only samples that shouldn’t be,

tb <- st %>%
    filter(
        Individual == "M", Abundance > 0, 
        !(Taxon %in% mock$Taxon)
        ) %>%
    group_by(Family, Taxon) %>%
    summarize(
        Mean = mean(Abundance),
        Max = max(Abundance),
        ) %>%
    filter(Max > 1e-4) %>%
    arrange(desc(Mean))
tb
## # A tibble: 9 x 4
## # Groups:   Family [4]
##   Family             Taxon                          Mean      Max
##   <chr>              <chr>                         <dbl>    <dbl>
## 1 Enterobacteriaceae Shigella_flexneri          0.0871   0.109   
## 2 Enterobacteriaceae Salmonella_unclassified    0.0134   0.0183  
## 3 Enterobacteriaceae Escherichia_coli           0.00394  0.00520 
## 4 Enterobacteriaceae Escherichia_unclassified   0.000764 0.000948
## 5 Xanthomonadaceae   Xanthomonas_perforans      0.000126 0.000146
## 6 Moraxellaceae      Psychrobacter_unclassified 0.000122 0.000208
## 7 Xanthomonadaceae   Xanthomonas_gardneri       0.000107 0.000128
## 8 Enterobacteriaceae Pantoea_unclassified       0.000101 0.000139
## 9 Prevotellaceae     Prevotella_copri           0.000100 0.000160

Here we see that most of the contaminant reads appear to be captured by Shigella_flexneri identifier. Given the closeness of S. flexnari with E. coli, the estimated abundance for Escherichia_coli may also be coming from the contaminant strain. Since S. flexnari is generally much larger,

st %>%
    filter(Taxon %in% c("Shigella_flexneri", "Escherichia_coli")) %>%
    group_by(Individual, Taxon) %>%
    summarize_at(vars(Abundance), mean) %>%
    spread(Taxon, Abundance) %>%
    mutate(Escherichia_coli / Shigella_flexneri)
## # A tibble: 9 x 4
## # Groups:   Individual [9]
##   Individual Escherichia_coli Shigella_flexneri `Escherichia_coli/Shigella…
##   <chr>                 <dbl>             <dbl>                       <dbl>
## 1 1                  0.00183             0.0363                      0.0504
## 2 2                  0.00220             0.0220                      0.0998
## 3 3                  0.00100             0.0206                      0.0487
## 4 4                  0.000954            0.0221                      0.0432
## 5 5                  0.00153             0.0309                      0.0497
## 6 6                  0.000894            0.0178                      0.0503
## 7 7                  0.00203             0.0369                      0.0549
## 8 8                  0.00181             0.0335                      0.0540
## 9 M                  0.00394             0.0871                      0.0452

We will treat Shigella_flexneri as the lab contaminant in our analysis,

contaminant <- "Shigella_flexneri"

Our analysis of the spike-in abundances below show that the ratio of Shigella_flexneri to other spike-in taxa is constant across samples, indicating that Shigella_flexneri reads in the fecal samples are primarily from the contaminant and not native gut strains.

Above, we also see appreciable frequency of Salmonella_unclassified, which is due to the genus-level estimated proportion for Salmonella being larger than the species-level estimates. Because the estimated proportions of Escherichia_coli, Escherichia_unclassified, and Salmonella_unclassified are all fairly small, they will not substantially affect the family-level composition plot (Figure 4A). and so we will leave it in as a “native” taxon for this analysis. However, because these abundances might be due to the spike-in rather than native gut taxa, we will filter them out before proceeding.

st0 <- st %>%
    filter(!(Taxon %in% c("Salmonella_unclassified", "Escherichia_coli",
                "Escherichia_unclassified"))) %>%
    mutate_by(Sample, Abundance = close_elts(Abundance))

Taxa by source (Mock, Contaminant, Native)

Classify the taxa by source type:

st0 <- st0 %>%
    mutate(Source = case_when(
            Taxon %in% mock$Taxon ~ "Mock", 
            Taxon == contaminant ~ "Contaminant",
            TRUE ~ "Native"
            )
        )

View the proportions of native, mock, and contaminant for each sample:

tb.source <- st0 %>%
    group_by(Sample, Protocol, Individual, Source) %>%
    summarize(Abundance = sum(Abundance)) %>%
    ungroup()
tb.source %>%
    spread(Source, Abundance) %>%
    print(n=Inf)
## # A tibble: 27 x 6
##    Sample Protocol Individual Contaminant   Mock  Native
##    <chr>  <chr>    <chr>            <dbl>  <dbl>   <dbl>
##  1 H1     H        1              0.0280  0.290  0.682  
##  2 H2     H        2              0.0259  0.263  0.711  
##  3 H3     H        3              0.0265  0.279  0.695  
##  4 H4     H        4              0.0178  0.179  0.803  
##  5 H5     H        5              0.0268  0.321  0.653  
##  6 H6     H        6              0.0205  0.228  0.752  
##  7 H7     H        7              0.0330  0.372  0.595  
##  8 H8     H        8              0.0309  0.353  0.616  
##  9 HM     H        M              0.0818  0.917  0.00170
## 10 Q1     Q        1              0.0247  0.196  0.780  
## 11 Q2     Q        2              0.00485 0.0317 0.963  
## 12 Q3     Q        3              0.00498 0.0276 0.967  
## 13 Q4     Q        4              0.0166  0.132  0.851  
## 14 Q5     Q        5              0.0197  0.150  0.830  
## 15 Q6     Q        6              0.0117  0.0834 0.905  
## 16 Q7     Q        7              0.0183  0.157  0.825  
## 17 Q8     Q        8              0.0151  0.111  0.874  
## 18 QM     Q        M              0.112   0.886  0.00211
## 19 W1     W        1              0.0574  0.830  0.112  
## 20 W2     W        2              0.0358  0.442  0.522  
## 21 W3     W        3              0.0307  0.417  0.552  
## 22 W4     W        4              0.0321  0.443  0.525  
## 23 W5     W        5              0.0468  0.598  0.356  
## 24 W6     W        6              0.0213  0.358  0.621  
## 25 W7     W        7              0.0605  0.769  0.170  
## 26 W8     W        8              0.0555  0.734  0.211  
## 27 WM     W        M              0.0726  0.926  0.00122

The contaminant is always a fairly small fraction of the total sample. It also generally appears in a consistent ratio with the mock for a given protocol:

tb.source %>%
    spread(Source, Abundance) %>%
    ggplot(aes(Sample, Contaminant / Mock, color = Protocol, 
            label = Individual)) +
    scale_color_manual(values = colors.protocol) + 
    geom_text() +
    coord_flip()

In samples Q2 and Q3, the spike-in makes up an unusually low fraction of the total (see table above), so the larger ratio in these samples may be due to chance. These observations suggest that the contaminant is being correctly identified.

Composition by source type:

p.source <- ggplot(tb.source,
    aes(y = Abundance, 
        x = Individual,
        # x = factor(Individual, lvls.mock_prop), 
        fill = Source)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(breaks = c(0, 0.5, 1), 
        labels = c("0", "0.5", "1")) +
    geom_rangeframe(data = tibble(y = c(0, 1), Protocol = "H"), 
        aes(y = y), sides = "l", inherit.aes = FALSE) +
    scale_fill_brewer(type = "qual") +
    facet_grid(. ~ Protocol, labeller = labeller.protocol) +
    labs(x = "Specimen", fill = "Taxon type", 
        title = "Composition of different bacterial types") +
    base_theme +
    theme(legend.position = "right")
p.source 

Native taxa composition

Get the native taxa agglomerated to the Family level,

nat <- st0 %>%
    filter(Source == "Native") %>%
    group_by_at(vars(Sample, Protocol, Individual, Kingdom:Family)) %>%
    summarize(Abundance = sum(Abundance)) %>%
    ungroup

Let’s restrict to Families within a minimal abundance to simplify the figure. First, lets check the prevalence of various families,

prev <- nat %>%
    filter(Individual != "M") %>%
    mutate_by(Sample, Proportion = close_elts(Abundance)) %>%
    group_by(Family) %>%
    summarize(Prev = sum(Proportion > 0), 
        Min = min(Proportion), 
        Median = median(Proportion), 
        Mean = mean(Proportion), 
        Max = max(Proportion))
prev %>%
    arrange(desc(Max)) %>%
    print(n=30)
## # A tibble: 98 x 6
##    Family                  Prev         Min     Median     Mean      Max
##    <chr>                  <int>       <dbl>      <dbl>    <dbl>    <dbl>
##  1 Bacteroidaceae            24 0.0412      0.227      0.228    0.496   
##  2 Ruminococcaceae           24 0.0755      0.221      0.229    0.458   
##  3 Prevotellaceae            24 0.0000859   0.000586   0.0521   0.456   
##  4 Lachnospiraceae           24 0.0492      0.157      0.166    0.311   
##  5 Bifidobacteriaceae        24 0.000279    0.0196     0.0443   0.242   
##  6 Eubacteriaceae            24 0.0151      0.0938     0.0890   0.162   
##  7 Rikenellaceae             24 0.0177      0.0439     0.0579   0.162   
##  8 Coriobacteriaceae         24 0.00182     0.0113     0.0185   0.103   
##  9 Porphyromonadaceae        24 0.0109      0.0258     0.0324   0.0773  
## 10 Erysipelotrichaceae       24 0.000854    0.00209    0.00703  0.0732  
## 11 Acidaminococcaceae        24 0.00000244  0.0000307  0.00796  0.0724  
## 12 Veillonellaceae           24 0.0000890   0.0116     0.0204   0.0688  
## 13 Verrucomicrobiaceae       24 0.000000613 0.000164   0.0110   0.0622  
## 14 Clostridiaceae            24 0.00153     0.00526    0.00687  0.0303  
## 15 Oscillospiraceae          24 0.00161     0.00455    0.00547  0.0148  
## 16 Sutterellaceae            24 0.0000214   0.00337    0.00453  0.0146  
## 17 Bacteroidales_noname      24 0.000131    0.00307    0.00409  0.0145  
## 18 Streptococcaceae          24 0.000260    0.00205    0.00312  0.00865 
## 19 Desulfovibrionaceae       24 0.000855    0.00417    0.00441  0.00752 
## 20 Burkholderiales_noname    24 0.00000230  0.0000564  0.000989 0.00644 
## 21 Lactobacillaceae          24 0.00000618  0.000142   0.000564 0.00421 
## 22 Clostridiales_noname      24 0.000244    0.00147    0.00158  0.00411 
## 23 Peptostreptococcaceae     24 0.00000691  0.000172   0.000450 0.00350 
## 24 Xanthomonadaceae          24 0.000519    0.00139    0.00144  0.00290 
## 25 Pasteurellaceae           24 0.00000126  0.000234   0.000387 0.00262 
## 26 Enterobacteriaceae        24 0.0000173   0.000157   0.000321 0.00179 
## 27 Oxalobacteraceae          21 0           0.00000565 0.000112 0.00143 
## 28 Campylobacteraceae        14 0           0.00000186 0.000110 0.00141 
## 29 Enterococcaceae           24 0.00000296  0.000219   0.000335 0.00105 
## 30 Moraxellaceae             18 0           0.0000731  0.000145 0.000931
## # … with 68 more rows

Let’s use the families with a proportion of at least 2%.

families <- prev %>%
    filter(Max > 0.02) %>%
    {.$Family}
nat0 <- nat %>%
    filter(Family %in% families) %>%
    arrange(Phylum, Family)

Create a color scale for Family that distinguishes Phylum by hue.

# Color tibble
ctb <- nat0 %>%
    select(Phylum, Family) %>%
    distinct %>%
    arrange(Phylum, Family) %>%
    group_by(Phylum) %>%
    nest(Family) %>%
    rowwise() %>%
    mutate(Families = list(unlist(data)),
        Num = length(Families))
ctb <- ctb %>%
    arrange(desc(Num)) %>%
    add_column(Palette = c("Blues", "Greens", "Reds", "Oranges"))
ctb <- ctb %>%
    rowwise() %>%
    mutate(Colors = list(RColorBrewer::brewer.pal(Num, name = Palette)))
## Warning in RColorBrewer::brewer.pal(Num, name = Palette): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(Num, name = Palette): minimal value for n is 3, returning requested palette with 3 different levels
last_n <- function(x, n) {
    len <- length(x)
    x[seq(len - n + 1, len)]
}

ctb <- ctb %>%
    rowwise() %>%
    mutate(Colors = list(last_n(Colors, Num)))

ctb <- ctb %>%
    select(Phylum, Families, Colors) %>%
    unnest() %>%
    arrange(Phylum, Families)

colors.families <-  ctb$Colors
names(colors.families) <- ctb$Families
# RColorBrewer::display.brewer.all()
# RColorBrewer::brewer.pal(5, name = "Blues")

We also want the mock and contaminant fractions in the final figure

comp <- bind_rows(
    nat0,
    tb.source %>% filter(Source != "Native") %>% rename(Family = Source)
)
comp %>%
    filter(Sample == "W1") %>%
    select(Family, Abundance) %>%
    arrange(desc(Abundance))
## # A tibble: 16 x 2
##    Family                Abundance
##    <chr>                     <dbl>
##  1 Mock                0.830      
##  2 Contaminant         0.0574     
##  3 Lachnospiraceae     0.0244     
##  4 Ruminococcaceae     0.0218     
##  5 Bacteroidaceae      0.0198     
##  6 Rikenellaceae       0.0130     
##  7 Bifidobacteriaceae  0.00714    
##  8 Eubacteriaceae      0.00617    
##  9 Porphyromonadaceae  0.00575    
## 10 Verrucomicrobiaceae 0.00448    
## 11 Clostridiaceae      0.00340    
## 12 Coriobacteriaceae   0.00197    
## 13 Erysipelotrichaceae 0.000363   
## 14 Prevotellaceae      0.0000887  
## 15 Veillonellaceae     0.0000517  
## 16 Acidaminococcaceae  0.000000690
comp %>%
    group_by(Sample) %>%
    summarize(sum= sum(Abundance)) %>%
    {min(.$sum)}
## [1] 0.9624378
colors.source <- c("#A9A9A9", "#D3D3D3")
names(colors.source) <- c("Contaminant", "Mock")
colors.comp <- c(colors.source, colors.families)
p.comp <- ggplot(comp,
    aes(x = factor(Individual, as.character(c(1:8, "M"))),
        y = Abundance, 
        fill = factor(Family, names(colors.comp)))) +
    geom_bar(stat = "identity") +
    scale_y_continuous(limits=c(0,1), breaks = c(0, 0.5, 1), 
        labels = c("0", "0.5", "1")) +
    geom_rangeframe(data = tibble(y = c(0, 1), Protocol = "H"), 
        aes(y = y), sides = "l", inherit.aes = FALSE) +
    scale_fill_manual(values = colors.comp) +
    facet_grid(. ~ Protocol, labeller = labeller.protocol) +
    labs(x = "Specimen", y = "Proportion",
        title = paste0("Composition of all bacteria")
        ) +
    base_theme +
    theme(legend.position = "right", legend.key.size = unit(0.11, "in"),
        legend.margin = margin(5,5,5,-5)) + 
    # theme(legend.position = c(1, 0.5), legend.justification = c(1, 0.5),
    #     legend.key.size = unit(0.11, "in"),
    #     ) +
    labs(fill = "Source or\nFamily (Native)")
p.comp

Relative abundances and bias in the mock spike-in

Next, we want to visualize relative abundances vs. actual composition (bias) of the mock spike-in (including the contaminant). Let’s first confirm that all taxa were detected in all samples:

st0 %>%
    filter(Taxon %in% c(mock$Taxon, contaminant)) %>%
    {all(.$Abundance > 0)}
## [1] TRUE

Now let’s restrict to just the spike-in taxa. Since the 10 mock species appear in all samples with the same known relative abundances, it is convenient to normalize the abundances of all taxa to the geometric mean of the 10 mock species (called Denom below).

sts <- st0 %>%
    filter(Taxon %in% c(mock$Taxon, contaminant)) %>%
    rename(Observed = Abundance) %>%
    mutate(Taxon = ifelse(Taxon == contaminant, "Contaminant", Taxon))
denom <- sts %>%
    filter(Taxon %in% mock$Taxon) %>%
    group_by(Sample) %>%
    summarize(Denom = gm_mean(Observed))
sts <- sts %>%
    left_join(denom, by = c("Sample")) %>%
    mutate(Observed = Observed / Denom)

Next, add the actual relative abundances as determined by the FACS measurement.

mock0 <- mock %>%
    rename(Actual = FACS) %>%
    mutate(Actual = center_elts(Actual))
sts <- sts %>%
    left_join(mock0, by = "Taxon") %>%
    mutate(Error = Observed / Actual)

Also grab a version with just the mock taxa

stm <- sts %>%
    filter(Taxon %in% mock$Taxon)

Plot the observed and actual relative abundances.

p.step <- ggplot(sts,
    aes(x = factor(Taxon, c(mock$Taxon, "Contaminant")), y = Observed,
        color = Protocol, shape = (Individual == "M"))) + 
    geom_quasirandom() +
    geom_rangeframe(color = "black", sides = "l") +
    scale_y_log10(labels = log_formatter) +
    scale_x_discrete(labels = tax_labeller) + 
    scale_shape_manual(values = c(16, 1), labels = c("Fecal", "Mock only")) +
    scale_color_manual(values = colors.protocol) + 
    coord_cartesian(clip = "off") +
    labs(title = "Composition of the spike-in",
        # y = "Abundance / gm. mean of non-contaminants",
        y = "Abundance / gm. mean",
        shape = "Sample type") +
    base_theme + 
    tax_theme + 
    theme(legend.position = "right", legend.margin = margin(5,0,5,-5))
# Add the actual abundances (as a stair plot like in Costea2017's Figure 6)
# underneath the data points
tb.step <- mock0 %>%
    mutate(x1 = rank(desc(Actual)) - 0.5, x2 = x1 + 0.5,
        x3 = ifelse(x2 == 10, 10.5, NA)) %>%
    gather("loc", "x", x1, x2, x3) %>%
    filter(!is.na(x)) %>%
    arrange(x)
p.step$layers <- c(
    geom_step(data = tb.step, aes(x = x, y = Actual, linetype = "Actual"), 
        size = 0.3, color = "black", inherit.aes = FALSE),
    p.step$layers)
# p.step <- p.step +
#     annotate("text", x = 10.6, y = 0.021, label = "Actual",
#         color = "black", hjust = 0)
p.step <- p.step +
    guides(
        color = guide_legend(order = 1),
        shape = guide_legend(order = 2),
        linetype = guide_legend(order = 3))
p.step

Main text Figure 3

plot_grid(p.comp, p.step, labels = c("A", "B"))

Estimate bias

Estimate the bias within each protocol as the center (or compositional mean) of the observed composition errors. Because all samples have all taxa, we can do this very straightforwardly by taking geometric means:

bias <- stm %>%
    group_by(Protocol, Taxon) %>%
    summarize(Bias = gm_mean(Error)) %>%
    mutate(Bias = center_elts(Bias)) %>%
    ungroup
bias %>%
    spread(Protocol, Bias)
## # A tibble: 10 x 4
##    Taxon                             H      Q      W
##    <chr>                         <dbl>  <dbl>  <dbl>
##  1 Blautia_hansenii             0.537  0.745   1.80 
##  2 Clostridium_difficile        0.110  0.0891  0.223
##  3 Clostridium_perfringens      1.18   0.489   1.14 
##  4 Clostridium_saccharolyticum  1.21   1.44    0.589
##  5 Fusobacterium_nucleatum     46.3    4.98   16.5  
##  6 Lactobacillus_plantarum      0.0168 0.773   0.355
##  7 Prevotella_melaninogenica    2.81   1.55    1.37 
##  8 Salmonella_enterica          1.77   2.29    0.788
##  9 Vibrio_cholerae              2.10   1.16    0.888
## 10 Yersinia_pseudotuberculosis  1.46   1.35    0.660

The call to center_elts normalizes the efficiencies within each protocol to have a geometric mean of 1,

bias %>%
    group_by(Protocol) %>%
    summarize(gm_mean(Bias))
## # A tibble: 3 x 2
##   Protocol `gm_mean(Bias)`
##   <chr>              <dbl>
## 1 H                      1
## 2 Q                      1
## 3 W                      1

The elements of the Bias column can be interpretted as the detection efficiency relative to the average taxon for that protocol.

To confirm that the estimation worked, let’s calibrate the relative abundances from the previous figure:

p.step.cal <- p.step
p.step.cal$data <- p.step.cal$data %>%
    left_join(bias, by = c("Protocol", "Taxon")) %>%
    mutate(Observed = Observed / Bias)
p.step.cal
## Warning: Removed 27 rows containing missing values (position_quasirandom).

Join the bias w/ the abundance table and compute compositional residuals.

stm <- stm %>%
    left_join(bias, by = c("Protocol", "Taxon")) %>%
    mutate(
        Residual = Error / Bias,
        Predicted = Actual * Bias
    )
# Note, `Residual` is already geometrically centered.

Let’s also check that the bias predictions predict the taxon ratios:

gvs <- c("Protocol", "Individual", "Sample")
ratios <- stm %>%
    mutate(Taxon = tax_abbrev(Taxon)) %>%
    compute_ratios(group_vars = gvs) %>%
    mutate(Pair = paste(Taxon.x, Taxon.y, sep = ":"))
ratios.pred <- ratios %>%
    select(Taxon.x, Taxon.y, Pair, Protocol, Actual, Bias, Predicted) %>%
    distinct
ggplot(ratios, aes(Pair, Error, 
        label = Individual, color = Protocol)) +
    geom_point(data = ratios.pred,
        aes(Pair, Bias), inherit.aes = FALSE,
        shape = 3, size = 4, color = "black") +
    geom_text(position = position_jitter(width=0.25, height=0)) +
    geom_hline(yintercept = 1) +
    facet_grid(Protocol ~ ., labeller = labeller.protocol) +
    geom_rangeframe(sides = "l", color= "black") + 
    scale_color_manual(values = colors.protocol) + 
    scale_y_log10() +
    scale_x_discrete(labels = tax_labeller) + 
    base_theme +
    tax_theme

Distribution of efficiencies for the three protocols:

ggplot(bias, aes(Bias, fill = Protocol)) +
    geom_dotplot() +
    scale_x_log10() +
    facet_grid(Protocol ~ ., labeller = labeller.protocol) +
    scale_fill_manual(values = colors.protocol) +
    scale_y_continuous(breaks = NULL) +
    geom_rangeframe() +
    base_theme

Let’s also estimate the differential bias (including the contaminant):

dbias <- sts %>%
    select(Taxon, Observed, Protocol, Individual) %>%
    spread(Protocol, Observed) %>%
    group_by(Taxon) %>%
    summarize(
        HQ = gm_mean(H / Q),
        HW = gm_mean(H / W),
        QW = gm_mean(Q / W))
dbias <- dbias %>%
    gather("Protocol", "Bias", -Taxon)
dbias %>%
    spread(Protocol, Bias)
## # A tibble: 11 x 4
##    Taxon                           HQ     HW    QW
##    <chr>                        <dbl>  <dbl> <dbl>
##  1 Blautia_hansenii            0.721  0.298  0.413
##  2 Clostridium_difficile       1.24   0.494  0.400
##  3 Clostridium_perfringens     2.41   1.04   0.431
##  4 Clostridium_saccharolyticum 0.837  2.05   2.45 
##  5 Contaminant                 0.894  2.13   2.38 
##  6 Fusobacterium_nucleatum     9.30   2.80   0.301
##  7 Lactobacillus_plantarum     0.0218 0.0475 2.18 
##  8 Prevotella_melaninogenica   1.82   2.05   1.12 
##  9 Salmonella_enterica         0.773  2.25   2.90 
## 10 Vibrio_cholerae             1.81   2.37   1.31 
## 11 Yersinia_pseudotuberculosis 1.08   2.21   2.05

For consistency we keep the denominator as just the 10 mock (non-contaminant) species.

Fraction of squared Aitchison error due to bias

What fraction of the compositional error (measured by squared Aitchison distance) is explained by our model for each protocol?

err <- stm %>%
    group_by(Protocol, Sample) %>%
    summarize(
        Error2 = anorm(Error)^2,
        Bias2 = anorm(Bias)^2,
        Resid2 = anorm(Error / Bias)^2
        ) %>%
    summarize_at(vars(-Sample), mean)
err <- err %>% 
    mutate(Frac_Bias = Bias2 / Error2, Frac_Resid = Resid2 / Error2)
err
## # A tibble: 3 x 6
##   Protocol Error2 Bias2 Resid2 Frac_Bias Frac_Resid
##   <chr>     <dbl> <dbl>  <dbl>     <dbl>      <dbl>
## 1 H          39.7  38.8 0.892      0.978    0.0225 
## 2 Q          11.3  10.2 1.10       0.903    0.0969 
## 3 W          12.2  12.2 0.0492     0.996    0.00402

Residuals

Residuals, with the mock-only samples marked:

stm %>%
    mutate(Taxon = factor(Taxon, mock$Taxon)) %>%
    ggplot(aes(x = Taxon, y = Residual, shape = Individual == "M",
            color = Protocol)) +
    geom_quasirandom() +
    geom_hline(yintercept = 1) +
    geom_rangeframe(sides = "l", color = "black") +
    scale_y_log10() +
    scale_shape_manual(values = c(16, 1), labels = c("Fecal", "Mock only")) +
    scale_color_manual(values = colors.protocol) + 
    facet_grid(Protocol ~ ., scales = "fixed") +
    base_theme +
    tax_theme

Note that the residuals are not independent across taxa; the high Residuals for L. plantarum and B. hansenii for protocol H are pushing down the others.

Bootstrap standard errors

Get the bias estimates w/ estimated geometric standard errors -

reps <- stm %>%
    group_by(Protocol) %>%
    nest %>%
    mutate(
        Error_matrix = map(data, build_matrix, "Sample", "Taxon", "Error"),
        Bootreps = map(Error_matrix, bootrep_center, R = 4e3, 
            method = "gm")) %>%
    unnest(Bootreps) %>%
    rename(Bias = Center)
stats <- reps %>%
    group_by(Protocol, Taxon) %>%
    summarize(gm_mean = gm_mean(Bias), gm_se = gm_sd(Bias)) %>%
    ungroup
bias <- left_join(bias, stats, by = c("Protocol", "Taxon"))

As expected, the geometric means of the resamplings approximate the point estimate.

all.equal(bias$Bias, bias$gm_mean)
## [1] "Mean relative difference: 0.0009001726"
bias <- bias %>% select(-gm_mean)

Create the SI figure showing the bias estimate with two standard errors:

r <- range(stm$Observed / stm$Actual)
p1 <- ggplot(bias, aes(factor(Taxon, mock$Taxon), Bias, color = Protocol)) +
    geom_hline(yintercept = 1) +
    geom_quasirandom(data = stm, aes(y = Observed / Actual), alpha = 0.4) +
    geom_pointrange(aes(ymin = Bias / gm_se^2, ymax = Bias * gm_se^2),
        fatten = 1.5) +
    scale_y_log10() +
    scale_color_manual(values = colors.protocol) + 
    facet_wrap(~ Protocol, labeller = labeller.protocol) +
    geom_rangeframe(sides = "l", color = "black") +
    scale_y_log10(limits = r, labels = log_formatter) +
    scale_x_discrete(labels = tax_labeller) + 
    labs(y = "Efficiency / gm. mean") +
    base_theme +
    tax_theme +
    theme(panel.spacing = unit(1, "lines"))
p1

Do this for differential bias as well. Note setting the denominator to just the mock taxa.

dreps <- sts %>%
    select(Taxon, Observed, Protocol, Individual) %>%
    spread(Protocol, Observed) %>%
    transmute(
        Taxon, Individual,
        HQ = H / Q, 
        HW = H / W,
        QW = Q / W
    ) %>%
    gather("Protocol", "Error", -Taxon, -Individual) %>%
    group_by(Protocol) %>%
    nest %>%
    mutate(
        Error_matrix = map(data, build_matrix, "Individual", "Taxon", "Error"),
        Bootreps = map(Error_matrix, bootrep_center, R = 4e3, method = "gm",
            denom = mock$Taxon)) %>%
    unnest(Bootreps) %>%
    rename(Bias = Center)
dstats <- dreps %>%
    group_by(Protocol, Taxon) %>%
    summarize(gm_mean = gm_mean(Bias), gm_se = gm_sd(Bias)) %>%
    ungroup
dbias <- left_join(dbias, dstats, by = c("Protocol", "Taxon"))
all.equal(dbias$Bias, dbias$gm_mean)
## [1] "Mean relative difference: 0.001009332"
dbias <- dbias %>% select(-gm_mean)
p2 <- ggplot(dbias, aes(factor(Taxon, c(mock$Taxon, "Contaminant")), Bias)) +
    geom_hline(yintercept = 1) +
    # geom_quasirandom(data = sts, aes(y = Observed / Actual), alpha = 0.5) +
    geom_pointrange(aes(ymin = Bias / gm_se^2, ymax = Bias * gm_se^2),
        fatten = 1.5) +
    scale_y_log10() +
    # scale_color_manual(values = colors.protocol) + 
    facet_wrap(~ Protocol, labeller = labeller.protocol.ref) +
    geom_rangeframe(sides = "l", color = "black") +
    scale_y_log10(limits = r, labels = log_formatter) +
    scale_x_discrete(labels = tax_labeller) + 
    labs(y = "Efficiency / gm. mean") +
    base_theme +
    tax_theme +
    theme(panel.spacing = unit(1, "lines"))
p2

How does the standard error decrease with sample size?

Here we wish to see how the noise in the bias estimate decreases with the number of control samples used. To do so, we will perform multinomial resampling of N control samples, for each N between 1 and 9 (the total number of samples). For a given N, the standard deviation of the bias estimate across resamples approximates the standard error in the bias estimate when N control samples are measured. We’ll use Protocol H, which has an intermediate level of noise.

First, estimate the standard errors for each N:

mat <- stm %>%
    filter(Protocol == "H") %>%
    build_matrix(Sample, Taxon, Error)
N <- 1:9
names(N) <- N
reps.H <- map_dfr(N, ~bootrep_center(mat, R = 4e3, dist = "multinomial",
        N = ., method = "gm"),
    .id = "N") %>%
    rename(Bias = Center)
reps.H.summary <- reps.H %>%
    group_by(N, Taxon) %>%
    summarize(gm_mean = gm_mean(Bias), gm_se = gm_sd(Bias)) %>%
    mutate(Protocol = "H")

Then view how the dispersion decreases with N:

lvls.color <- reps.H.summary %>%
    filter(N == 3) %>%
    arrange(-gm_se) %>%
    .$Taxon
values.color <- scales::seq_gradient_pal("black", "grey")(seq(0, 1, length.out = 10))
names(values.color) <- lvls.color
r <- range(reps.H.summary$gm_se)
names(r) <- r  %>% round(2)
p.se <- ggplot(reps.H.summary, aes(N, gm_se, 
        color = fct_reorder(Taxon, -gm_se))) +
    geom_line(aes(group = Taxon)) +
    geom_point() +
    scale_y_continuous(limits = range(c(1, r)), ) +
    base_theme +
    scale_color_manual(values = values.color, labels = tax_labeller) +
    geom_rangeframe(color = "black") +
    theme(legend.position = "right", legend.margin = margin(5,10,5,-5)) +
    labs(x = "Number of controls", y = "Geometric standard error", 
        title = "Standard error vs. number of control samples", 
        color = "Taxon")
p.se

The relative abundance plot provides useful context for understanding why the standard errors are much higher for two taxa:

pd <- filter(stm, Protocol == "H")
pd0 <- pd %>%
    filter(Individual %in% c("5", "M"))
p.step.H <- p.step %+% 
    pd %+%
    aes(x = factor(Taxon, mock$Taxon), y = Observed, 
        color = factor(Taxon, lvls.color), shape = NULL) +
    ggrepel::geom_text_repel(data = pd0, aes(label = Individual), 
        direction = "x", size = 2, color = "black") +
    scale_color_manual(values = values.color, labels = tax_labeller) +
    labs(title = "Observed compositions",
        y = "Abundance / gm. mean",
        shape = "Sample type") +
    theme(legend.position = "none")
# p.step.H
plot_grid(p.se, p.step.H, nrow = 1, labels = c("A", "B"),
    rel_widths = c(1.3, 1))

Table with estimated bias and summary statistics

Compute pairwise-error summary statistics

Error vs. actual compositions

Compute pairwise error summary statistics:

avg_errors <- ratios %>%
    # Take the geometric absolute values of the errors
    mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
    # Average the errors for each pair of different taxa
    filter(Taxon.x != Taxon.y) %>%
    group_by(Protocol, Taxon.x, Taxon.y) %>%
    summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
    # Average over pairs of taxa for each protocol
    group_by(Protocol) %>%
    summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
    rename_at(vars(Error, Bias, Residual), paste0, ".avg")
max_errors <- ratios %>%
    # Take the geometric absolute values of the errors
    mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
    # Find the max fold error in a ratio for each protocol
    group_by(Protocol) %>%
    summarize_at(vars(Error, Bias, Residual), gm_range) %>%
    rename_at(vars(Error, Bias, Residual), paste0, ".max")
pw_summary <- bind_cols(avg_errors, max_errors) %>%
    select(Protocol, Bias.max, Bias.avg, Residual.avg)
pw_summary
## # A tibble: 3 x 4
##   Protocol Bias.max Bias.avg Residual.avg
##   <chr>       <dbl>    <dbl>        <dbl>
## 1 H          2751.      9.69         1.31
## 2 Q            55.8     3.22         1.46
## 3 W            74.0     3.48         1.08

Error between protocols

Get ratios for the ratio-based statistics as in ratios, but for the differential error between protocols.

tb <- stm %>%
    select(Taxon, Observed, Protocol, Individual) %>%
    spread(Protocol, Observed) %>%
    transmute(
        Taxon, Individual,
        HQ = H / Q, 
        HW = H / W,
        QW = Q / W
    ) %>%
    gather("Protocol", "Error", -Taxon, -Individual) %>%
    left_join(dbias, by = c("Protocol", "Taxon")) %>%
    select(-gm_se) %>%
    mutate(Residual = Error / Bias)
gvs <- c("Protocol", "Individual")
d_ratios <- tb %>%
    compute_ratios(group_vars = gvs) %>%
    mutate(Pair = paste(Taxon.x, Taxon.y, sep = ":"))

Compute pairwise error summary statistics:

d_avg_errors <- d_ratios %>%
    # Take the geometric absolute values of the errors
    mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
    # Average the errors for each pair of different taxa
    filter(Taxon.x != Taxon.y) %>%
    group_by(Protocol, Taxon.x, Taxon.y) %>%
    summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
    # Average over pairs of taxa for each protocol
    group_by(Protocol) %>%
    summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
    rename_at(vars(Error, Bias, Residual), paste0, ".avg")
d_max_errors <- d_ratios %>%
    # Take the geometric absolute values of the errors
    mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
    # Find the max fold error in a ratio for each protocol
    group_by(Protocol) %>%
    summarize_at(vars(Error, Bias, Residual), gm_range) %>%
    rename_at(vars(Error, Bias, Residual), paste0, ".max")
d_pw_summary <- bind_cols(d_avg_errors, d_max_errors) %>%
    select(Protocol, Bias.max, Bias.avg, Residual.avg)
d_pw_summary
## # A tibble: 3 x 4
##   Protocol Bias.max Bias.avg Residual.avg
##   <chr>       <dbl>    <dbl>        <dbl>
## 1 HQ         428.       4.71         1.49
## 2 HW          59.1      3.88         1.34
## 3 QW           9.64     2.79         1.49

Combine with bias estimates in a single table

# Function to create string for formatting numbers with glue::glue()
fmt_string <- function(var, digits) {
    paste0("{format(", var, ", digits = ", digits, ")}")
}
bias_tab <- bind_rows(bias, dbias) %>%
    select(-gm_se) %>%
    mutate(Bias = glue::glue(fmt_string("Bias", 1))) %>%
    mutate_all(as.character) %>%
    spread(Protocol, Bias) %>%
    mutate_all(~ifelse(is.na(.), "---", .))
pw_summary_tab <- bind_rows(pw_summary, d_pw_summary) %>%
    mutate(
        Bias.max = glue::glue(fmt_string("Bias.max", 1)),
        Bias.avg = glue::glue(fmt_string("Bias.avg", 2)),
        Residual.avg = glue::glue(fmt_string("Residual.avg", 2))
        ) %>%
    mutate_all(as.character) %>%
    gather("Taxon", "Value", -Protocol) %>%
    spread(Protocol, Value)
lvls.taxon <- c(mock$Taxon, "Contaminant", "Bias.max", "Bias.avg",
    "Residual.avg")
lbls.taxon <- c(mock$Taxon, "Contaminant", "Max pairwise bias", 
    "Avg. pairwise bias", "Avg. pairwise noise")
bias_tab0 <- bind_rows(bias_tab, pw_summary_tab) %>%
    select(Taxon, "H", "Q", "W", "HQ", "HW", "QW") %>%
    mutate(
        Taxon = factor(Taxon, lvls.taxon, lbls.taxon),
        ) %>%
    arrange(Taxon)
bias_tab0
## # A tibble: 14 x 7
##    Taxon                      H       Q       W       HQ      HW     QW    
##    <fct>                      <chr>   <chr>   <chr>   <chr>   <chr>  <chr> 
##  1 Prevotella_melaninogenica  " 2.81" " 1.55" " 1.37" " 1.82" " 2.0… " 1.1…
##  2 Clostridium_perfringens    " 1.18" " 0.49" " 1.14" " 2.41" " 1.0… " 0.4…
##  3 Salmonella_enterica        " 1.77" " 2.29" " 0.79" " 0.77" " 2.2… " 2.9…
##  4 Clostridium_difficile      " 0.11" " 0.09" " 0.22" " 1.24" " 0.4… " 0.4…
##  5 Lactobacillus_plantarum    " 0.02" " 0.77" " 0.35" " 0.02" " 0.0… " 2.1…
##  6 Vibrio_cholerae            " 2.10" " 1.16" " 0.89" " 1.81" " 2.3… " 1.3…
##  7 Clostridium_saccharolytic… " 1.21" " 1.44" " 0.59" " 0.84" " 2.0… " 2.4…
##  8 Yersinia_pseudotuberculos… " 1.46" " 1.35" " 0.66" " 1.08" " 2.2… " 2.0…
##  9 Blautia_hansenii           " 0.54" " 0.74" " 1.80" " 0.72" " 0.3… " 0.4…
## 10 Fusobacterium_nucleatum    46.29   " 4.98" 16.51   " 9.30" " 2.8… " 0.3…
## 11 Contaminant                ---     ---     ---     " 0.89" " 2.1… " 2.3…
## 12 Max pairwise bias          2751    "  56"  "  74"  " 428"  "  59" "  10"
## 13 Avg. pairwise bias         9.7     3.2     3.5     4.7     3.9    2.8   
## 14 Avg. pairwise noise        1.3     1.5     1.1     1.5     1.3    1.5

Latex version for the manuscript:

tex <- bias_tab0 %>%
    rename_at(vars(HQ, HW, QW), ~ {str_sub(., 2, 1) <- "/"; .}) %>%
    mutate(
        Taxon = kableExtra::cell_spec(
            str_replace(Taxon, "_", " "), 
            "latex", 
            italic = Taxon %in% mock$Taxon)
        ) %>%
    knitr::kable(format="latex", booktabs = TRUE, linesep = "",
        escape = FALSE, align = c("l", rep("r", 6))) %>%
    kableExtra::add_header_above(c(" ", "Protocol" = 3, 
            "Protocol/Reference" = 3)) %>%
    kableExtra::row_spec(length(mock$Taxon) + 1, extra_latex_after = "\\midrule")
# tex
# clipr::write_clip(tex)

Calibration

Choose Individuals (specimens) to use for estimating bias.

set.seed(20190201)
individuals <- as.character(1:8)
estimation_set <- base::sample(individuals, 3, replace = FALSE)
stm0 <- stm %>%
    select(-Bias, -Residual) %>%
    mutate(Set = ifelse(Individual %in% estimation_set, "Est", "Eval"))
sam <- sam %>%
    mutate(Set = ifelse(Individual %in% estimation_set, "Est", "Eval"))

Estimate bias from the estimation samples:

bias_est <- stm0 %>%
    filter(Set == "Est") %>%
    group_by(Protocol, Taxon) %>%
    summarize(Bias = gm_mean(Error)) %>%
    mutate(Bias = center_elts(Bias)) %>%
    ungroup

Compare the bias estimated from the estimation set and that from all specimens:

bind_rows(list(All = bias, Est = bias_est), .id = "Specimens") %>%
    ggplot(aes(Taxon, Bias, color = Protocol, shape = Specimens)) + 
    geom_quasirandom() +
    scale_y_log10() +
    scale_shape_manual(breaks = c("All", "Est"), values = c(16, 1)) +
    scale_color_manual(values = colors.protocol) + 
    coord_flip()

Bias is consistent with the estimate using all specimens, though this would vary for the higher-variance protocols H and Q for some estimation sets.

Also estimate the differential bias to protocol W. First, add a column with the “Reference” measurement; then repeat bias estimation with the reference measurement instead of the Actual abundance.

stmW <- stm0 %>%
    select(Taxon, Protocol, Individual, Set, Observed) %>%
    spread(Protocol, Observed) %>%
    mutate(Actual = W) %>%
    gather("Protocol", "Observed", H, Q, W) %>%
    mutate(Error = Observed / Actual)
biasW_est <- stmW %>%
    filter(Set == "Est") %>%
    group_by(Protocol, Taxon) %>%
    summarize(BiasW = gm_mean(Error)) %>%
    mutate(BiasW = center_elts(BiasW))
biasW_est %>%
    spread(Protocol, BiasW)
## # A tibble: 10 x 4
##    Taxon                            H     Q     W
##    <chr>                        <dbl> <dbl> <dbl>
##  1 Blautia_hansenii            0.310  0.483     1
##  2 Clostridium_difficile       0.408  0.326     1
##  3 Clostridium_perfringens     0.956  0.385     1
##  4 Clostridium_saccharolyticum 2.09   2.54      1
##  5 Fusobacterium_nucleatum     2.79   0.267     1
##  6 Lactobacillus_plantarum     0.0549 2.88      1
##  7 Prevotella_melaninogenica   2.10   0.925     1
##  8 Salmonella_enterica         2.30   3.20      1
##  9 Vibrio_cholerae             2.37   1.38      1
## 10 Yersinia_pseudotuberculosis 2.26   2.07      1

Calibrate all samples

stm0 <- stm0 %>%
    left_join(bias_est, by = c("Protocol", "Taxon")) %>%
    left_join(biasW_est, by = c("Protocol", "Taxon")) %>%
    mutate_by(Sample, 
        Calibrated_to_Actual = close_elts(Observed / Bias),
        Calibrated_to_W = close_elts(Observed / BiasW)
    )

Sample ordination: Compositional PCA

Prep for the PCA: we first need to Clr transform each relative abundance vector.

stm1 <- stm0 %>%
    gather("Type", "Abundance", 
        Observed, Actual, Calibrated_to_Actual, Calibrated_to_W) %>%
    select(Taxon:Individual, Set:Abundance) %>%
    group_by(Sample, Type) %>%
    mutate(Clr = clr(Abundance)) %>%
    ungroup

Get a matrix with samples as rows to use for the PCA.

temp <- stm1 %>%
    unite("Sample_Type", Sample, Type, sep = ":") %>%
    select(Sample_Type, Taxon, Clr) %>%
    spread(Taxon, Clr)
mat <- temp %>%
    select(-Sample_Type) %>%
    as("matrix")
rownames(mat) <- temp$Sample_Type
corner(mat)
##                         Blautia_hansenii Clostridium_difficile
## H1:Actual                      -3.104989             1.4185469
## H1:Calibrated_to_Actual        -3.815372             1.2580855
## H1:Calibrated_to_W             -3.151494            -0.2018179
## H1:Observed                    -4.323387            -1.0988666
## H2:Actual                      -3.104989             1.4185469
##                         Clostridium_perfringens
## H1:Actual                              1.966933
## H1:Calibrated_to_Actual                1.831330
## H1:Calibrated_to_W                     1.941244
## H1:Observed                            1.896456
## H2:Actual                              1.966933
##                         Clostridium_saccharolyticum
## H1:Actual                                -0.1076099
## H1:Calibrated_to_Actual                   0.1544821
## H1:Calibrated_to_W                       -0.3968607
## H1:Observed                               0.3384341
## H2:Actual                                -0.1076099
##                         Fusobacterium_nucleatum
## H1:Actual                           -3.91413933
## H1:Calibrated_to_Actual             -3.76419663
## H1:Calibrated_to_W                  -0.98939104
## H1:Observed                          0.03810808
## H2:Actual                           -3.91413933

Note that each sample (protocol + individual) appears four times, as Observed, Actual, Calibrated to Actual, and Calibrated to W, and currently I’m just doing a simple PCA on all of these, since our goal is mainly to illustrate the effect of calibration rather than make quantitative claims about variation. Next we run the PCA and make a tibble for plotting,

pcx <- prcomp(mat)
tb <- as_tibble(pcx$x[,c(1,2)], rownames='Sample_Type') %>%
    separate(Sample_Type, c("Sample", "Type"), sep = ":") %>%
    left_join(sam, by = "Sample")
tb <- tb %>%
    mutate(PC1 = -PC1, PC2 = PC2)

Nearly 90% of the variance is covered by the first two PCs, with about 5.2x as much variation on PC1 than PC2:

fracvar <- pcx$sdev^2/sum(pcx$sdev^2)
round(fracvar, 3)
##  [1] 0.739 0.143 0.087 0.018 0.008 0.003 0.001 0.001 0.000 0.000
fracvar[1] / fracvar[2]
## [1] 5.161086
percvar <- round(100 * fracvar, 0)
strvar <- paste0("PC", seq(fracvar), " [", percvar, "%]")

PCA figure that will form the right half of the main text figure:

facet_labeller <- function (tb) {
    tb %>% 
        mutate_all(str_replace_all, "_", " ") %>%
        mutate_all(str_replace, " ", "\n")
}
type.lvls <- c("Observed", "Calibrated_to_Actual", "Calibrated_to_W")
actual <- tb %>%
    filter(Type == "Actual", Sample == "H1") %>%
    select(PC1, PC2) %>%
    expand(Type = factor(type.lvls, type.lvls), PC1, PC2)
tb0 <- tb %>%
    filter(Type != "Actual") %>%
    mutate(Type = factor(Type, type.lvls))
tb.breaks <- 
    tribble(
        ~PC1, ~PC2,
        # min(tb0$PC1), min(tb0$PC2),
        # actual$PC1[1], actual$PC2[1],
        # max(tb0$PC1), max(tb0$PC2),
        )
tb.range <- 
    tribble(
        ~PC1, ~PC2,
        min(tb0$PC1), min(tb0$PC2),
        max(tb0$PC1), max(tb0$PC2),
        )
p.pca <- ggplot(tb0, aes(PC1, PC2, color = Protocol, shape = Set)) +
    geom_point(data = actual, aes(PC1, PC2),
        shape = 3, size = 4, color = "black") +
    geom_point() +
    facet_grid(Type ~ ., labeller = facet_labeller) +
    geom_rangeframe(data = tb.range, aes(PC1, PC2), 
        color = "black", inherit.aes = FALSE) + 
    scale_x_continuous(breaks = tb.breaks$PC1, labels = NULL) +
    scale_y_continuous(breaks = tb.breaks$PC2, labels = NULL) +
    scale_shape_manual(breaks = c("Est", "Eval"), values = c(1, 16)) +
    scale_color_manual(values = colors.protocol) + 
    base_theme +
    theme(strip.text.y = element_text(angle=0, size = 9)) +
    # theme(axis.line = element_line()) +
    labs(title = "Sample ordination", x = strvar[1], y = strvar[2])
# Label the protocols in the "Observed" plot
labtb <- tribble(
    ~x,     ~y,   ~Protocol,
    -4,    1.5,   "H",
    -0.5,  1.3,   "Q",
    -1.85, -0.4,  "W",
    2.3,    0.7,   "Actual"
    ) %>%
    mutate(Type = factor("Observed", type.lvls))
p.pca <- p.pca +
    geom_text(data = labtb,
        aes(x, y, label = Protocol, color = Protocol), 
        size = 3, inherit.aes = FALSE, show.legend = FALSE)
p.pca

Relative abundances before and after calibration

Figure showing the relative abundances, which will form the left half of the main text figure:

stm1 <- stm1 %>% 
    group_by(Sample, Type) %>%
    mutate(Abundance = center_elts(Abundance)) %>%
    ungroup
tbrf <- tibble(Abundance = range(stm1$Abundance))
p.step.cal <- stm1 %>% 
    filter(Type %in% type.lvls) %>%
    mutate(Taxon = factor(Taxon, mock$Taxon),
        Type = factor(Type, type.lvls)) %>%
    ggplot(aes(x = Taxon, y = Abundance, color = Protocol, shape = Set)) + 
    geom_quasirandom() +
    geom_rangeframe(data = tbrf, aes(y = Abundance), 
        color = "black", sides = "l", inherit.aes = FALSE) +
    scale_color_manual(values = colors.protocol) + 
    scale_y_log10(labels = log_formatter) +
    scale_x_discrete(labels = tax_labeller) + 
    coord_cartesian(clip = "off") +
    labs(title = "Taxon relative abundances",
        y = "Abundance / gm. mean") + 
    facet_grid(Type ~ .) +
    scale_shape_manual(breaks = c("Est", "Eval"), values = c(1, 16)) +
    scale_color_manual(values = colors.protocol) + 
    base_theme + 
    tax_theme +
    theme(strip.text.y = element_blank())
# Add the actual abundances (as a stair plot like in Costea2017's Figure 6)
# underneath the data points
tb.step.cal <- tb.step %>%
    expand(Type = factor(type.lvls, type.lvls), nesting(x, Actual))
p.step.cal$layers <- c(
    geom_step(data = tb.step.cal, aes(x = x, y = Actual), 
        size = 0.3, color = "black", inherit.aes = FALSE),
    p.step.cal$layers)
# p.step.cal

Main text figure

plot_grid(p.step.cal + theme(plot.margin = margin(5,20,5,5)),
    p.pca + 
        theme(legend.position = "bottom", legend.box = "vertical",
            legend.margin = margin(),
            legend.spacing = unit(0, "in"),
        ),
    rel_widths = c(1, 1.1)
)

Bias vs. phylogeny

Get tree for the spike-in taxa:

data(costea2017_metaphlan2_tree)
costea2017_metaphlan2_tree$tip.label %>% head
## [1] "k__Archaea|p__Euryarchaeota|c__Methanopyri|o__Methanopyrales|f__Methanopyraceae|g__Methanopyrus|s__Methanopyrus_kandleri"                                    
## [2] "k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanothermaceae|g__Methanothermus|s__Methanothermus_fervidus"                      
## [3] "k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanothermobacter|s__Methanothermobacter_thermautotrophicus"
## [4] "k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanothermobacter|s__Methanothermobacter_marburgensis"      
## [5] "k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobacterium|s__Methanobacterium_formicicum"              
## [6] "k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobacterium|s__Methanobacterium_sp_Maddingley_MBC34"
tree <- costea2017_metaphlan2_tree
tree$tip.label <- tree$tip.label %>%
    str_extract("(?<=s__).+")
spikein_taxa <- c(mock$Taxon, contaminant)
tree <- ape::keep.tip(tree, spikein_taxa)
tree
## 
## Phylogenetic tree with 11 tips and 10 internal nodes.
## 
## Tip labels:
##  Clostridium_saccharolyticum, Blautia_hansenii, Clostridium_perfringens, Clostridium_difficile, Fusobacterium_nucleatum, Lactobacillus_plantarum, ...
## Node labels:
##  0.997, 0.650, 0.133, 0.991, 0.688, 1.000, ...
## 
## Rooted; includes branch lengths.

Plot tree along with differential bias to Protocol W. Bias is shown relative to the 10 mock taxa, with 2 geometric standard errors.

gt <- ggtree::ggtree(tree, layout = "rectangular") +
    theme(plot.margin = margin(5, 0, 5, 5))
gtb <- dbias %>%
    mutate(Taxon = str_replace(Taxon, "Contaminant", contaminant)) %>%
    left_join(gt$data, by = c("Taxon" = "label")) %>%
    filter(Protocol %in% c("HW", "QW")) %>%
    mutate_at("Protocol", str_sub, 1, 1)
pr <- gtb %>%
    ggplot(aes(Bias, fct_reorder(Taxon, y), color = Protocol)) +
    # geom_point() +
    geom_vline(xintercept = 1) +
    ggstance::geom_pointrangeh(aes(xmin = Bias / gm_se^2, 
            xmax = Bias * gm_se^2), 
        fatten = 1.0, position = position_jitter(width = 0, height = 0.1)
        ) +
    scale_color_manual(values = colors.protocol) +
    base_theme +
    scale_y_discrete(labels = tax_labeller) +
    scale_x_log10(labels = log_formatter) +
    labs(y = "", x = "Differential efficiency rel. to Protocol W") +
    theme(legend.position = "right", plot.margin = margin(5, 5, 5, 0),
        axis.text.y = element_text(size=9, family = "")
    )
plot_grid(gt, pr, align = "h", rel_widths = c(0.6, 1))

ggsave(here("figures", "costea-tree.png"),
    width = 6, height = 4, units = "in", dpi=300)
ggsave(here("figures", "costea-tree.pdf"),
    width = 6, height = 4, units = "in", useDingbats = FALSE)

Reproduce Costea et al.’s Figure 6A

mock.gathered <- costea2017_mock_composition %>%
    select(Taxon, 
        FACS = "bacterial cells in spike in Mix",
        OD = "Bacterial OD in sample",
        ) %>%
    group_by(Taxon) %>%
    summarize_at(vars(FACS, OD), mean) %>%
    gather("Protocol", "Observed", -Taxon) %>%
    mutate_by(Protocol, Observed = center_elts(Observed))
stm %>%
    group_by(Protocol, Taxon) %>%
    summarize(Observed = gm_mean(Observed)) %>%
    ungroup %>%
    bind_rows(mock.gathered) %>%
    filter(Protocol != "FACS") %>%
    mutate(
        Taxon = factor(Taxon, mock$Taxon),
        Protocol = factor(Protocol, c("W", "Q", "H", "OD"))
        ) %>%
    ggplot(aes(Taxon, Observed, fill = Protocol)) +
    geom_col(position = "dodge") +
    geom_step(data = tb.step, aes(x = x, y = Actual, linetype = "FACS"), 
        size = 0.3, color = "black", inherit.aes = FALSE) +
    scale_fill_manual(values = c(colors.protocol, OD = "grey")) +
    scale_y_log10() +
    base_theme + 
    tax_theme

The abundances don’t match exactly. This is expected, given that Costea et al. used mOTU to measure abundances. However, the general pattern is highly consistent with their figure, including the OD measurements, except for a swapping in the taxon labels on the x-axis for the three species V. cholerae, C. saccharolyticum, and Y. pseudotuberculosis. This strongly suggests that the mock species are correctly labeled here and are labeled incorrectly in their Figure 6.

Session info

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Arch Linux
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas.so.3.8.0
## LAPACK: /usr/lib/liblapack.so.3.8.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] BiasManuscript_0.0.0.9000 metacal_0.1.0            
##  [3] ggbeeswarm_0.6.0          cowplot_0.9.99           
##  [5] ggthemes_4.2.0            forcats_0.4.0            
##  [7] stringr_1.4.0             dplyr_0.8.3              
##  [9] purrr_0.3.2               readr_1.3.1              
## [11] tidyr_0.8.3               tibble_2.1.3             
## [13] ggplot2_3.2.0             tidyverse_1.2.1          
## [15] here_0.1                  rmarkdown_1.13           
## [17] nvimcom_0.9-82            usethis_1.5.0            
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-140       fs_1.3.1           ggtree_1.16.0     
##  [4] lubridate_1.7.4    devtools_2.0.2     webshot_0.5.1     
##  [7] RColorBrewer_1.1-2 httr_1.4.0         rprojroot_1.3-2   
## [10] useful_1.2.6       tools_3.6.1        backports_1.1.4   
## [13] utf8_1.1.4         R6_2.4.0           vipor_0.4.5       
## [16] lazyeval_0.2.2     colorspace_1.4-1   withr_2.1.2       
## [19] tidyselect_0.2.5   prettyunits_1.0.2  processx_3.3.1    
## [22] compiler_3.6.1     cli_1.1.0          rvest_0.3.4       
## [25] xml2_1.2.0         desc_1.2.0         labeling_0.3      
## [28] scales_1.0.0       callr_3.2.0        digest_0.6.20     
## [31] pkgconfig_2.0.2    htmltools_0.3.6    sessioninfo_1.1.1 
## [34] rlang_0.4.0        readxl_1.3.1       rstudioapi_0.10   
## [37] generics_0.0.2     jsonlite_1.6       magrittr_1.5      
## [40] kableExtra_1.1.0   Rcpp_1.0.1         munsell_0.5.0     
## [43] fansi_0.4.0        ape_5.3            stringi_1.4.3     
## [46] yaml_2.2.0         ggstance_0.3.1     pkgbuild_1.0.3    
## [49] plyr_1.8.4         grid_3.6.1         parallel_3.6.1    
## [52] ggrepel_0.8.1      crayon_1.3.4       lattice_0.20-38   
## [55] haven_2.1.0        hms_0.4.2          zeallot_0.1.0     
## [58] knitr_1.23         ps_1.3.0           pillar_1.4.2      
## [61] igraph_1.2.4.1     reshape2_1.4.3     codetools_0.2-16  
## [64] pkgload_1.0.2      glue_1.3.1         evaluate_0.14     
## [67] remotes_2.0.4      modelr_0.1.4       treeio_1.8.0      
## [70] vctrs_0.2.0        testthat_2.1.1     cellranger_1.1.0  
## [73] gtable_0.3.0       assertthat_0.2.1   xfun_0.7          
## [76] broom_0.5.2        tidytree_0.2.4     viridisLite_0.3.0 
## [79] rvcheck_0.1.3      beeswarm_0.2.3     memoise_1.1.0     
## [82] ellipsis_0.2.0.1