R setup

Run with SAVE_FIGURES = TRUE to save figures in figures/.

SAVE_FIGURES = TRUE

Libraries and paths:

library(here)
library(tidyverse)
library(ggthemes)
library(cowplot)
library(ggbeeswarm)
# Functions for bias estimation and calibration
library(metacal); packageVersion("metacal")
## [1] '0.1.0'
# This package
devtools::load_all(here())

Plot setup:

# Options for `ggplot`:
base_theme <- theme_tufte() + 
    theme(
        text = element_text(size=9, family = ""),
        strip.text = element_text(size=9, family = ""),
        legend.position = "none"
        )
tax_theme <- theme(
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), 
        axis.title.x = element_blank())
update_geom_defaults("point", list(size = 1))
update_geom_defaults("text", list(size = 2.5))
update_geom_defaults("hline", list(color = "grey"))
update_geom_defaults("vline", list(color = "grey"))
# Color-blind friendly colors from ColorBrewer2
colors.protocol <- c(
    H = "#1b9e77", 
    W = "#d95f02", 
    Q = "#7570b3", 
    Actual = "black")
# Labeller for facets when faceting by protocol
labeller.protocol <- labeller(
    Protocol = function(p) paste("Protocol", p))
labeller.protocol.ref <- labeller(
    Protocol = function(p) {
        focal <- str_sub(p, 1, 1)
        ref <- str_sub(p, 2, 2)
        paste("Protocol", focal, "/", ref)
    }
)

Data setup

Sample metadata:

data("costea2017_sample_data")
sam <- costea2017_sample_data
sam
## # A tibble: 29 x 5
##    Sample Protocol Individual SI_sample Run_accession
##    <chr>  <chr>    <chr>      <chr>     <chr>        
##  1 Q1     Q        1          BYQ_AAAB  ERR1971004   
##  2 H1     H        1          BYQ_AAAM  ERR1971015   
##  3 W1     W        1          BYQ_AAAV  ERR1971024   
##  4 Q2     Q        2          BYQ_AAAF  ERR1971008   
##  5 H2     H        2          BYQ_AAAQ  ERR1971019   
##  6 W2     W        2          BYQ_AAAZ  ERR1971028   
##  7 Q3     Q        3          BYQ_AAAA  ERR1971003   
##  8 H3     H        3          BYQ_AAAL  ERR1971014   
##  9 W3     W        3          BYQ_AAAU  ERR1971023   
## 10 Q4     Q        4          BYQ_AAAE  ERR1971007   
## # … with 19 more rows

Expected mock community composition, obtained from the supplementary data (see data-raw/costea2017.R)

data("costea2017_mock_composition")
mock <- costea2017_mock_composition

The FACS measurements are the “bacterial cells” columns; the measurements without endospores were used as the actual composition, and replicate measurements were averaged (personal communication with Paul Costea);

mock <- mock %>%
    select(Taxon, FACS = "bacterial cells in spike in Mix") %>%
    group_by(Taxon) %>%
    summarize_at(vars(FACS), mean)
mock <- mock %>%
    arrange(desc(FACS))
mock
## # A tibble: 10 x 2
##    Taxon                            FACS
##    <chr>                           <dbl>
##  1 Prevotella_melaninogenica   91361186.
##  2 Clostridium_perfringens     73342343.
##  3 Salmonella_enterica         58450204.
##  4 Clostridium_difficile       42383205.
##  5 Lactobacillus_plantarum     22566221.
##  6 Vibrio_cholerae             10065143.
##  7 Clostridium_saccharolyticum  9212814.
##  8 Yersinia_pseudotuberculosis  3950160.
##  9 Blautia_hansenii              459883.
## 10 Fusobacterium_nucleatum       204756.

Note that the order of three species (V. cholerae, C. saccharolyticum, and Y. pseudotuberculosis) differ from that of Costea2017 Figure 6. We reproduce their figure below, showing that these species are likely mislabeled in their figure and not in the mock table. But a mislabeling would here not affect our main conclusions.

Load the Metaphlan2 profiles:

data("costea2017_metaphlan2_profiles")
profiles <- costea2017_metaphlan2_profiles
profiles %>% corner
## # A tibble: 5 x 5
##   Clade                         ERR1971003 ERR1971004 ERR1971005 ERR1971006
##   <chr>                              <dbl>      <dbl>      <dbl>      <dbl>
## 1 k__Archaea                       0.00101    0.00036    0.00018          0
## 2 k__Archaea|p__Euryarchaeota      0.00101    0.00036    0.00018          0
## 3 k__Archaea|p__Euryarchaeota|…    0.00101    0.00036    0.00018          0
## 4 k__Archaea|p__Euryarchaeota|…    0.00101    0.00036    0.00018          0
## 5 k__Archaea|p__Euryarchaeota|…    0.00101    0.00036    0.00018          0

The samples are named here by their ENA run accession; let’s rename to the new sample names.

new_names <- c("Clade", 
    sam$Sample[match(colnames(profiles)[-1], sam$Run_accession)])
colnames(profiles) <- new_names

Two samples, QA and QB, are two extra fecal samples that were only sequenced by protocol Q and so we won’t be using for our analysis. We also drop unneeded sample variables.

sam <- sam %>%
    filter(!(Sample %in% c("QA", "QB"))) %>%
    select(Sample, Protocol, Individual)
profiles <- profiles[, c("Clade", sam$Sample)]

Taxonomy from the metaphlan2 clade names:

tax_ranks <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus",
    "Species", "Strain")
rank_letters <- c("k", "p", "c", "o", "f", "g", "s", "t")
tax_pattern  <- paste0("(?:", rank_letters, "__(\\w+))?") %>%
    paste(collapse = "\\|?")
tax <- profiles %>%
    select(Clade) %>%
    extract(Clade, tax_ranks, regex = tax_pattern, remove = FALSE)
tax
## # A tibble: 1,868 x 9
##    Clade      Kingdom Phylum  Class  Order  Family  Genus Species  Strain  
##    <chr>      <chr>   <chr>   <chr>  <chr>  <chr>   <chr> <chr>    <chr>   
##  1 k__Archaea Archaea <NA>    <NA>   <NA>   <NA>    <NA>  <NA>     <NA>    
##  2 k__Archae… Archaea Euryar… <NA>   <NA>   <NA>    <NA>  <NA>     <NA>    
##  3 k__Archae… Archaea Euryar… Metha… <NA>   <NA>    <NA>  <NA>     <NA>    
##  4 k__Archae… Archaea Euryar… Metha… Metha… <NA>    <NA>  <NA>     <NA>    
##  5 k__Archae… Archaea Euryar… Metha… Metha… Methan… <NA>  <NA>     <NA>    
##  6 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… <NA>     <NA>    
##  7 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… Methano… <NA>    
##  8 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… Methano… GCF_000…
##  9 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… Methano… <NA>    
## 10 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… Methano… Methano…
## # … with 1,858 more rows

Species table with just the Metaphlan species identifiers

st <- profiles %>%
    left_join(tax, by = "Clade") %>%
    filter(!is.na(Species), is.na(Strain)) %>%
    select(-Strain)
st
## # A tibble: 718 x 35
##    Clade      Q1      H1      W1      Q2      H2      W2      Q3      H3
##    <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 k__A… 0.      0       0.      0       0       0       0       0      
##  2 k__A… 3.60e-4 0       0.      0.0830  0.0935  0.616   0.0009  0.00012
##  3 k__A… 0.      0       0.      0.0161  0       0       0       0      
##  4 k__A… 0.      0       0.      0       0       0       0.00011 0      
##  5 k__B… 0.      0       0.      0       0       0       0.00095 0      
##  6 k__B… 0.      0       0.      0.00795 0.00008 0.00088 0.0011  0      
##  7 k__B… 8.30e-4 0       0.      0       0       0       0       0      
##  8 k__B… 1.50e-4 0.00088 2.80e-4 0.00044 0.00008 0       0.00534 0.0018 
##  9 k__B… 2.30e-4 0       0.      0.00361 0       0.00036 0.00016 0.00008
## 10 k__B… 6.27e-3 0.00093 4.30e-4 0.00173 0       0.00018 0       0      
## # … with 708 more rows, and 26 more variables: W3 <dbl>, Q4 <dbl>,
## #   H4 <dbl>, W4 <dbl>, Q5 <dbl>, H5 <dbl>, W5 <dbl>, Q6 <dbl>, H6 <dbl>,
## #   W6 <dbl>, Q7 <dbl>, H7 <dbl>, W7 <dbl>, Q8 <dbl>, H8 <dbl>, W8 <dbl>,
## #   QM <dbl>, HM <dbl>, WM <dbl>, Kingdom <chr>, Phylum <chr>,
## #   Class <chr>, Order <chr>, Family <chr>, Genus <chr>, Species <chr>

Species names have the form Genus_species and are unique,

st$Species %>% head
## [1] "Methanobrevibacter_ruminantium"  "Methanobrevibacter_smithii"     
## [3] "Methanobrevibacter_unclassified" "Methanosphaera_stadtmanae"      
## [5] "Actinomyces_cardiffensis"        "Actinomyces_europaeus"
st$Species %>% anyDuplicated
## [1] 0

The Genus_unclassified catches cases where more reads were mapped to the genus’s markers than could be accounted for by species within the genus. We will include these for measuring the family-level compositions but not the spike-in compositions From here on, we will identify the Clade / Taxon with this species identifier,

st <- st %>%
    mutate(Clade = Species) %>%
    rename(Taxon = Clade)

Next, join the metadata and tidy it into a tall format (one sample-taxon observation per row)

st <- st %>%
    gather("Sample", "Abundance", sam$Sample) %>%
    left_join(sam, by = "Sample") %>%
    select(Taxon, Sample, Abundance, colnames(sam), everything())
st
## # A tibble: 19,386 x 12
##    Taxon Sample Abundance Protocol Individual Kingdom Phylum Class Order
##    <chr> <chr>      <dbl> <chr>    <chr>      <chr>   <chr>  <chr> <chr>
##  1 Meth… Q1      0        Q        1          Archaea Eurya… Meth… Meth…
##  2 Meth… Q1      0.00036  Q        1          Archaea Eurya… Meth… Meth…
##  3 Meth… Q1      0        Q        1          Archaea Eurya… Meth… Meth…
##  4 Meth… Q1      0        Q        1          Archaea Eurya… Meth… Meth…
##  5 Acti… Q1      0        Q        1          Bacter… Actin… Acti… Acti…
##  6 Acti… Q1      0        Q        1          Bacter… Actin… Acti… Acti…
##  7 Acti… Q1      0.00083  Q        1          Bacter… Actin… Acti… Acti…
##  8 Acti… Q1      0.000150 Q        1          Bacter… Actin… Acti… Acti…
##  9 Acti… Q1      0.00023  Q        1          Bacter… Actin… Acti… Acti…
## 10 Acti… Q1      0.00627  Q        1          Bacter… Actin… Acti… Acti…
## # … with 19,376 more rows, and 3 more variables: Family <chr>,
## #   Genus <chr>, Species <chr>

Metaphlan outputs estimates of the proportions * 100. We will renormalize to proportions summing to 1 instead.

st <- st %>%
    mutate_by(Sample, Abundance = close_elts(Abundance))
st %>%
    group_by(Sample) %>%
    summarize(sum(Abundance)) %>%
    {summary(.[,2])}
##  sum(Abundance)
##  Min.   :1     
##  1st Qu.:1     
##  Median :1     
##  Mean   :1     
##  3rd Qu.:1     
##  Max.   :1

Output from the various kingdoms:

st %>%
    group_by(Kingdom, Sample) %>%
    summarize(Abundance = sum(Abundance)) %>%
    spread(Kingdom, Abundance) %>%
    print(n=Inf)
## # A tibble: 27 x 5
##    Sample     Archaea Bacteria Eukaryota Viruses
##    <chr>        <dbl>    <dbl>     <dbl>   <dbl>
##  1 H1     0              0.931 0.0000845  0.0689
##  2 H2     0.000936       0.942 0.0000650  0.0569
##  3 H3     0.00000120     0.944 0.0000756  0.0564
##  4 H4     0              0.956 0.000103   0.0435
##  5 H5     0.000759       0.933 0.0000763  0.0664
##  6 H6     0.00000100     0.947 0.0000541  0.0526
##  7 H7     0              0.916 0.0000971  0.0842
##  8 H8     0.00000110     0.928 0.0000712  0.0715
##  9 HM     0              0.846 0.0000424  0.154 
## 10 Q1     0.00000360     0.932 0.0000544  0.0679
## 11 Q2     0.000992       0.978 0.0000190  0.0207
## 12 Q3     0.0000101      0.987 0.0000785  0.0124
## 13 Q4     0              0.956 0.0000910  0.0434
## 14 Q5     0.00102        0.945 0.0000424  0.0543
## 15 Q6     0              0.964 0.0000539  0.0355
## 16 Q7     0.00000180     0.953 0.0000460  0.0473
## 17 Q8     0              0.952 0.0000775  0.0478
## 18 QM     0.00000260     0.782 0.0000238  0.218 
## 19 W1     0              0.880 0.0000436  0.120 
## 20 W2     0.00616        0.915 0.0000884  0.0784
## 21 W3     0              0.933 0.0000661  0.0669
## 22 W4     0.00000230     0.927 0.0000828  0.0733
## 23 W5     0.00159        0.903 0.0000666  0.0958
## 24 W6     0.000000900    0.944 0.0000386  0.0561
## 25 W7     0.00000340     0.871 0.0000424  0.129 
## 26 W8     0              0.879 0.0000502  0.121 
## 27 WM     0              0.857 0.0000441  0.143

There is very little Archaea and Eukaryote abundance, and the vast majority of the viral abundance comes from a Salmonella phage that I suspect is integrated into the genome of the Salmonella enterica strain in the spike-in.

st %>%
    filter(Kingdom == "Viruses") %>%
    group_by(Taxon) %>% 
    summarize(Abundance = mean(Abundance)) %>%
    arrange(desc(Abundance))
## # A tibble: 5 x 2
##   Taxon                     Abundance
##   <chr>                         <dbl>
## 1 Salmonella_phage_Fels_1  0.0772    
## 2 Lactococcus_phage_P680   0.0000241 
## 3 C2likevirus_unclassified 0.0000128 
## 4 Lactobacillus_phage_PL_1 0.00000478
## 5 Dasheen_mosaic_virus     0.00000311

Bacteria form the vast majority of estimated non-viral abundance in each sample,

st %>%
    group_by(Kingdom, Sample) %>%
    summarize(Abundance = sum(Abundance)) %>%
    filter(Kingdom != "Viruses") %>%
    mutate_by(Sample, Abundance = close_elts(Abundance)) %>%
    filter(Kingdom == "Bacteria") %>%
    summarize_at("Abundance", list(Min = min, Mean = mean))
## # A tibble: 1 x 2
##     Min  Mean
##   <dbl> <dbl>
## 1 0.993 0.999

We will therefore restrict our analysis of sample composition to just Bacteria, renormalizing the abundances to sum to 1 after filtering.

st <- st %>%
    filter(Kingdom == "Bacteria") %>%
    mutate_by(Sample, Abundance = close_elts(Abundance))

Composition of the samples

Identifying the lab contaminant

Costea et al. report E. coli as a likely lab contaminant of the spike-in based on its appearance in the profiles of the mock-only samples (which they derived using mOTUs rather than Metaphlan2). So let’s check what is at appreciable frequency in the mock-only samples that shouldn’t be,

tb <- st %>%
    filter(
        Individual == "M", Abundance > 0, 
        !(Taxon %in% mock$Taxon)
        ) %>%
    group_by(Family, Taxon) %>%
    summarize(
        Mean = mean(Abundance),
        Max = max(Abundance),
        ) %>%
    filter(Max > 1e-4) %>%
    arrange(desc(Mean))
tb
## # A tibble: 9 x 4
## # Groups:   Family [4]
##   Family             Taxon                          Mean      Max
##   <chr>              <chr>                         <dbl>    <dbl>
## 1 Enterobacteriaceae Shigella_flexneri          0.0871   0.109   
## 2 Enterobacteriaceae Salmonella_unclassified    0.0134   0.0183  
## 3 Enterobacteriaceae Escherichia_coli           0.00394  0.00520 
## 4 Enterobacteriaceae Escherichia_unclassified   0.000764 0.000948
## 5 Xanthomonadaceae   Xanthomonas_perforans      0.000126 0.000146
## 6 Moraxellaceae      Psychrobacter_unclassified 0.000122 0.000208
## 7 Xanthomonadaceae   Xanthomonas_gardneri       0.000107 0.000128
## 8 Enterobacteriaceae Pantoea_unclassified       0.000101 0.000139
## 9 Prevotellaceae     Prevotella_copri           0.000100 0.000160

Here we see that most of the contaminant reads appear to be captured by Shigella_flexneri identifier. Given the closeness of S. flexnari with E. coli, the estimated abundance for Escherichia_coli may also be coming from the contaminant strain. Since S. flexnari is generally much larger,

st %>%
    filter(Taxon %in% c("Shigella_flexneri", "Escherichia_coli")) %>%
    group_by(Individual, Taxon) %>%
    summarize_at(vars(Abundance), mean) %>%
    spread(Taxon, Abundance) %>%
    mutate(Escherichia_coli / Shigella_flexneri)
## # A tibble: 9 x 4
## # Groups:   Individual [9]
##   Individual Escherichia_coli Shigella_flexneri `Escherichia_coli/Shigella…
##   <chr>                 <dbl>             <dbl>                       <dbl>
## 1 1                  0.00183             0.0363                      0.0504
## 2 2                  0.00220             0.0220                      0.0998
## 3 3                  0.00100             0.0206                      0.0487
## 4 4                  0.000954            0.0221                      0.0432
## 5 5                  0.00153             0.0309                      0.0497
## 6 6                  0.000894            0.0178                      0.0503
## 7 7                  0.00203             0.0369                      0.0549
## 8 8                  0.00181             0.0335                      0.0540
## 9 M                  0.00394             0.0871                      0.0452

We will treat Shigella_flexneri as the lab contaminant in our analysis,

contaminant <- "Shigella_flexneri"

Our analysis of the spike-in abundances below show that the ratio of Shigella_flexneri to other spike-in taxa is constant across samples, indicating that Shigella_flexneri reads in the fecal samples are primarily from the contaminant and not native gut strains.

Above, we also see appreciable frequency of Salmonella_unclassified, which is due to the genus-level estimated proportion for Salmonella being larger than the species-level estimates. Because the estimated proportions of Escherichia_coli, Escherichia_unclassified, and Salmonella_unclassified are all fairly small, they will not substantially affect the family-level composition plot (Figure 4A). and so we will leave it in as a “native” taxon for this analysis. However, because these abundances might be due to the spike-in rather than native gut taxa, we will filter them out before proceeding.

st0 <- st %>%
    filter(!(Taxon %in% c("Salmonella_unclassified", "Escherichia_coli",
                "Escherichia_unclassified"))) %>%
    mutate_by(Sample, Abundance = close_elts(Abundance))

Taxa by source (Mock, Contaminant, Native)

Classify the taxa by source type:

st0 <- st0 %>%
    mutate(Source = case_when(
            Taxon %in% mock$Taxon ~ "Mock", 
            Taxon == contaminant ~ "Contaminant",
            TRUE ~ "Native"
            )
        )

View the proportions of native, mock, and contaminant for each sample:

tb.source <- st0 %>%
    group_by(Sample, Protocol, Individual, Source) %>%
    summarize(Abundance = sum(Abundance)) %>%
    ungroup()
tb.source %>%
    spread(Source, Abundance) %>%
    print(n=Inf)
## # A tibble: 27 x 6
##    Sample Protocol Individual Contaminant   Mock  Native
##    <chr>  <chr>    <chr>            <dbl>  <dbl>   <dbl>
##  1 H1     H        1              0.0280  0.290  0.682  
##  2 H2     H        2              0.0259  0.263  0.711  
##  3 H3     H        3              0.0265  0.279  0.695  
##  4 H4     H        4              0.0178  0.179  0.803  
##  5 H5     H        5              0.0268  0.321  0.653  
##  6 H6     H        6              0.0205  0.228  0.752  
##  7 H7     H        7              0.0330  0.372  0.595  
##  8 H8     H        8              0.0309  0.353  0.616  
##  9 HM     H        M              0.0818  0.917  0.00170
## 10 Q1     Q        1              0.0247  0.196  0.780  
## 11 Q2     Q        2              0.00485 0.0317 0.963  
## 12 Q3     Q        3              0.00498 0.0276 0.967  
## 13 Q4     Q        4              0.0166  0.132  0.851  
## 14 Q5     Q        5              0.0197  0.150  0.830  
## 15 Q6     Q        6              0.0117  0.0834 0.905  
## 16 Q7     Q        7              0.0183  0.157  0.825  
## 17 Q8     Q        8              0.0151  0.111  0.874  
## 18 QM     Q        M              0.112   0.886  0.00211
## 19 W1     W        1              0.0574  0.830  0.112  
## 20 W2     W        2              0.0358  0.442  0.522  
## 21 W3     W        3              0.0307  0.417  0.552  
## 22 W4     W        4              0.0321  0.443  0.525  
## 23 W5     W        5              0.0468  0.598  0.356  
## 24 W6     W        6              0.0213  0.358  0.621  
## 25 W7     W        7              0.0605  0.769  0.170  
## 26 W8     W        8              0.0555  0.734  0.211  
## 27 WM     W        M              0.0726  0.926  0.00122

The contaminant is always a fairly small fraction of the total sample. It also generally appears in a consistent ratio with the mock for a given protocol:

tb.source %>%
    spread(Source, Abundance) %>%
    ggplot(aes(Sample, Contaminant / Mock, color = Protocol, 
            label = Individual)) +
    scale_color_manual(values = colors.protocol) + 
    geom_text() +
    coord_flip()

In samples Q2 and Q3, the spike-in makes up an unusually low fraction of the total (see table above), so the larger ratio in these samples may be due to chance. These observations suggest that the contaminant is being correctly identified.

Composition by source type:

p.source <- ggplot(tb.source,
    aes(y = Abundance, 
        x = Individual,
        # x = factor(Individual, lvls.mock_prop), 
        fill = Source)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(breaks = c(0, 0.5, 1), 
        labels = c("0", "0.5", "1")) +
    geom_rangeframe(data = tibble(y = c(0, 1), Protocol = "H"), 
        aes(y = y), sides = "l", inherit.aes = FALSE) +
    scale_fill_brewer(type = "qual") +
    facet_grid(. ~ Protocol, labeller = labeller.protocol) +
    labs(x = "Specimen", fill = "Taxon type", 
        title = "Composition of different bacterial types") +
    base_theme +
    theme(legend.position = "right")
p.source 

Native taxa composition

Get the native taxa agglomerated to the Family level,

nat <- st0 %>%
    filter(Source == "Native") %>%
    group_by_at(vars(Sample, Protocol, Individual, Kingdom:Family)) %>%
    summarize(Abundance = sum(Abundance)) %>%
    ungroup

Let’s restrict to Families within a minimal abundance to simplify the figure. First, lets check the prevalence of various families,

prev <- nat %>%
    filter(Individual != "M") %>%
    mutate_by(Sample, Proportion = close_elts(Abundance)) %>%
    group_by(Family) %>%
    summarize(Prev = sum(Proportion > 0), 
        Min = min(Proportion), 
        Median = median(Proportion), 
        Mean = mean(Proportion), 
        Max = max(Proportion))
prev %>%
    arrange(desc(Max)) %>%
    print(n=30)
## # A tibble: 98 x 6
##    Family                  Prev         Min     Median     Mean      Max
##    <chr>                  <int>       <dbl>      <dbl>    <dbl>    <dbl>
##  1 Bacteroidaceae            24 0.0412      0.227      0.228    0.496   
##  2 Ruminococcaceae           24 0.0755      0.221      0.229    0.458   
##  3 Prevotellaceae            24 0.0000859   0.000586   0.0521   0.456   
##  4 Lachnospiraceae           24 0.0492      0.157      0.166    0.311   
##  5 Bifidobacteriaceae        24 0.000279    0.0196     0.0443   0.242   
##  6 Eubacteriaceae            24 0.0151      0.0938     0.0890   0.162   
##  7 Rikenellaceae             24 0.0177      0.0439     0.0579   0.162   
##  8 Coriobacteriaceae         24 0.00182     0.0113     0.0185   0.103   
##  9 Porphyromonadaceae        24 0.0109      0.0258     0.0324   0.0773  
## 10 Erysipelotrichaceae       24 0.000854    0.00209    0.00703  0.0732  
## 11 Acidaminococcaceae        24 0.00000244  0.0000307  0.00796  0.0724  
## 12 Veillonellaceae           24 0.0000890   0.0116     0.0204   0.0688  
## 13 Verrucomicrobiaceae       24 0.000000613 0.000164   0.0110   0.0622  
## 14 Clostridiaceae            24 0.00153     0.00526    0.00687  0.0303  
## 15 Oscillospiraceae          24 0.00161     0.00455    0.00547  0.0148  
## 16 Sutterellaceae            24 0.0000214   0.00337    0.00453  0.0146  
## 17 Bacteroidales_noname      24 0.000131    0.00307    0.00409  0.0145  
## 18 Streptococcaceae          24 0.000260    0.00205    0.00312  0.00865 
## 19 Desulfovibrionaceae       24 0.000855    0.00417    0.00441  0.00752 
## 20 Burkholderiales_noname    24 0.00000230  0.0000564  0.000989 0.00644 
## 21 Lactobacillaceae          24 0.00000618  0.000142   0.000564 0.00421 
## 22 Clostridiales_noname      24 0.000244    0.00147    0.00158  0.00411 
## 23 Peptostreptococcaceae     24 0.00000691  0.000172   0.000450 0.00350 
## 24 Xanthomonadaceae          24 0.000519    0.00139    0.00144  0.00290 
## 25 Pasteurellaceae           24 0.00000126  0.000234   0.000387 0.00262 
## 26 Enterobacteriaceae        24 0.0000173   0.000157   0.000321 0.00179 
## 27 Oxalobacteraceae          21 0           0.00000565 0.000112 0.00143 
## 28 Campylobacteraceae        14 0           0.00000186 0.000110 0.00141 
## 29 Enterococcaceae           24 0.00000296  0.000219   0.000335 0.00105 
## 30 Moraxellaceae             18 0           0.0000731  0.000145 0.000931
## # … with 68 more rows

Let’s use the families with a proportion of at least 2%.

families <- prev %>%
    filter(Max > 0.02) %>%
    {.$Family}
nat0 <- nat %>%
    filter(Family %in% families) %>%
    arrange(Phylum, Family)

Create a color scale for Family that distinguishes Phylum by hue.

# Color tibble
ctb <- nat0 %>%
    select(Phylum, Family) %>%
    distinct %>%
    arrange(Phylum, Family) %>%
    group_by(Phylum) %>%
    nest(Family) %>%
    rowwise() %>%
    mutate(Families = list(unlist(data)),
        Num = length(Families))
ctb <- ctb %>%
    arrange(desc(Num)) %>%
    add_column(Palette = c("Blues", "Greens", "Reds", "Oranges"))
ctb <- ctb %>%
    rowwise() %>%
    mutate(Colors = list(RColorBrewer::brewer.pal(Num, name = Palette)))
## Warning in RColorBrewer::brewer.pal(Num, name = Palette): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(Num, name = Palette): minimal value for n is 3, returning requested palette with 3 different levels
last_n <- function(x, n) {
    len <- length(x)
    x[seq(len - n + 1, len)]
}

ctb <- ctb %>%
    rowwise() %>%
    mutate(Colors = list(last_n(Colors, Num)))

ctb <- ctb %>%
    select(Phylum, Families, Colors) %>%
    unnest() %>%
    arrange(Phylum, Families)

colors.families <-  ctb$Colors
names(colors.families) <- ctb$Families
# RColorBrewer::display.brewer.all()
# RColorBrewer::brewer.pal(5, name = "Blues")

We also want the mock and contaminant fractions in the final figure

comp <- bind_rows(
    nat0,
    tb.source %>% filter(Source != "Native") %>% rename(Family = Source)
)
comp %>%
    filter(Sample == "W1") %>%
    select(Family, Abundance) %>%
    arrange(desc(Abundance))
## # A tibble: 16 x 2
##    Family                Abundance
##    <chr>                     <dbl>
##  1 Mock                0.830      
##  2 Contaminant         0.0574     
##  3 Lachnospiraceae     0.0244     
##  4 Ruminococcaceae     0.0218     
##  5 Bacteroidaceae      0.0198     
##  6 Rikenellaceae       0.0130     
##  7 Bifidobacteriaceae  0.00714    
##  8 Eubacteriaceae      0.00617    
##  9 Porphyromonadaceae  0.00575    
## 10 Verrucomicrobiaceae 0.00448    
## 11 Clostridiaceae      0.00340    
## 12 Coriobacteriaceae   0.00197    
## 13 Erysipelotrichaceae 0.000363   
## 14 Prevotellaceae      0.0000887  
## 15 Veillonellaceae     0.0000517  
## 16 Acidaminococcaceae  0.000000690
comp %>%
    group_by(Sample) %>%
    summarize(sum= sum(Abundance)) %>%
    {min(.$sum)}
## [1] 0.9624378
colors.source <- c("#A9A9A9", "#D3D3D3")
names(colors.source) <- c("Contaminant", "Mock")
colors.comp <- c(colors.source, colors.families)
p.comp <- ggplot(comp,
    aes(x = factor(Individual, as.character(c(1:8, "M"))),
        y = Abundance, 
        fill = factor(Family, names(colors.comp)))) +
    geom_bar(stat = "identity") +
    scale_y_continuous(limits=c(0,1), breaks = c(0, 0.5, 1), 
        labels = c("0", "0.5", "1")) +
    geom_rangeframe(data = tibble(y = c(0, 1), Protocol = "H"), 
        aes(y = y), sides = "l", inherit.aes = FALSE) +
    scale_fill_manual(values = colors.comp) +
    facet_grid(. ~ Protocol, labeller = labeller.protocol) +
    labs(x = "Specimen", y = "Proportion",
        title = paste0("Composition of all bacteria")
        ) +
    base_theme +
    theme(legend.position = "right", legend.key.size = unit(0.11, "in"),
        legend.margin = margin(5,5,5,-5)) + 
    # theme(legend.position = c(1, 0.5), legend.justification = c(1, 0.5),
    #     legend.key.size = unit(0.11, "in"),
    #     ) +
    labs(fill = "Source or\nFamily (Native)")
p.comp