Run with SAVE_FIGURES = TRUE
to save figures in figures/
.
Libraries and paths:
library(here)
library(tidyverse)
library(ggthemes)
library(cowplot)
library(ggbeeswarm)
# Functions for bias estimation and calibration
library(metacal); packageVersion("metacal")
## [1] '0.1.0'
Plot setup:
# Options for `ggplot`:
base_theme <- theme_tufte() +
theme(
text = element_text(size=9, family = ""),
strip.text = element_text(size=9, family = ""),
legend.position = "none"
)
tax_theme <- theme(
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
axis.title.x = element_blank())
update_geom_defaults("point", list(size = 1))
update_geom_defaults("text", list(size = 2.5))
update_geom_defaults("hline", list(color = "grey"))
update_geom_defaults("vline", list(color = "grey"))
# Color-blind friendly colors from ColorBrewer2
colors.protocol <- c(
H = "#1b9e77",
W = "#d95f02",
Q = "#7570b3",
Actual = "black")
# Labeller for facets when faceting by protocol
labeller.protocol <- labeller(
Protocol = function(p) paste("Protocol", p))
labeller.protocol.ref <- labeller(
Protocol = function(p) {
focal <- str_sub(p, 1, 1)
ref <- str_sub(p, 2, 2)
paste("Protocol", focal, "/", ref)
}
)
Sample metadata:
## # A tibble: 29 x 5
## Sample Protocol Individual SI_sample Run_accession
## <chr> <chr> <chr> <chr> <chr>
## 1 Q1 Q 1 BYQ_AAAB ERR1971004
## 2 H1 H 1 BYQ_AAAM ERR1971015
## 3 W1 W 1 BYQ_AAAV ERR1971024
## 4 Q2 Q 2 BYQ_AAAF ERR1971008
## 5 H2 H 2 BYQ_AAAQ ERR1971019
## 6 W2 W 2 BYQ_AAAZ ERR1971028
## 7 Q3 Q 3 BYQ_AAAA ERR1971003
## 8 H3 H 3 BYQ_AAAL ERR1971014
## 9 W3 W 3 BYQ_AAAU ERR1971023
## 10 Q4 Q 4 BYQ_AAAE ERR1971007
## # … with 19 more rows
Expected mock community composition, obtained from the supplementary data (see data-raw/costea2017.R
)
The FACS measurements are the “bacterial cells” columns; the measurements without endospores were used as the actual composition, and replicate measurements were averaged (personal communication with Paul Costea);
mock <- mock %>%
select(Taxon, FACS = "bacterial cells in spike in Mix") %>%
group_by(Taxon) %>%
summarize_at(vars(FACS), mean)
mock <- mock %>%
arrange(desc(FACS))
mock
## # A tibble: 10 x 2
## Taxon FACS
## <chr> <dbl>
## 1 Prevotella_melaninogenica 91361186.
## 2 Clostridium_perfringens 73342343.
## 3 Salmonella_enterica 58450204.
## 4 Clostridium_difficile 42383205.
## 5 Lactobacillus_plantarum 22566221.
## 6 Vibrio_cholerae 10065143.
## 7 Clostridium_saccharolyticum 9212814.
## 8 Yersinia_pseudotuberculosis 3950160.
## 9 Blautia_hansenii 459883.
## 10 Fusobacterium_nucleatum 204756.
Note that the order of three species (V. cholerae, C. saccharolyticum, and Y. pseudotuberculosis) differ from that of Costea2017 Figure 6. We reproduce their figure below, showing that these species are likely mislabeled in their figure and not in the mock
table. But a mislabeling would here not affect our main conclusions.
Load the Metaphlan2 profiles:
data("costea2017_metaphlan2_profiles")
profiles <- costea2017_metaphlan2_profiles
profiles %>% corner
## # A tibble: 5 x 5
## Clade ERR1971003 ERR1971004 ERR1971005 ERR1971006
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 k__Archaea 0.00101 0.00036 0.00018 0
## 2 k__Archaea|p__Euryarchaeota 0.00101 0.00036 0.00018 0
## 3 k__Archaea|p__Euryarchaeota|… 0.00101 0.00036 0.00018 0
## 4 k__Archaea|p__Euryarchaeota|… 0.00101 0.00036 0.00018 0
## 5 k__Archaea|p__Euryarchaeota|… 0.00101 0.00036 0.00018 0
The samples are named here by their ENA run accession; let’s rename to the new sample names.
new_names <- c("Clade",
sam$Sample[match(colnames(profiles)[-1], sam$Run_accession)])
colnames(profiles) <- new_names
Two samples, QA and QB, are two extra fecal samples that were only sequenced by protocol Q and so we won’t be using for our analysis. We also drop unneeded sample variables.
sam <- sam %>%
filter(!(Sample %in% c("QA", "QB"))) %>%
select(Sample, Protocol, Individual)
profiles <- profiles[, c("Clade", sam$Sample)]
Taxonomy from the metaphlan2 clade names:
tax_ranks <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus",
"Species", "Strain")
rank_letters <- c("k", "p", "c", "o", "f", "g", "s", "t")
tax_pattern <- paste0("(?:", rank_letters, "__(\\w+))?") %>%
paste(collapse = "\\|?")
tax <- profiles %>%
select(Clade) %>%
extract(Clade, tax_ranks, regex = tax_pattern, remove = FALSE)
tax
## # A tibble: 1,868 x 9
## Clade Kingdom Phylum Class Order Family Genus Species Strain
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 k__Archaea Archaea <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 k__Archae… Archaea Euryar… <NA> <NA> <NA> <NA> <NA> <NA>
## 3 k__Archae… Archaea Euryar… Metha… <NA> <NA> <NA> <NA> <NA>
## 4 k__Archae… Archaea Euryar… Metha… Metha… <NA> <NA> <NA> <NA>
## 5 k__Archae… Archaea Euryar… Metha… Metha… Methan… <NA> <NA> <NA>
## 6 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… <NA> <NA>
## 7 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… Methano… <NA>
## 8 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… Methano… GCF_000…
## 9 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… Methano… <NA>
## 10 k__Archae… Archaea Euryar… Metha… Metha… Methan… Meth… Methano… Methano…
## # … with 1,858 more rows
Species table with just the Metaphlan species identifiers
st <- profiles %>%
left_join(tax, by = "Clade") %>%
filter(!is.na(Species), is.na(Strain)) %>%
select(-Strain)
st
## # A tibble: 718 x 35
## Clade Q1 H1 W1 Q2 H2 W2 Q3 H3
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 k__A… 0. 0 0. 0 0 0 0 0
## 2 k__A… 3.60e-4 0 0. 0.0830 0.0935 0.616 0.0009 0.00012
## 3 k__A… 0. 0 0. 0.0161 0 0 0 0
## 4 k__A… 0. 0 0. 0 0 0 0.00011 0
## 5 k__B… 0. 0 0. 0 0 0 0.00095 0
## 6 k__B… 0. 0 0. 0.00795 0.00008 0.00088 0.0011 0
## 7 k__B… 8.30e-4 0 0. 0 0 0 0 0
## 8 k__B… 1.50e-4 0.00088 2.80e-4 0.00044 0.00008 0 0.00534 0.0018
## 9 k__B… 2.30e-4 0 0. 0.00361 0 0.00036 0.00016 0.00008
## 10 k__B… 6.27e-3 0.00093 4.30e-4 0.00173 0 0.00018 0 0
## # … with 708 more rows, and 26 more variables: W3 <dbl>, Q4 <dbl>,
## # H4 <dbl>, W4 <dbl>, Q5 <dbl>, H5 <dbl>, W5 <dbl>, Q6 <dbl>, H6 <dbl>,
## # W6 <dbl>, Q7 <dbl>, H7 <dbl>, W7 <dbl>, Q8 <dbl>, H8 <dbl>, W8 <dbl>,
## # QM <dbl>, HM <dbl>, WM <dbl>, Kingdom <chr>, Phylum <chr>,
## # Class <chr>, Order <chr>, Family <chr>, Genus <chr>, Species <chr>
Species names have the form Genus_species
and are unique,
## [1] "Methanobrevibacter_ruminantium" "Methanobrevibacter_smithii"
## [3] "Methanobrevibacter_unclassified" "Methanosphaera_stadtmanae"
## [5] "Actinomyces_cardiffensis" "Actinomyces_europaeus"
## [1] 0
The Genus_unclassified
catches cases where more reads were mapped to the genus’s markers than could be accounted for by species within the genus. We will include these for measuring the family-level compositions but not the spike-in compositions From here on, we will identify the Clade / Taxon with this species identifier,
Next, join the metadata and tidy it into a tall format (one sample-taxon observation per row)
st <- st %>%
gather("Sample", "Abundance", sam$Sample) %>%
left_join(sam, by = "Sample") %>%
select(Taxon, Sample, Abundance, colnames(sam), everything())
st
## # A tibble: 19,386 x 12
## Taxon Sample Abundance Protocol Individual Kingdom Phylum Class Order
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Meth… Q1 0 Q 1 Archaea Eurya… Meth… Meth…
## 2 Meth… Q1 0.00036 Q 1 Archaea Eurya… Meth… Meth…
## 3 Meth… Q1 0 Q 1 Archaea Eurya… Meth… Meth…
## 4 Meth… Q1 0 Q 1 Archaea Eurya… Meth… Meth…
## 5 Acti… Q1 0 Q 1 Bacter… Actin… Acti… Acti…
## 6 Acti… Q1 0 Q 1 Bacter… Actin… Acti… Acti…
## 7 Acti… Q1 0.00083 Q 1 Bacter… Actin… Acti… Acti…
## 8 Acti… Q1 0.000150 Q 1 Bacter… Actin… Acti… Acti…
## 9 Acti… Q1 0.00023 Q 1 Bacter… Actin… Acti… Acti…
## 10 Acti… Q1 0.00627 Q 1 Bacter… Actin… Acti… Acti…
## # … with 19,376 more rows, and 3 more variables: Family <chr>,
## # Genus <chr>, Species <chr>
Metaphlan outputs estimates of the proportions * 100. We will renormalize to proportions summing to 1 instead.
st <- st %>%
mutate_by(Sample, Abundance = close_elts(Abundance))
st %>%
group_by(Sample) %>%
summarize(sum(Abundance)) %>%
{summary(.[,2])}
## sum(Abundance)
## Min. :1
## 1st Qu.:1
## Median :1
## Mean :1
## 3rd Qu.:1
## Max. :1
Output from the various kingdoms:
st %>%
group_by(Kingdom, Sample) %>%
summarize(Abundance = sum(Abundance)) %>%
spread(Kingdom, Abundance) %>%
print(n=Inf)
## # A tibble: 27 x 5
## Sample Archaea Bacteria Eukaryota Viruses
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 H1 0 0.931 0.0000845 0.0689
## 2 H2 0.000936 0.942 0.0000650 0.0569
## 3 H3 0.00000120 0.944 0.0000756 0.0564
## 4 H4 0 0.956 0.000103 0.0435
## 5 H5 0.000759 0.933 0.0000763 0.0664
## 6 H6 0.00000100 0.947 0.0000541 0.0526
## 7 H7 0 0.916 0.0000971 0.0842
## 8 H8 0.00000110 0.928 0.0000712 0.0715
## 9 HM 0 0.846 0.0000424 0.154
## 10 Q1 0.00000360 0.932 0.0000544 0.0679
## 11 Q2 0.000992 0.978 0.0000190 0.0207
## 12 Q3 0.0000101 0.987 0.0000785 0.0124
## 13 Q4 0 0.956 0.0000910 0.0434
## 14 Q5 0.00102 0.945 0.0000424 0.0543
## 15 Q6 0 0.964 0.0000539 0.0355
## 16 Q7 0.00000180 0.953 0.0000460 0.0473
## 17 Q8 0 0.952 0.0000775 0.0478
## 18 QM 0.00000260 0.782 0.0000238 0.218
## 19 W1 0 0.880 0.0000436 0.120
## 20 W2 0.00616 0.915 0.0000884 0.0784
## 21 W3 0 0.933 0.0000661 0.0669
## 22 W4 0.00000230 0.927 0.0000828 0.0733
## 23 W5 0.00159 0.903 0.0000666 0.0958
## 24 W6 0.000000900 0.944 0.0000386 0.0561
## 25 W7 0.00000340 0.871 0.0000424 0.129
## 26 W8 0 0.879 0.0000502 0.121
## 27 WM 0 0.857 0.0000441 0.143
There is very little Archaea and Eukaryote abundance, and the vast majority of the viral abundance comes from a Salmonella phage that I suspect is integrated into the genome of the Salmonella enterica strain in the spike-in.
st %>%
filter(Kingdom == "Viruses") %>%
group_by(Taxon) %>%
summarize(Abundance = mean(Abundance)) %>%
arrange(desc(Abundance))
## # A tibble: 5 x 2
## Taxon Abundance
## <chr> <dbl>
## 1 Salmonella_phage_Fels_1 0.0772
## 2 Lactococcus_phage_P680 0.0000241
## 3 C2likevirus_unclassified 0.0000128
## 4 Lactobacillus_phage_PL_1 0.00000478
## 5 Dasheen_mosaic_virus 0.00000311
Bacteria form the vast majority of estimated non-viral abundance in each sample,
st %>%
group_by(Kingdom, Sample) %>%
summarize(Abundance = sum(Abundance)) %>%
filter(Kingdom != "Viruses") %>%
mutate_by(Sample, Abundance = close_elts(Abundance)) %>%
filter(Kingdom == "Bacteria") %>%
summarize_at("Abundance", list(Min = min, Mean = mean))
## # A tibble: 1 x 2
## Min Mean
## <dbl> <dbl>
## 1 0.993 0.999
We will therefore restrict our analysis of sample composition to just Bacteria, renormalizing the abundances to sum to 1 after filtering.
Costea et al. report E. coli as a likely lab contaminant of the spike-in based on its appearance in the profiles of the mock-only samples (which they derived using mOTUs rather than Metaphlan2). So let’s check what is at appreciable frequency in the mock-only samples that shouldn’t be,
tb <- st %>%
filter(
Individual == "M", Abundance > 0,
!(Taxon %in% mock$Taxon)
) %>%
group_by(Family, Taxon) %>%
summarize(
Mean = mean(Abundance),
Max = max(Abundance),
) %>%
filter(Max > 1e-4) %>%
arrange(desc(Mean))
tb
## # A tibble: 9 x 4
## # Groups: Family [4]
## Family Taxon Mean Max
## <chr> <chr> <dbl> <dbl>
## 1 Enterobacteriaceae Shigella_flexneri 0.0871 0.109
## 2 Enterobacteriaceae Salmonella_unclassified 0.0134 0.0183
## 3 Enterobacteriaceae Escherichia_coli 0.00394 0.00520
## 4 Enterobacteriaceae Escherichia_unclassified 0.000764 0.000948
## 5 Xanthomonadaceae Xanthomonas_perforans 0.000126 0.000146
## 6 Moraxellaceae Psychrobacter_unclassified 0.000122 0.000208
## 7 Xanthomonadaceae Xanthomonas_gardneri 0.000107 0.000128
## 8 Enterobacteriaceae Pantoea_unclassified 0.000101 0.000139
## 9 Prevotellaceae Prevotella_copri 0.000100 0.000160
Here we see that most of the contaminant reads appear to be captured by Shigella_flexneri
identifier. Given the closeness of S. flexnari with E. coli, the estimated abundance for Escherichia_coli
may also be coming from the contaminant strain. Since S. flexnari is generally much larger,
st %>%
filter(Taxon %in% c("Shigella_flexneri", "Escherichia_coli")) %>%
group_by(Individual, Taxon) %>%
summarize_at(vars(Abundance), mean) %>%
spread(Taxon, Abundance) %>%
mutate(Escherichia_coli / Shigella_flexneri)
## # A tibble: 9 x 4
## # Groups: Individual [9]
## Individual Escherichia_coli Shigella_flexneri `Escherichia_coli/Shigella…
## <chr> <dbl> <dbl> <dbl>
## 1 1 0.00183 0.0363 0.0504
## 2 2 0.00220 0.0220 0.0998
## 3 3 0.00100 0.0206 0.0487
## 4 4 0.000954 0.0221 0.0432
## 5 5 0.00153 0.0309 0.0497
## 6 6 0.000894 0.0178 0.0503
## 7 7 0.00203 0.0369 0.0549
## 8 8 0.00181 0.0335 0.0540
## 9 M 0.00394 0.0871 0.0452
We will treat Shigella_flexneri
as the lab contaminant in our analysis,
Our analysis of the spike-in abundances below show that the ratio of Shigella_flexneri
to other spike-in taxa is constant across samples, indicating that Shigella_flexneri
reads in the fecal samples are primarily from the contaminant and not native gut strains.
Above, we also see appreciable frequency of Salmonella_unclassified
, which is due to the genus-level estimated proportion for Salmonella being larger than the species-level estimates. Because the estimated proportions of Escherichia_coli
, Escherichia_unclassified
, and Salmonella_unclassified
are all fairly small, they will not substantially affect the family-level composition plot (Figure 4A). and so we will leave it in as a “native” taxon for this analysis. However, because these abundances might be due to the spike-in rather than native gut taxa, we will filter them out before proceeding.
Classify the taxa by source type:
st0 <- st0 %>%
mutate(Source = case_when(
Taxon %in% mock$Taxon ~ "Mock",
Taxon == contaminant ~ "Contaminant",
TRUE ~ "Native"
)
)
View the proportions of native, mock, and contaminant for each sample:
tb.source <- st0 %>%
group_by(Sample, Protocol, Individual, Source) %>%
summarize(Abundance = sum(Abundance)) %>%
ungroup()
tb.source %>%
spread(Source, Abundance) %>%
print(n=Inf)
## # A tibble: 27 x 6
## Sample Protocol Individual Contaminant Mock Native
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 H1 H 1 0.0280 0.290 0.682
## 2 H2 H 2 0.0259 0.263 0.711
## 3 H3 H 3 0.0265 0.279 0.695
## 4 H4 H 4 0.0178 0.179 0.803
## 5 H5 H 5 0.0268 0.321 0.653
## 6 H6 H 6 0.0205 0.228 0.752
## 7 H7 H 7 0.0330 0.372 0.595
## 8 H8 H 8 0.0309 0.353 0.616
## 9 HM H M 0.0818 0.917 0.00170
## 10 Q1 Q 1 0.0247 0.196 0.780
## 11 Q2 Q 2 0.00485 0.0317 0.963
## 12 Q3 Q 3 0.00498 0.0276 0.967
## 13 Q4 Q 4 0.0166 0.132 0.851
## 14 Q5 Q 5 0.0197 0.150 0.830
## 15 Q6 Q 6 0.0117 0.0834 0.905
## 16 Q7 Q 7 0.0183 0.157 0.825
## 17 Q8 Q 8 0.0151 0.111 0.874
## 18 QM Q M 0.112 0.886 0.00211
## 19 W1 W 1 0.0574 0.830 0.112
## 20 W2 W 2 0.0358 0.442 0.522
## 21 W3 W 3 0.0307 0.417 0.552
## 22 W4 W 4 0.0321 0.443 0.525
## 23 W5 W 5 0.0468 0.598 0.356
## 24 W6 W 6 0.0213 0.358 0.621
## 25 W7 W 7 0.0605 0.769 0.170
## 26 W8 W 8 0.0555 0.734 0.211
## 27 WM W M 0.0726 0.926 0.00122
The contaminant is always a fairly small fraction of the total sample. It also generally appears in a consistent ratio with the mock for a given protocol:
tb.source %>%
spread(Source, Abundance) %>%
ggplot(aes(Sample, Contaminant / Mock, color = Protocol,
label = Individual)) +
scale_color_manual(values = colors.protocol) +
geom_text() +
coord_flip()
In samples Q2 and Q3, the spike-in makes up an unusually low fraction of the total (see table above), so the larger ratio in these samples may be due to chance. These observations suggest that the contaminant is being correctly identified.
Composition by source type:
p.source <- ggplot(tb.source,
aes(y = Abundance,
x = Individual,
# x = factor(Individual, lvls.mock_prop),
fill = Source)) +
geom_bar(stat = "identity") +
scale_y_continuous(breaks = c(0, 0.5, 1),
labels = c("0", "0.5", "1")) +
geom_rangeframe(data = tibble(y = c(0, 1), Protocol = "H"),
aes(y = y), sides = "l", inherit.aes = FALSE) +
scale_fill_brewer(type = "qual") +
facet_grid(. ~ Protocol, labeller = labeller.protocol) +
labs(x = "Specimen", fill = "Taxon type",
title = "Composition of different bacterial types") +
base_theme +
theme(legend.position = "right")
p.source
Get the native taxa agglomerated to the Family level,
nat <- st0 %>%
filter(Source == "Native") %>%
group_by_at(vars(Sample, Protocol, Individual, Kingdom:Family)) %>%
summarize(Abundance = sum(Abundance)) %>%
ungroup
Let’s restrict to Families within a minimal abundance to simplify the figure. First, lets check the prevalence of various families,
prev <- nat %>%
filter(Individual != "M") %>%
mutate_by(Sample, Proportion = close_elts(Abundance)) %>%
group_by(Family) %>%
summarize(Prev = sum(Proportion > 0),
Min = min(Proportion),
Median = median(Proportion),
Mean = mean(Proportion),
Max = max(Proportion))
prev %>%
arrange(desc(Max)) %>%
print(n=30)
## # A tibble: 98 x 6
## Family Prev Min Median Mean Max
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Bacteroidaceae 24 0.0412 0.227 0.228 0.496
## 2 Ruminococcaceae 24 0.0755 0.221 0.229 0.458
## 3 Prevotellaceae 24 0.0000859 0.000586 0.0521 0.456
## 4 Lachnospiraceae 24 0.0492 0.157 0.166 0.311
## 5 Bifidobacteriaceae 24 0.000279 0.0196 0.0443 0.242
## 6 Eubacteriaceae 24 0.0151 0.0938 0.0890 0.162
## 7 Rikenellaceae 24 0.0177 0.0439 0.0579 0.162
## 8 Coriobacteriaceae 24 0.00182 0.0113 0.0185 0.103
## 9 Porphyromonadaceae 24 0.0109 0.0258 0.0324 0.0773
## 10 Erysipelotrichaceae 24 0.000854 0.00209 0.00703 0.0732
## 11 Acidaminococcaceae 24 0.00000244 0.0000307 0.00796 0.0724
## 12 Veillonellaceae 24 0.0000890 0.0116 0.0204 0.0688
## 13 Verrucomicrobiaceae 24 0.000000613 0.000164 0.0110 0.0622
## 14 Clostridiaceae 24 0.00153 0.00526 0.00687 0.0303
## 15 Oscillospiraceae 24 0.00161 0.00455 0.00547 0.0148
## 16 Sutterellaceae 24 0.0000214 0.00337 0.00453 0.0146
## 17 Bacteroidales_noname 24 0.000131 0.00307 0.00409 0.0145
## 18 Streptococcaceae 24 0.000260 0.00205 0.00312 0.00865
## 19 Desulfovibrionaceae 24 0.000855 0.00417 0.00441 0.00752
## 20 Burkholderiales_noname 24 0.00000230 0.0000564 0.000989 0.00644
## 21 Lactobacillaceae 24 0.00000618 0.000142 0.000564 0.00421
## 22 Clostridiales_noname 24 0.000244 0.00147 0.00158 0.00411
## 23 Peptostreptococcaceae 24 0.00000691 0.000172 0.000450 0.00350
## 24 Xanthomonadaceae 24 0.000519 0.00139 0.00144 0.00290
## 25 Pasteurellaceae 24 0.00000126 0.000234 0.000387 0.00262
## 26 Enterobacteriaceae 24 0.0000173 0.000157 0.000321 0.00179
## 27 Oxalobacteraceae 21 0 0.00000565 0.000112 0.00143
## 28 Campylobacteraceae 14 0 0.00000186 0.000110 0.00141
## 29 Enterococcaceae 24 0.00000296 0.000219 0.000335 0.00105
## 30 Moraxellaceae 18 0 0.0000731 0.000145 0.000931
## # … with 68 more rows
Let’s use the families with a proportion of at least 2%.
families <- prev %>%
filter(Max > 0.02) %>%
{.$Family}
nat0 <- nat %>%
filter(Family %in% families) %>%
arrange(Phylum, Family)
Create a color scale for Family that distinguishes Phylum by hue.
# Color tibble
ctb <- nat0 %>%
select(Phylum, Family) %>%
distinct %>%
arrange(Phylum, Family) %>%
group_by(Phylum) %>%
nest(Family) %>%
rowwise() %>%
mutate(Families = list(unlist(data)),
Num = length(Families))
ctb <- ctb %>%
arrange(desc(Num)) %>%
add_column(Palette = c("Blues", "Greens", "Reds", "Oranges"))
ctb <- ctb %>%
rowwise() %>%
mutate(Colors = list(RColorBrewer::brewer.pal(Num, name = Palette)))
## Warning in RColorBrewer::brewer.pal(Num, name = Palette): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(Num, name = Palette): minimal value for n is 3, returning requested palette with 3 different levels
last_n <- function(x, n) {
len <- length(x)
x[seq(len - n + 1, len)]
}
ctb <- ctb %>%
rowwise() %>%
mutate(Colors = list(last_n(Colors, Num)))
ctb <- ctb %>%
select(Phylum, Families, Colors) %>%
unnest() %>%
arrange(Phylum, Families)
colors.families <- ctb$Colors
names(colors.families) <- ctb$Families
# RColorBrewer::display.brewer.all()
# RColorBrewer::brewer.pal(5, name = "Blues")
We also want the mock and contaminant fractions in the final figure
comp <- bind_rows(
nat0,
tb.source %>% filter(Source != "Native") %>% rename(Family = Source)
)
comp %>%
filter(Sample == "W1") %>%
select(Family, Abundance) %>%
arrange(desc(Abundance))
## # A tibble: 16 x 2
## Family Abundance
## <chr> <dbl>
## 1 Mock 0.830
## 2 Contaminant 0.0574
## 3 Lachnospiraceae 0.0244
## 4 Ruminococcaceae 0.0218
## 5 Bacteroidaceae 0.0198
## 6 Rikenellaceae 0.0130
## 7 Bifidobacteriaceae 0.00714
## 8 Eubacteriaceae 0.00617
## 9 Porphyromonadaceae 0.00575
## 10 Verrucomicrobiaceae 0.00448
## 11 Clostridiaceae 0.00340
## 12 Coriobacteriaceae 0.00197
## 13 Erysipelotrichaceae 0.000363
## 14 Prevotellaceae 0.0000887
## 15 Veillonellaceae 0.0000517
## 16 Acidaminococcaceae 0.000000690
## [1] 0.9624378
colors.source <- c("#A9A9A9", "#D3D3D3")
names(colors.source) <- c("Contaminant", "Mock")
colors.comp <- c(colors.source, colors.families)
p.comp <- ggplot(comp,
aes(x = factor(Individual, as.character(c(1:8, "M"))),
y = Abundance,
fill = factor(Family, names(colors.comp)))) +
geom_bar(stat = "identity") +
scale_y_continuous(limits=c(0,1), breaks = c(0, 0.5, 1),
labels = c("0", "0.5", "1")) +
geom_rangeframe(data = tibble(y = c(0, 1), Protocol = "H"),
aes(y = y), sides = "l", inherit.aes = FALSE) +
scale_fill_manual(values = colors.comp) +
facet_grid(. ~ Protocol, labeller = labeller.protocol) +
labs(x = "Specimen", y = "Proportion",
title = paste0("Composition of all bacteria")
) +
base_theme +
theme(legend.position = "right", legend.key.size = unit(0.11, "in"),
legend.margin = margin(5,5,5,-5)) +
# theme(legend.position = c(1, 0.5), legend.justification = c(1, 0.5),
# legend.key.size = unit(0.11, "in"),
# ) +
labs(fill = "Source or\nFamily (Native)")
p.comp
Next, we want to visualize relative abundances vs. actual composition (bias) of the mock spike-in (including the contaminant). Let’s first confirm that all taxa were detected in all samples:
## [1] TRUE
Now let’s restrict to just the spike-in taxa. Since the 10 mock species appear in all samples with the same known relative abundances, it is convenient to normalize the abundances of all taxa to the geometric mean of the 10 mock species (called Denom
below).
sts <- st0 %>%
filter(Taxon %in% c(mock$Taxon, contaminant)) %>%
rename(Observed = Abundance) %>%
mutate(Taxon = ifelse(Taxon == contaminant, "Contaminant", Taxon))
denom <- sts %>%
filter(Taxon %in% mock$Taxon) %>%
group_by(Sample) %>%
summarize(Denom = gm_mean(Observed))
sts <- sts %>%
left_join(denom, by = c("Sample")) %>%
mutate(Observed = Observed / Denom)
Next, add the actual relative abundances as determined by the FACS measurement.
mock0 <- mock %>%
rename(Actual = FACS) %>%
mutate(Actual = center_elts(Actual))
sts <- sts %>%
left_join(mock0, by = "Taxon") %>%
mutate(Error = Observed / Actual)
Also grab a version with just the mock taxa
Plot the observed and actual relative abundances.
p.step <- ggplot(sts,
aes(x = factor(Taxon, c(mock$Taxon, "Contaminant")), y = Observed,
color = Protocol, shape = (Individual == "M"))) +
geom_quasirandom() +
geom_rangeframe(color = "black", sides = "l") +
scale_y_log10(labels = log_formatter) +
scale_x_discrete(labels = tax_labeller) +
scale_shape_manual(values = c(16, 1), labels = c("Fecal", "Mock only")) +
scale_color_manual(values = colors.protocol) +
coord_cartesian(clip = "off") +
labs(title = "Composition of the spike-in",
# y = "Abundance / gm. mean of non-contaminants",
y = "Abundance / gm. mean",
shape = "Sample type") +
base_theme +
tax_theme +
theme(legend.position = "right", legend.margin = margin(5,0,5,-5))
# Add the actual abundances (as a stair plot like in Costea2017's Figure 6)
# underneath the data points
tb.step <- mock0 %>%
mutate(x1 = rank(desc(Actual)) - 0.5, x2 = x1 + 0.5,
x3 = ifelse(x2 == 10, 10.5, NA)) %>%
gather("loc", "x", x1, x2, x3) %>%
filter(!is.na(x)) %>%
arrange(x)
p.step$layers <- c(
geom_step(data = tb.step, aes(x = x, y = Actual, linetype = "Actual"),
size = 0.3, color = "black", inherit.aes = FALSE),
p.step$layers)
# p.step <- p.step +
# annotate("text", x = 10.6, y = 0.021, label = "Actual",
# color = "black", hjust = 0)
p.step <- p.step +
guides(
color = guide_legend(order = 1),
shape = guide_legend(order = 2),
linetype = guide_legend(order = 3))
p.step
Estimate the bias within each protocol as the center (or compositional mean) of the observed composition errors. Because all samples have all taxa, we can do this very straightforwardly by taking geometric means:
bias <- stm %>%
group_by(Protocol, Taxon) %>%
summarize(Bias = gm_mean(Error)) %>%
mutate(Bias = center_elts(Bias)) %>%
ungroup
bias %>%
spread(Protocol, Bias)
## # A tibble: 10 x 4
## Taxon H Q W
## <chr> <dbl> <dbl> <dbl>
## 1 Blautia_hansenii 0.537 0.745 1.80
## 2 Clostridium_difficile 0.110 0.0891 0.223
## 3 Clostridium_perfringens 1.18 0.489 1.14
## 4 Clostridium_saccharolyticum 1.21 1.44 0.589
## 5 Fusobacterium_nucleatum 46.3 4.98 16.5
## 6 Lactobacillus_plantarum 0.0168 0.773 0.355
## 7 Prevotella_melaninogenica 2.81 1.55 1.37
## 8 Salmonella_enterica 1.77 2.29 0.788
## 9 Vibrio_cholerae 2.10 1.16 0.888
## 10 Yersinia_pseudotuberculosis 1.46 1.35 0.660
The call to center_elts
normalizes the efficiencies within each protocol to have a geometric mean of 1,
## # A tibble: 3 x 2
## Protocol `gm_mean(Bias)`
## <chr> <dbl>
## 1 H 1
## 2 Q 1
## 3 W 1
The elements of the Bias
column can be interpretted as the detection efficiency relative to the average taxon for that protocol.
To confirm that the estimation worked, let’s calibrate the relative abundances from the previous figure:
p.step.cal <- p.step
p.step.cal$data <- p.step.cal$data %>%
left_join(bias, by = c("Protocol", "Taxon")) %>%
mutate(Observed = Observed / Bias)
p.step.cal
## Warning: Removed 27 rows containing missing values (position_quasirandom).
Join the bias w/ the abundance table and compute compositional residuals.
stm <- stm %>%
left_join(bias, by = c("Protocol", "Taxon")) %>%
mutate(
Residual = Error / Bias,
Predicted = Actual * Bias
)
# Note, `Residual` is already geometrically centered.
Let’s also check that the bias predictions predict the taxon ratios:
gvs <- c("Protocol", "Individual", "Sample")
ratios <- stm %>%
mutate(Taxon = tax_abbrev(Taxon)) %>%
compute_ratios(group_vars = gvs) %>%
mutate(Pair = paste(Taxon.x, Taxon.y, sep = ":"))
ratios.pred <- ratios %>%
select(Taxon.x, Taxon.y, Pair, Protocol, Actual, Bias, Predicted) %>%
distinct
ggplot(ratios, aes(Pair, Error,
label = Individual, color = Protocol)) +
geom_point(data = ratios.pred,
aes(Pair, Bias), inherit.aes = FALSE,
shape = 3, size = 4, color = "black") +
geom_text(position = position_jitter(width=0.25, height=0)) +
geom_hline(yintercept = 1) +
facet_grid(Protocol ~ ., labeller = labeller.protocol) +
geom_rangeframe(sides = "l", color= "black") +
scale_color_manual(values = colors.protocol) +
scale_y_log10() +
scale_x_discrete(labels = tax_labeller) +
base_theme +
tax_theme
Distribution of efficiencies for the three protocols:
ggplot(bias, aes(Bias, fill = Protocol)) +
geom_dotplot() +
scale_x_log10() +
facet_grid(Protocol ~ ., labeller = labeller.protocol) +
scale_fill_manual(values = colors.protocol) +
scale_y_continuous(breaks = NULL) +
geom_rangeframe() +
base_theme
Let’s also estimate the differential bias (including the contaminant):
dbias <- sts %>%
select(Taxon, Observed, Protocol, Individual) %>%
spread(Protocol, Observed) %>%
group_by(Taxon) %>%
summarize(
HQ = gm_mean(H / Q),
HW = gm_mean(H / W),
QW = gm_mean(Q / W))
dbias <- dbias %>%
gather("Protocol", "Bias", -Taxon)
dbias %>%
spread(Protocol, Bias)
## # A tibble: 11 x 4
## Taxon HQ HW QW
## <chr> <dbl> <dbl> <dbl>
## 1 Blautia_hansenii 0.721 0.298 0.413
## 2 Clostridium_difficile 1.24 0.494 0.400
## 3 Clostridium_perfringens 2.41 1.04 0.431
## 4 Clostridium_saccharolyticum 0.837 2.05 2.45
## 5 Contaminant 0.894 2.13 2.38
## 6 Fusobacterium_nucleatum 9.30 2.80 0.301
## 7 Lactobacillus_plantarum 0.0218 0.0475 2.18
## 8 Prevotella_melaninogenica 1.82 2.05 1.12
## 9 Salmonella_enterica 0.773 2.25 2.90
## 10 Vibrio_cholerae 1.81 2.37 1.31
## 11 Yersinia_pseudotuberculosis 1.08 2.21 2.05
For consistency we keep the denominator as just the 10 mock (non-contaminant) species.
What fraction of the compositional error (measured by squared Aitchison distance) is explained by our model for each protocol?
err <- stm %>%
group_by(Protocol, Sample) %>%
summarize(
Error2 = anorm(Error)^2,
Bias2 = anorm(Bias)^2,
Resid2 = anorm(Error / Bias)^2
) %>%
summarize_at(vars(-Sample), mean)
err <- err %>%
mutate(Frac_Bias = Bias2 / Error2, Frac_Resid = Resid2 / Error2)
err
## # A tibble: 3 x 6
## Protocol Error2 Bias2 Resid2 Frac_Bias Frac_Resid
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 H 39.7 38.8 0.892 0.978 0.0225
## 2 Q 11.3 10.2 1.10 0.903 0.0969
## 3 W 12.2 12.2 0.0492 0.996 0.00402
Residuals, with the mock-only samples marked:
stm %>%
mutate(Taxon = factor(Taxon, mock$Taxon)) %>%
ggplot(aes(x = Taxon, y = Residual, shape = Individual == "M",
color = Protocol)) +
geom_quasirandom() +
geom_hline(yintercept = 1) +
geom_rangeframe(sides = "l", color = "black") +
scale_y_log10() +
scale_shape_manual(values = c(16, 1), labels = c("Fecal", "Mock only")) +
scale_color_manual(values = colors.protocol) +
facet_grid(Protocol ~ ., scales = "fixed") +
base_theme +
tax_theme
Note that the residuals are not independent across taxa; the high Residuals for L. plantarum and B. hansenii for protocol H are pushing down the others.
Get the bias estimates w/ estimated geometric standard errors -
reps <- stm %>%
group_by(Protocol) %>%
nest %>%
mutate(
Error_matrix = map(data, build_matrix, "Sample", "Taxon", "Error"),
Bootreps = map(Error_matrix, bootrep_center, R = 4e3,
method = "gm")) %>%
unnest(Bootreps) %>%
rename(Bias = Center)
stats <- reps %>%
group_by(Protocol, Taxon) %>%
summarize(gm_mean = gm_mean(Bias), gm_se = gm_sd(Bias)) %>%
ungroup
bias <- left_join(bias, stats, by = c("Protocol", "Taxon"))
As expected, the geometric means of the resamplings approximate the point estimate.
## [1] "Mean relative difference: 0.0009001726"
Create the SI figure showing the bias estimate with two standard errors:
r <- range(stm$Observed / stm$Actual)
p1 <- ggplot(bias, aes(factor(Taxon, mock$Taxon), Bias, color = Protocol)) +
geom_hline(yintercept = 1) +
geom_quasirandom(data = stm, aes(y = Observed / Actual), alpha = 0.4) +
geom_pointrange(aes(ymin = Bias / gm_se^2, ymax = Bias * gm_se^2),
fatten = 1.5) +
scale_y_log10() +
scale_color_manual(values = colors.protocol) +
facet_wrap(~ Protocol, labeller = labeller.protocol) +
geom_rangeframe(sides = "l", color = "black") +
scale_y_log10(limits = r, labels = log_formatter) +
scale_x_discrete(labels = tax_labeller) +
labs(y = "Efficiency / gm. mean") +
base_theme +
tax_theme +
theme(panel.spacing = unit(1, "lines"))
p1
Do this for differential bias as well. Note setting the denominator to just the mock taxa.
dreps <- sts %>%
select(Taxon, Observed, Protocol, Individual) %>%
spread(Protocol, Observed) %>%
transmute(
Taxon, Individual,
HQ = H / Q,
HW = H / W,
QW = Q / W
) %>%
gather("Protocol", "Error", -Taxon, -Individual) %>%
group_by(Protocol) %>%
nest %>%
mutate(
Error_matrix = map(data, build_matrix, "Individual", "Taxon", "Error"),
Bootreps = map(Error_matrix, bootrep_center, R = 4e3, method = "gm",
denom = mock$Taxon)) %>%
unnest(Bootreps) %>%
rename(Bias = Center)
dstats <- dreps %>%
group_by(Protocol, Taxon) %>%
summarize(gm_mean = gm_mean(Bias), gm_se = gm_sd(Bias)) %>%
ungroup
dbias <- left_join(dbias, dstats, by = c("Protocol", "Taxon"))
all.equal(dbias$Bias, dbias$gm_mean)
## [1] "Mean relative difference: 0.001009332"
p2 <- ggplot(dbias, aes(factor(Taxon, c(mock$Taxon, "Contaminant")), Bias)) +
geom_hline(yintercept = 1) +
# geom_quasirandom(data = sts, aes(y = Observed / Actual), alpha = 0.5) +
geom_pointrange(aes(ymin = Bias / gm_se^2, ymax = Bias * gm_se^2),
fatten = 1.5) +
scale_y_log10() +
# scale_color_manual(values = colors.protocol) +
facet_wrap(~ Protocol, labeller = labeller.protocol.ref) +
geom_rangeframe(sides = "l", color = "black") +
scale_y_log10(limits = r, labels = log_formatter) +
scale_x_discrete(labels = tax_labeller) +
labs(y = "Efficiency / gm. mean") +
base_theme +
tax_theme +
theme(panel.spacing = unit(1, "lines"))
p2
Here we wish to see how the noise in the bias estimate decreases with the number of control samples used. To do so, we will perform multinomial resampling of N control samples, for each N between 1 and 9 (the total number of samples). For a given N, the standard deviation of the bias estimate across resamples approximates the standard error in the bias estimate when N control samples are measured. We’ll use Protocol H, which has an intermediate level of noise.
First, estimate the standard errors for each N:
mat <- stm %>%
filter(Protocol == "H") %>%
build_matrix(Sample, Taxon, Error)
N <- 1:9
names(N) <- N
reps.H <- map_dfr(N, ~bootrep_center(mat, R = 4e3, dist = "multinomial",
N = ., method = "gm"),
.id = "N") %>%
rename(Bias = Center)
reps.H.summary <- reps.H %>%
group_by(N, Taxon) %>%
summarize(gm_mean = gm_mean(Bias), gm_se = gm_sd(Bias)) %>%
mutate(Protocol = "H")
Then view how the dispersion decreases with N:
lvls.color <- reps.H.summary %>%
filter(N == 3) %>%
arrange(-gm_se) %>%
.$Taxon
values.color <- scales::seq_gradient_pal("black", "grey")(seq(0, 1, length.out = 10))
names(values.color) <- lvls.color
r <- range(reps.H.summary$gm_se)
names(r) <- r %>% round(2)
p.se <- ggplot(reps.H.summary, aes(N, gm_se,
color = fct_reorder(Taxon, -gm_se))) +
geom_line(aes(group = Taxon)) +
geom_point() +
scale_y_continuous(limits = range(c(1, r)), ) +
base_theme +
scale_color_manual(values = values.color, labels = tax_labeller) +
geom_rangeframe(color = "black") +
theme(legend.position = "right", legend.margin = margin(5,10,5,-5)) +
labs(x = "Number of controls", y = "Geometric standard error",
title = "Standard error vs. number of control samples",
color = "Taxon")
p.se
The relative abundance plot provides useful context for understanding why the standard errors are much higher for two taxa:
pd <- filter(stm, Protocol == "H")
pd0 <- pd %>%
filter(Individual %in% c("5", "M"))
p.step.H <- p.step %+%
pd %+%
aes(x = factor(Taxon, mock$Taxon), y = Observed,
color = factor(Taxon, lvls.color), shape = NULL) +
ggrepel::geom_text_repel(data = pd0, aes(label = Individual),
direction = "x", size = 2, color = "black") +
scale_color_manual(values = values.color, labels = tax_labeller) +
labs(title = "Observed compositions",
y = "Abundance / gm. mean",
shape = "Sample type") +
theme(legend.position = "none")
# p.step.H
Compute pairwise error summary statistics:
avg_errors <- ratios %>%
# Take the geometric absolute values of the errors
mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
# Average the errors for each pair of different taxa
filter(Taxon.x != Taxon.y) %>%
group_by(Protocol, Taxon.x, Taxon.y) %>%
summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
# Average over pairs of taxa for each protocol
group_by(Protocol) %>%
summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
rename_at(vars(Error, Bias, Residual), paste0, ".avg")
max_errors <- ratios %>%
# Take the geometric absolute values of the errors
mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
# Find the max fold error in a ratio for each protocol
group_by(Protocol) %>%
summarize_at(vars(Error, Bias, Residual), gm_range) %>%
rename_at(vars(Error, Bias, Residual), paste0, ".max")
pw_summary <- bind_cols(avg_errors, max_errors) %>%
select(Protocol, Bias.max, Bias.avg, Residual.avg)
pw_summary
## # A tibble: 3 x 4
## Protocol Bias.max Bias.avg Residual.avg
## <chr> <dbl> <dbl> <dbl>
## 1 H 2751. 9.69 1.31
## 2 Q 55.8 3.22 1.46
## 3 W 74.0 3.48 1.08
Get ratios for the ratio-based statistics as in ratios
, but for the differential error between protocols.
tb <- stm %>%
select(Taxon, Observed, Protocol, Individual) %>%
spread(Protocol, Observed) %>%
transmute(
Taxon, Individual,
HQ = H / Q,
HW = H / W,
QW = Q / W
) %>%
gather("Protocol", "Error", -Taxon, -Individual) %>%
left_join(dbias, by = c("Protocol", "Taxon")) %>%
select(-gm_se) %>%
mutate(Residual = Error / Bias)
gvs <- c("Protocol", "Individual")
d_ratios <- tb %>%
compute_ratios(group_vars = gvs) %>%
mutate(Pair = paste(Taxon.x, Taxon.y, sep = ":"))
Compute pairwise error summary statistics:
d_avg_errors <- d_ratios %>%
# Take the geometric absolute values of the errors
mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
# Average the errors for each pair of different taxa
filter(Taxon.x != Taxon.y) %>%
group_by(Protocol, Taxon.x, Taxon.y) %>%
summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
# Average over pairs of taxa for each protocol
group_by(Protocol) %>%
summarize_at(vars(Error, Bias, Residual), gm_mean) %>%
rename_at(vars(Error, Bias, Residual), paste0, ".avg")
d_max_errors <- d_ratios %>%
# Take the geometric absolute values of the errors
mutate_at(vars(Error, Bias, Residual), gm_abs) %>%
# Find the max fold error in a ratio for each protocol
group_by(Protocol) %>%
summarize_at(vars(Error, Bias, Residual), gm_range) %>%
rename_at(vars(Error, Bias, Residual), paste0, ".max")
d_pw_summary <- bind_cols(d_avg_errors, d_max_errors) %>%
select(Protocol, Bias.max, Bias.avg, Residual.avg)
d_pw_summary
## # A tibble: 3 x 4
## Protocol Bias.max Bias.avg Residual.avg
## <chr> <dbl> <dbl> <dbl>
## 1 HQ 428. 4.71 1.49
## 2 HW 59.1 3.88 1.34
## 3 QW 9.64 2.79 1.49
# Function to create string for formatting numbers with glue::glue()
fmt_string <- function(var, digits) {
paste0("{format(", var, ", digits = ", digits, ")}")
}
bias_tab <- bind_rows(bias, dbias) %>%
select(-gm_se) %>%
mutate(Bias = glue::glue(fmt_string("Bias", 1))) %>%
mutate_all(as.character) %>%
spread(Protocol, Bias) %>%
mutate_all(~ifelse(is.na(.), "---", .))
pw_summary_tab <- bind_rows(pw_summary, d_pw_summary) %>%
mutate(
Bias.max = glue::glue(fmt_string("Bias.max", 1)),
Bias.avg = glue::glue(fmt_string("Bias.avg", 2)),
Residual.avg = glue::glue(fmt_string("Residual.avg", 2))
) %>%
mutate_all(as.character) %>%
gather("Taxon", "Value", -Protocol) %>%
spread(Protocol, Value)
lvls.taxon <- c(mock$Taxon, "Contaminant", "Bias.max", "Bias.avg",
"Residual.avg")
lbls.taxon <- c(mock$Taxon, "Contaminant", "Max pairwise bias",
"Avg. pairwise bias", "Avg. pairwise noise")
bias_tab0 <- bind_rows(bias_tab, pw_summary_tab) %>%
select(Taxon, "H", "Q", "W", "HQ", "HW", "QW") %>%
mutate(
Taxon = factor(Taxon, lvls.taxon, lbls.taxon),
) %>%
arrange(Taxon)
bias_tab0
## # A tibble: 14 x 7
## Taxon H Q W HQ HW QW
## <fct> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Prevotella_melaninogenica " 2.81" " 1.55" " 1.37" " 1.82" " 2.0… " 1.1…
## 2 Clostridium_perfringens " 1.18" " 0.49" " 1.14" " 2.41" " 1.0… " 0.4…
## 3 Salmonella_enterica " 1.77" " 2.29" " 0.79" " 0.77" " 2.2… " 2.9…
## 4 Clostridium_difficile " 0.11" " 0.09" " 0.22" " 1.24" " 0.4… " 0.4…
## 5 Lactobacillus_plantarum " 0.02" " 0.77" " 0.35" " 0.02" " 0.0… " 2.1…
## 6 Vibrio_cholerae " 2.10" " 1.16" " 0.89" " 1.81" " 2.3… " 1.3…
## 7 Clostridium_saccharolytic… " 1.21" " 1.44" " 0.59" " 0.84" " 2.0… " 2.4…
## 8 Yersinia_pseudotuberculos… " 1.46" " 1.35" " 0.66" " 1.08" " 2.2… " 2.0…
## 9 Blautia_hansenii " 0.54" " 0.74" " 1.80" " 0.72" " 0.3… " 0.4…
## 10 Fusobacterium_nucleatum 46.29 " 4.98" 16.51 " 9.30" " 2.8… " 0.3…
## 11 Contaminant --- --- --- " 0.89" " 2.1… " 2.3…
## 12 Max pairwise bias 2751 " 56" " 74" " 428" " 59" " 10"
## 13 Avg. pairwise bias 9.7 3.2 3.5 4.7 3.9 2.8
## 14 Avg. pairwise noise 1.3 1.5 1.1 1.5 1.3 1.5
Latex version for the manuscript:
tex <- bias_tab0 %>%
rename_at(vars(HQ, HW, QW), ~ {str_sub(., 2, 1) <- "/"; .}) %>%
mutate(
Taxon = kableExtra::cell_spec(
str_replace(Taxon, "_", " "),
"latex",
italic = Taxon %in% mock$Taxon)
) %>%
knitr::kable(format="latex", booktabs = TRUE, linesep = "",
escape = FALSE, align = c("l", rep("r", 6))) %>%
kableExtra::add_header_above(c(" ", "Protocol" = 3,
"Protocol/Reference" = 3)) %>%
kableExtra::row_spec(length(mock$Taxon) + 1, extra_latex_after = "\\midrule")
# tex
# clipr::write_clip(tex)
Choose Individuals (specimens) to use for estimating bias.
set.seed(20190201)
individuals <- as.character(1:8)
estimation_set <- base::sample(individuals, 3, replace = FALSE)
stm0 <- stm %>%
select(-Bias, -Residual) %>%
mutate(Set = ifelse(Individual %in% estimation_set, "Est", "Eval"))
sam <- sam %>%
mutate(Set = ifelse(Individual %in% estimation_set, "Est", "Eval"))
Estimate bias from the estimation samples:
bias_est <- stm0 %>%
filter(Set == "Est") %>%
group_by(Protocol, Taxon) %>%
summarize(Bias = gm_mean(Error)) %>%
mutate(Bias = center_elts(Bias)) %>%
ungroup
Compare the bias estimated from the estimation set and that from all specimens:
bind_rows(list(All = bias, Est = bias_est), .id = "Specimens") %>%
ggplot(aes(Taxon, Bias, color = Protocol, shape = Specimens)) +
geom_quasirandom() +
scale_y_log10() +
scale_shape_manual(breaks = c("All", "Est"), values = c(16, 1)) +
scale_color_manual(values = colors.protocol) +
coord_flip()
Bias is consistent with the estimate using all specimens, though this would vary for the higher-variance protocols H and Q for some estimation sets.
Also estimate the differential bias to protocol W. First, add a column with the “Reference” measurement; then repeat bias estimation with the reference measurement instead of the Actual abundance.
stmW <- stm0 %>%
select(Taxon, Protocol, Individual, Set, Observed) %>%
spread(Protocol, Observed) %>%
mutate(Actual = W) %>%
gather("Protocol", "Observed", H, Q, W) %>%
mutate(Error = Observed / Actual)
biasW_est <- stmW %>%
filter(Set == "Est") %>%
group_by(Protocol, Taxon) %>%
summarize(BiasW = gm_mean(Error)) %>%
mutate(BiasW = center_elts(BiasW))
biasW_est %>%
spread(Protocol, BiasW)
## # A tibble: 10 x 4
## Taxon H Q W
## <chr> <dbl> <dbl> <dbl>
## 1 Blautia_hansenii 0.310 0.483 1
## 2 Clostridium_difficile 0.408 0.326 1
## 3 Clostridium_perfringens 0.956 0.385 1
## 4 Clostridium_saccharolyticum 2.09 2.54 1
## 5 Fusobacterium_nucleatum 2.79 0.267 1
## 6 Lactobacillus_plantarum 0.0549 2.88 1
## 7 Prevotella_melaninogenica 2.10 0.925 1
## 8 Salmonella_enterica 2.30 3.20 1
## 9 Vibrio_cholerae 2.37 1.38 1
## 10 Yersinia_pseudotuberculosis 2.26 2.07 1
Calibrate all samples
stm0 <- stm0 %>%
left_join(bias_est, by = c("Protocol", "Taxon")) %>%
left_join(biasW_est, by = c("Protocol", "Taxon")) %>%
mutate_by(Sample,
Calibrated_to_Actual = close_elts(Observed / Bias),
Calibrated_to_W = close_elts(Observed / BiasW)
)
Prep for the PCA: we first need to Clr transform each relative abundance vector.
stm1 <- stm0 %>%
gather("Type", "Abundance",
Observed, Actual, Calibrated_to_Actual, Calibrated_to_W) %>%
select(Taxon:Individual, Set:Abundance) %>%
group_by(Sample, Type) %>%
mutate(Clr = clr(Abundance)) %>%
ungroup
Get a matrix with samples as rows to use for the PCA.
temp <- stm1 %>%
unite("Sample_Type", Sample, Type, sep = ":") %>%
select(Sample_Type, Taxon, Clr) %>%
spread(Taxon, Clr)
mat <- temp %>%
select(-Sample_Type) %>%
as("matrix")
rownames(mat) <- temp$Sample_Type
corner(mat)
## Blautia_hansenii Clostridium_difficile
## H1:Actual -3.104989 1.4185469
## H1:Calibrated_to_Actual -3.815372 1.2580855
## H1:Calibrated_to_W -3.151494 -0.2018179
## H1:Observed -4.323387 -1.0988666
## H2:Actual -3.104989 1.4185469
## Clostridium_perfringens
## H1:Actual 1.966933
## H1:Calibrated_to_Actual 1.831330
## H1:Calibrated_to_W 1.941244
## H1:Observed 1.896456
## H2:Actual 1.966933
## Clostridium_saccharolyticum
## H1:Actual -0.1076099
## H1:Calibrated_to_Actual 0.1544821
## H1:Calibrated_to_W -0.3968607
## H1:Observed 0.3384341
## H2:Actual -0.1076099
## Fusobacterium_nucleatum
## H1:Actual -3.91413933
## H1:Calibrated_to_Actual -3.76419663
## H1:Calibrated_to_W -0.98939104
## H1:Observed 0.03810808
## H2:Actual -3.91413933
Note that each sample (protocol + individual) appears four times, as Observed, Actual, Calibrated to Actual, and Calibrated to W, and currently I’m just doing a simple PCA on all of these, since our goal is mainly to illustrate the effect of calibration rather than make quantitative claims about variation. Next we run the PCA and make a tibble for plotting,
pcx <- prcomp(mat)
tb <- as_tibble(pcx$x[,c(1,2)], rownames='Sample_Type') %>%
separate(Sample_Type, c("Sample", "Type"), sep = ":") %>%
left_join(sam, by = "Sample")
tb <- tb %>%
mutate(PC1 = -PC1, PC2 = PC2)
Nearly 90% of the variance is covered by the first two PCs, with about 5.2x as much variation on PC1 than PC2:
## [1] 0.739 0.143 0.087 0.018 0.008 0.003 0.001 0.001 0.000 0.000
## [1] 5.161086
PCA figure that will form the right half of the main text figure:
facet_labeller <- function (tb) {
tb %>%
mutate_all(str_replace_all, "_", " ") %>%
mutate_all(str_replace, " ", "\n")
}
type.lvls <- c("Observed", "Calibrated_to_Actual", "Calibrated_to_W")
actual <- tb %>%
filter(Type == "Actual", Sample == "H1") %>%
select(PC1, PC2) %>%
expand(Type = factor(type.lvls, type.lvls), PC1, PC2)
tb0 <- tb %>%
filter(Type != "Actual") %>%
mutate(Type = factor(Type, type.lvls))
tb.breaks <-
tribble(
~PC1, ~PC2,
# min(tb0$PC1), min(tb0$PC2),
# actual$PC1[1], actual$PC2[1],
# max(tb0$PC1), max(tb0$PC2),
)
tb.range <-
tribble(
~PC1, ~PC2,
min(tb0$PC1), min(tb0$PC2),
max(tb0$PC1), max(tb0$PC2),
)
p.pca <- ggplot(tb0, aes(PC1, PC2, color = Protocol, shape = Set)) +
geom_point(data = actual, aes(PC1, PC2),
shape = 3, size = 4, color = "black") +
geom_point() +
facet_grid(Type ~ ., labeller = facet_labeller) +
geom_rangeframe(data = tb.range, aes(PC1, PC2),
color = "black", inherit.aes = FALSE) +
scale_x_continuous(breaks = tb.breaks$PC1, labels = NULL) +
scale_y_continuous(breaks = tb.breaks$PC2, labels = NULL) +
scale_shape_manual(breaks = c("Est", "Eval"), values = c(1, 16)) +
scale_color_manual(values = colors.protocol) +
base_theme +
theme(strip.text.y = element_text(angle=0, size = 9)) +
# theme(axis.line = element_line()) +
labs(title = "Sample ordination", x = strvar[1], y = strvar[2])
# Label the protocols in the "Observed" plot
labtb <- tribble(
~x, ~y, ~Protocol,
-4, 1.5, "H",
-0.5, 1.3, "Q",
-1.85, -0.4, "W",
2.3, 0.7, "Actual"
) %>%
mutate(Type = factor("Observed", type.lvls))
p.pca <- p.pca +
geom_text(data = labtb,
aes(x, y, label = Protocol, color = Protocol),
size = 3, inherit.aes = FALSE, show.legend = FALSE)
p.pca
Figure showing the relative abundances, which will form the left half of the main text figure:
stm1 <- stm1 %>%
group_by(Sample, Type) %>%
mutate(Abundance = center_elts(Abundance)) %>%
ungroup
tbrf <- tibble(Abundance = range(stm1$Abundance))
p.step.cal <- stm1 %>%
filter(Type %in% type.lvls) %>%
mutate(Taxon = factor(Taxon, mock$Taxon),
Type = factor(Type, type.lvls)) %>%
ggplot(aes(x = Taxon, y = Abundance, color = Protocol, shape = Set)) +
geom_quasirandom() +
geom_rangeframe(data = tbrf, aes(y = Abundance),
color = "black", sides = "l", inherit.aes = FALSE) +
scale_color_manual(values = colors.protocol) +
scale_y_log10(labels = log_formatter) +
scale_x_discrete(labels = tax_labeller) +
coord_cartesian(clip = "off") +
labs(title = "Taxon relative abundances",
y = "Abundance / gm. mean") +
facet_grid(Type ~ .) +
scale_shape_manual(breaks = c("Est", "Eval"), values = c(1, 16)) +
scale_color_manual(values = colors.protocol) +
base_theme +
tax_theme +
theme(strip.text.y = element_blank())
# Add the actual abundances (as a stair plot like in Costea2017's Figure 6)
# underneath the data points
tb.step.cal <- tb.step %>%
expand(Type = factor(type.lvls, type.lvls), nesting(x, Actual))
p.step.cal$layers <- c(
geom_step(data = tb.step.cal, aes(x = x, y = Actual),
size = 0.3, color = "black", inherit.aes = FALSE),
p.step.cal$layers)
# p.step.cal
Get tree for the spike-in taxa:
## [1] "k__Archaea|p__Euryarchaeota|c__Methanopyri|o__Methanopyrales|f__Methanopyraceae|g__Methanopyrus|s__Methanopyrus_kandleri"
## [2] "k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanothermaceae|g__Methanothermus|s__Methanothermus_fervidus"
## [3] "k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanothermobacter|s__Methanothermobacter_thermautotrophicus"
## [4] "k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanothermobacter|s__Methanothermobacter_marburgensis"
## [5] "k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobacterium|s__Methanobacterium_formicicum"
## [6] "k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobacterium|s__Methanobacterium_sp_Maddingley_MBC34"
tree <- costea2017_metaphlan2_tree
tree$tip.label <- tree$tip.label %>%
str_extract("(?<=s__).+")
spikein_taxa <- c(mock$Taxon, contaminant)
tree <- ape::keep.tip(tree, spikein_taxa)
tree
##
## Phylogenetic tree with 11 tips and 10 internal nodes.
##
## Tip labels:
## Clostridium_saccharolyticum, Blautia_hansenii, Clostridium_perfringens, Clostridium_difficile, Fusobacterium_nucleatum, Lactobacillus_plantarum, ...
## Node labels:
## 0.997, 0.650, 0.133, 0.991, 0.688, 1.000, ...
##
## Rooted; includes branch lengths.
Plot tree along with differential bias to Protocol W. Bias is shown relative to the 10 mock taxa, with 2 geometric standard errors.
gt <- ggtree::ggtree(tree, layout = "rectangular") +
theme(plot.margin = margin(5, 0, 5, 5))
gtb <- dbias %>%
mutate(Taxon = str_replace(Taxon, "Contaminant", contaminant)) %>%
left_join(gt$data, by = c("Taxon" = "label")) %>%
filter(Protocol %in% c("HW", "QW")) %>%
mutate_at("Protocol", str_sub, 1, 1)
pr <- gtb %>%
ggplot(aes(Bias, fct_reorder(Taxon, y), color = Protocol)) +
# geom_point() +
geom_vline(xintercept = 1) +
ggstance::geom_pointrangeh(aes(xmin = Bias / gm_se^2,
xmax = Bias * gm_se^2),
fatten = 1.0, position = position_jitter(width = 0, height = 0.1)
) +
scale_color_manual(values = colors.protocol) +
base_theme +
scale_y_discrete(labels = tax_labeller) +
scale_x_log10(labels = log_formatter) +
labs(y = "", x = "Differential efficiency rel. to Protocol W") +
theme(legend.position = "right", plot.margin = margin(5, 5, 5, 0),
axis.text.y = element_text(size=9, family = "")
)
mock.gathered <- costea2017_mock_composition %>%
select(Taxon,
FACS = "bacterial cells in spike in Mix",
OD = "Bacterial OD in sample",
) %>%
group_by(Taxon) %>%
summarize_at(vars(FACS, OD), mean) %>%
gather("Protocol", "Observed", -Taxon) %>%
mutate_by(Protocol, Observed = center_elts(Observed))
stm %>%
group_by(Protocol, Taxon) %>%
summarize(Observed = gm_mean(Observed)) %>%
ungroup %>%
bind_rows(mock.gathered) %>%
filter(Protocol != "FACS") %>%
mutate(
Taxon = factor(Taxon, mock$Taxon),
Protocol = factor(Protocol, c("W", "Q", "H", "OD"))
) %>%
ggplot(aes(Taxon, Observed, fill = Protocol)) +
geom_col(position = "dodge") +
geom_step(data = tb.step, aes(x = x, y = Actual, linetype = "FACS"),
size = 0.3, color = "black", inherit.aes = FALSE) +
scale_fill_manual(values = c(colors.protocol, OD = "grey")) +
scale_y_log10() +
base_theme +
tax_theme
The abundances don’t match exactly. This is expected, given that Costea et al. used mOTU to measure abundances. However, the general pattern is highly consistent with their figure, including the OD measurements, except for a swapping in the taxon labels on the x-axis for the three species V. cholerae, C. saccharolyticum, and Y. pseudotuberculosis. This strongly suggests that the mock species are correctly labeled here and are labeled incorrectly in their Figure 6.
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Arch Linux
##
## Matrix products: default
## BLAS: /usr/lib/libblas.so.3.8.0
## LAPACK: /usr/lib/liblapack.so.3.8.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] BiasManuscript_0.0.0.9000 metacal_0.1.0
## [3] ggbeeswarm_0.6.0 cowplot_0.9.99
## [5] ggthemes_4.2.0 forcats_0.4.0
## [7] stringr_1.4.0 dplyr_0.8.3
## [9] purrr_0.3.2 readr_1.3.1
## [11] tidyr_0.8.3 tibble_2.1.3
## [13] ggplot2_3.2.0 tidyverse_1.2.1
## [15] here_0.1 rmarkdown_1.13
## [17] nvimcom_0.9-82 usethis_1.5.0
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-140 fs_1.3.1 ggtree_1.16.0
## [4] lubridate_1.7.4 devtools_2.0.2 webshot_0.5.1
## [7] RColorBrewer_1.1-2 httr_1.4.0 rprojroot_1.3-2
## [10] useful_1.2.6 tools_3.6.1 backports_1.1.4
## [13] utf8_1.1.4 R6_2.4.0 vipor_0.4.5
## [16] lazyeval_0.2.2 colorspace_1.4-1 withr_2.1.2
## [19] tidyselect_0.2.5 prettyunits_1.0.2 processx_3.3.1
## [22] compiler_3.6.1 cli_1.1.0 rvest_0.3.4
## [25] xml2_1.2.0 desc_1.2.0 labeling_0.3
## [28] scales_1.0.0 callr_3.2.0 digest_0.6.20
## [31] pkgconfig_2.0.2 htmltools_0.3.6 sessioninfo_1.1.1
## [34] rlang_0.4.0 readxl_1.3.1 rstudioapi_0.10
## [37] generics_0.0.2 jsonlite_1.6 magrittr_1.5
## [40] kableExtra_1.1.0 Rcpp_1.0.1 munsell_0.5.0
## [43] fansi_0.4.0 ape_5.3 stringi_1.4.3
## [46] yaml_2.2.0 ggstance_0.3.1 pkgbuild_1.0.3
## [49] plyr_1.8.4 grid_3.6.1 parallel_3.6.1
## [52] ggrepel_0.8.1 crayon_1.3.4 lattice_0.20-38
## [55] haven_2.1.0 hms_0.4.2 zeallot_0.1.0
## [58] knitr_1.23 ps_1.3.0 pillar_1.4.2
## [61] igraph_1.2.4.1 reshape2_1.4.3 codetools_0.2-16
## [64] pkgload_1.0.2 glue_1.3.1 evaluate_0.14
## [67] remotes_2.0.4 modelr_0.1.4 treeio_1.8.0
## [70] vctrs_0.2.0 testthat_2.1.1 cellranger_1.1.0
## [73] gtable_0.3.0 assertthat_0.2.1 xfun_0.7
## [76] broom_0.5.2 tidytree_0.2.4 viridisLite_0.3.0
## [79] rvcheck_0.1.3 beeswarm_0.2.3 memoise_1.1.0
## [82] ellipsis_0.2.0.1