# Tools for microbiome data
library(speedyseq)
# Tools for general purpose data manipulation and plotting
library(tidyverse)
library(fs)
# ggplot helpers
library(ggbeeswarm)
library(cowplot)
library(patchwork)
library(scales)
theme_set(theme_cowplot())
# stats helpers
# library(broom)
library(metacal); packageVersion("metacal")
[1] '0.2.0.9005'
colors_brooks <- c(
"Atopobium_vaginae" = "#009E73",
"Gardnerella_vaginalis" = "#56B4E9",
"Lactobacillus_crispatus" = "#D55E00",
"Lactobacillus_iners" = "#505050",
"Prevotella_bivia" = "#0072B2",
"Sneathia_amnii" = "#CC79A7",
"Streptococcus_agalactiae" = "#E69F00")
scale_y_custom <- scale_y_continuous(
trans = 'log10',
breaks = trans_breaks('log10', function(x) 10^x),
labels = trans_format('log10', math_format(10^.x))
)
Load data from the cellular mock communities of Brooks et al 2015 from metacal,
dr <- system.file("extdata", package = "metacal")
list.files(dr)
[1] "brooks2015-actual.csv" "brooks2015-observed.csv"
[3] "brooks2015-sample-data.csv"
Note that the single-species samples present.
actual %>%
as_tibble %>%
filter(.abundance == 1) %>%
count(.otu)
# A tibble: 7 × 2
.otu n
<chr> <int>
1 Atopobium_vaginae 2
2 Gardnerella_vaginalis 1
3 Lactobacillus_crispatus 1
4 Lactobacillus_iners 1
5 Prevotella_bivia 1
6 Sneathia_amnii 1
7 Streptococcus_agalactiae 2
Idea is to focus on a single species and show that the error in log proportions is inconsistent, and therefore there is error in the fold changes. L. crispatus is a species MM and BC commonly use to illustrate the inconsistent error, as since it has an intermediate efficiency, we see its error varying in sign/direction. However, the errors in fold changes don’t depend on this aspect, but only the variation in mean efficiency, and is therefore the same for all species.
Data frame for comparing measured and actual proportions:
brooks_prop <- list(
Actual = actual,
Measured = observed
) %>%
map(transform_sample_counts, close_elts) %>%
map_dfr(as_tibble, .id = 'type') %>%
pivot_wider(names_from = type, values_from = .abundance)
brooks_fc <- list(
Actual = actual,
Measured = observed
) %>%
map(transform_sample_counts, close_elts) %>%
map(pairwise_ratios, margin = 'samples', filter = FALSE) %>%
map_dfr(as_tibble, .id = 'type') %>%
pivot_wider(names_from = type, values_from = .abundance) %>%
separate(.sample, c('sample1', 'sample2'), sep = ':') %>%
# Filter to remove cases where Actual is 0 in one or both samples, or where
# the numerator and denominator sample are equal
filter(is.finite(Actual), Actual != 0, sample1 != sample2)
set.seed(42)
species <- 'Lactobacillus_crispatus'
samples <- actual %>%
as_tibble %>%
filter(.otu == species, .abundance > 0) %>%
pull(.sample)
p1 <- brooks_prop %>%
filter(.otu == species, Actual > 0) %>%
ggplot(aes(Actual, Measured, color = .otu)) +
scale_color_manual(values = colors_brooks) +
scale_x_log10() +
scale_y_log10() +
geom_abline(color = 'grey', size = 1) +
geom_point(position = position_jitter(width = 0.02, height = 0.02), alpha = 0.5) +
# geom_quasirandom() +
coord_fixed() +
expand_limits(x = 0.1, y = 0.1) +
labs(x = 'Actual proportion', y = 'Measured proportion', color = 'Species') +
theme(legend.position = 'none')
p2 <- brooks_fc %>%
filter(.otu == species, sample1 %in% samples, sample2 %in% samples) %>%
ggplot(aes(Actual, Measured, color = .otu)) +
scale_color_manual(values = colors_brooks) +
scale_x_log10() +
# scale_y_custom +
scale_y_log10() +
geom_abline(color = 'grey', size = 1) +
geom_point(position = position_jitter(width = 0.02, height = 0.02), alpha = 0.5) +
# geom_quasirandom() +
coord_fixed() +
labs(x = 'Actual fold change', y = 'Measured fold change', color = 'Species') +
theme(legend.position = 'none')
# p2
(p1 + labs(x = 'Actual', y = 'Measured', title = 'Proportion')) +
(p2 + labs(x = 'Actual', y = 'Measured', title = 'Fold change')) +
plot_annotation(tag_levels = 'A')
This dataset is for DNA mocks rather than cellular mocks; however the magnitude of bias is similar. This dataset has a couple advantages for illustration purposes:
## From the metacal 2.0 tutorial
# Download data from https://zenodo.org/record/3872145
data_path <- here::here("notebook/_data", "leopold2020host")
# To use a temporary directory:
# data_path <- file.path(tempdir(), "leopold2020")
if (!dir.exists(data_path)) {
dir.create(data_path, recursive = TRUE)
download.file(
"https://zenodo.org/record/3872145/files/dleopold/Populus_priorityEffects-v1.2.zip",
file.path(data_path, "Populus_priorityEffects-v1.2.zip")
)
unzip(
file.path(data_path, "Populus_priorityEffects-v1.2.zip"),
exdir = data_path
)
}
#> The microbiome data is stored in a phyloseq object,
ps <- file.path(data_path,
"dleopold-Populus_priorityEffects-8594f7c/output/compiled/phy.rds") %>%
readRDS
sample_data(ps) <- sample_data(ps) %>%
transform(
Timepoint = factor(Timepoint)
)
mock_actual <- file.path(data_path,
"dleopold-Populus_priorityEffects-8594f7c/data/MockCommunities.csv") %>%
read.csv(row.names = 1) %>%
select(-Sym4) %>%
as("matrix") %>%
otu_table(taxa_are_rows = FALSE) %>%
transform_sample_counts(function(x) close_elts(1 / x))
mock_taxa <- taxa_names(mock_actual)
sam <- sample_data(ps) %>% as("data.frame") %>% as_tibble(rownames = "Sample")
tax <- tax_table(ps) %>% as("matrix") %>% as_tibble(rownames = "Taxon")
ps.mock <- ps %>%
subset_samples(Samp_type == "Mock") %>%
prune_taxa(mock_taxa, .)
leopold_actual <- mock_actual
leopold_observed <- ps.mock %>% otu_table
rm(mock_actual, mock_taxa, sam, tax, ps.mock)
leopold_prop <- list(
Actual = leopold_actual,
Measured = leopold_observed
) %>%
map(transform_sample_counts, close_elts) %>%
map_dfr(as_tibble, .id = 'type') %>%
pivot_wider(names_from = type, values_from = .abundance)
leopold_fc <- list(
Actual = leopold_actual,
Measured = leopold_observed
) %>%
map(transform_sample_counts, close_elts) %>%
map(pairwise_ratios, margin = 'samples', filter = FALSE) %>%
map_dfr(as_tibble, .id = 'type') %>%
pivot_wider(names_from = type, values_from = .abundance) %>%
separate(.sample, c('sample1', 'sample2'), sep = ':') %>%
# Filter to remove cases where Actual is 0 in one or both samples, or where
# the numerator and denominator sample are equal
filter(is.finite(Actual), Actual != 0, sample1 != sample2)
Note, in this case all species are (nominally) in all samples, but that Epicoccum is observed in 0 reads in one sample.
species <- 'Cladosporium'
# species <- 'Alternaria'
# species <- 'Trichoderma'
p1 <- leopold_prop %>%
filter(.otu == species, Actual > 0) %>%
ggplot(aes(Actual, Measured, color = .otu)) +
# scale_color_manual(values = colors_brooks) +
scale_x_log10() +
scale_y_log10() +
geom_abline(color = 'grey', size = 1) +
geom_point(position = position_jitter(width = 0.02, height = 0.02), alpha = 0.5) +
# geom_quasirandom() +
coord_fixed() +
expand_limits(x = 0.1, y = 0.1) +
labs(x = 'Actual proportion', y = 'Measured proportion', color = 'Species') +
theme(legend.position = 'none')
p2 <- leopold_fc %>%
filter(.otu == species) %>%
ggplot(aes(Actual, Measured, color = .otu)) +
# scale_color_manual(values = colors_brooks) +
scale_x_log10() +
# scale_y_custom +
scale_y_log10() +
geom_abline(color = 'grey', size = 1) +
geom_point(position = position_jitter(width = 0.02, height = 0.02), alpha = 0.5) +
# geom_quasirandom() +
coord_fixed() +
labs(x = 'Actual fold change', y = 'Measured fold change', color = 'Species') +
theme(legend.position = 'none')
p1 + p2
Same but for the ratios between two species:
x <- list(
Actual = leopold_actual,
Measured = leopold_observed
) %>%
map(transform_sample_counts, close_elts) %>%
map(pairwise_ratios, margin = 'taxa', filter = FALSE)
leopold_ratio <- x %>%
map_dfr(as_tibble, .id = 'type') %>%
pivot_wider(names_from = type, values_from = .abundance) %>%
separate(.otu, c('otu1', 'otu2'), sep = ':') %>%
filter(otu1 != otu2)
leopold_ratio_fc <- x %>%
map(pairwise_ratios, margin = 'samples', filter = FALSE) %>%
map_dfr(as_tibble, .id = 'type') %>%
pivot_wider(names_from = type, values_from = .abundance) %>%
separate(.otu, c('otu1', 'otu2'), sep = ':') %>%
separate(.sample, c('sample1', 'sample2'), sep = ':') %>%
# Filter to remove cases where Actual is 0 in one or both samples, or where
# the numerator and denominator sample are equal
filter(otu1 != otu2, sample1 != sample2)
species1 <- 'Cladosporium'
species2 <- 'Fusarium'
# species2 <- 'Melampsora'
# species <- 'Trichoderma'
p3 <- leopold_ratio %>%
filter(otu1 == species1, otu2 == species2) %>%
ggplot(aes(Actual, Measured)) +
# scale_color_manual(values = colors_brooks) +
scale_x_log10() +
scale_y_log10() +
geom_abline(color = 'grey', size = 1) +
geom_point(position = position_jitter(width = 0.02, height = 0.02), alpha = 0.5) +
# geom_quasirandom() +
coord_fixed() +
expand_limits(x = 0.1, y = 0.1) +
labs(x = 'Actual ratio', y = 'Measured ratio', color = 'Species') +
theme(legend.position = 'none')
p4 <- leopold_ratio_fc %>%
filter(otu1 == species1, otu2 == species2) %>%
ggplot(aes(Actual, Measured)) +
# scale_color_manual(values = colors_brooks) +
scale_x_log10() +
# scale_y_custom +
scale_y_log10() +
geom_abline(color = 'grey', size = 1) +
geom_point(position = position_jitter(width = 0.02, height = 0.02), alpha = 0.5) +
# geom_quasirandom() +
coord_fixed() +
labs(x = 'Actual fold change', y = 'Measured fold change', color = 'Species') +
theme(legend.position = 'none')
(p1 + p2) / (p3 + p4)
Note, the ratio-view isn’t always obviously better; e.g. when Mel is involved; perhaps because of noise?
To make things more directly comparable, it might be best to use faceting and to fix the axes to have the same span.
sessioninfo::session_info()
─ Session info ──────────────────────────────────────────────────────────────────
setting value
version R version 4.1.1 (2021-08-10)
os Arch Linux
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz America/New_York
date 2021-10-27
─ Packages ──────────────────────────────────────────────────────────────────────
package * version date lib source
ade4 1.7-18 2021-09-16 [1] CRAN (R 4.1.1)
ape 5.5 2021-04-25 [1] CRAN (R 4.1.0)
assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.0)
backports 1.2.1 2020-12-09 [1] CRAN (R 4.0.3)
beeswarm 0.4.0 2021-06-01 [1] CRAN (R 4.1.0)
Biobase 2.52.0 2021-05-19 [1] Bioconductor
BiocGenerics 0.38.0 2021-05-19 [1] Bioconductor
biomformat 1.20.0 2021-05-19 [1] Bioconductor
Biostrings 2.60.1 2021-06-06 [1] Bioconductor
bitops 1.0-7 2021-04-24 [1] CRAN (R 4.1.0)
bookdown 0.24 2021-09-02 [1] CRAN (R 4.1.1)
broom 0.7.9 2021-07-27 [1] CRAN (R 4.1.0)
bslib 0.3.1 2021-10-06 [1] CRAN (R 4.1.1)
cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.0.0)
cli 3.0.1 2021-07-17 [1] CRAN (R 4.1.0)
cluster 2.1.2 2021-04-17 [2] CRAN (R 4.1.1)
codetools 0.2-18 2020-11-04 [2] CRAN (R 4.1.1)
colorspace 2.0-2 2021-08-11 [1] R-Forge (R 4.1.1)
cowplot * 1.1.1 2021-08-27 [1] Github (wilkelab/cowplot@555c9ae)
crayon 1.4.1 2021-02-08 [1] CRAN (R 4.0.4)
data.table 1.14.2 2021-09-27 [1] CRAN (R 4.1.1)
DBI 1.1.1 2021-01-15 [1] CRAN (R 4.0.4)
dbplyr 2.1.1 2021-04-06 [1] CRAN (R 4.0.5)
digest 0.6.28 2021-09-23 [1] CRAN (R 4.1.1)
distill 1.3 2021-10-13 [1] CRAN (R 4.1.1)
downlit 0.2.1 2020-11-04 [1] CRAN (R 4.0.3)
dplyr * 1.0.7 2021-06-18 [1] CRAN (R 4.1.0)
ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.1.0)
evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.0)
fansi 0.5.0 2021-05-25 [1] CRAN (R 4.1.0)
farver 2.1.0 2021-02-28 [1] CRAN (R 4.0.4)
fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.0.4)
forcats * 0.5.1 2021-01-27 [1] CRAN (R 4.0.4)
foreach 1.5.1 2020-10-15 [1] CRAN (R 4.0.3)
fs * 1.5.0 2020-07-31 [1] CRAN (R 4.0.2)
generics 0.1.1 2021-10-25 [1] CRAN (R 4.1.1)
GenomeInfoDb 1.28.1 2021-07-01 [1] Bioconductor
GenomeInfoDbData 1.2.6 2021-05-31 [1] Bioconductor
ggbeeswarm * 0.6.0 2017-08-07 [1] CRAN (R 4.0.0)
ggplot2 * 3.3.5 2021-06-25 [1] CRAN (R 4.1.0)
glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.2)
gtable 0.3.0 2019-03-25 [1] CRAN (R 4.0.0)
haven 2.4.3 2021-08-04 [1] CRAN (R 4.1.1)
here 1.0.1 2020-12-13 [1] CRAN (R 4.0.5)
highr 0.9 2021-04-16 [1] CRAN (R 4.1.0)
hms 1.1.1 2021-09-26 [1] CRAN (R 4.1.1)
htmltools 0.5.2 2021-08-25 [1] CRAN (R 4.1.1)
httr 1.4.2 2020-07-20 [1] CRAN (R 4.0.2)
igraph 1.2.7 2021-10-15 [1] CRAN (R 4.1.1)
IRanges 2.26.0 2021-05-19 [1] Bioconductor
iterators 1.0.13 2020-10-15 [1] CRAN (R 4.0.3)
jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.1.0)
jsonlite 1.7.2 2020-12-09 [1] CRAN (R 4.0.3)
knitr 1.36 2021-09-29 [1] CRAN (R 4.1.1)
lattice 0.20-44 2021-05-02 [2] CRAN (R 4.1.1)
lifecycle 1.0.1 2021-09-24 [1] CRAN (R 4.1.1)
lubridate 1.8.0 2021-10-07 [1] CRAN (R 4.1.1)
magrittr 2.0.1 2020-11-17 [1] CRAN (R 4.0.3)
MASS 7.3-54 2021-05-03 [2] CRAN (R 4.1.1)
Matrix 1.3-4 2021-06-01 [2] CRAN (R 4.1.1)
metacal * 0.2.0.9005 2021-10-04 [1] Github (mikemc/metacal@773cbf3)
mgcv 1.8-36 2021-06-01 [2] CRAN (R 4.1.1)
modelr 0.1.8 2020-05-19 [1] CRAN (R 4.0.0)
multtest 2.48.0 2021-05-19 [1] Bioconductor
munsell 0.5.0 2018-06-12 [1] CRAN (R 4.0.0)
nlme 3.1-152 2021-02-04 [2] CRAN (R 4.1.1)
nvimcom * 0.9-102 2021-10-25 [1] local
patchwork * 1.1.1 2020-12-17 [1] CRAN (R 4.0.3)
permute 0.9-5 2019-03-12 [1] CRAN (R 4.0.0)
phyloseq * 1.36.0 2021-05-19 [1] Bioconductor
pillar 1.6.4 2021-10-18 [1] CRAN (R 4.1.1)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.0)
plyr 1.8.6 2020-03-03 [1] CRAN (R 4.0.0)
purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.0.0)
R6 2.5.1 2021-08-19 [1] CRAN (R 4.1.1)
Rcpp 1.0.7 2021-07-07 [1] CRAN (R 4.1.0)
RCurl 1.98-1.5 2021-09-17 [1] CRAN (R 4.1.1)
readr * 2.0.2 2021-09-27 [1] CRAN (R 4.1.1)
readxl 1.3.1 2019-03-13 [1] CRAN (R 4.0.0)
reprex 2.0.1 2021-08-05 [1] CRAN (R 4.1.1)
reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.0.0)
rhdf5 2.36.0 2021-05-19 [1] Bioconductor
rhdf5filters 1.4.0 2021-05-19 [1] Bioconductor
Rhdf5lib 1.14.2 2021-07-06 [1] Bioconductor
rlang 0.4.12 2021-10-18 [1] CRAN (R 4.1.1)
rmarkdown * 2.11 2021-09-14 [1] CRAN (R 4.1.1)
rprojroot 2.0.2 2020-11-15 [1] CRAN (R 4.0.3)
rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.0.3)
rvest 1.0.2 2021-10-16 [1] CRAN (R 4.1.1)
S4Vectors 0.30.0 2021-05-19 [1] Bioconductor
sass 0.4.0 2021-05-12 [1] CRAN (R 4.1.0)
scales * 1.1.1 2020-05-11 [1] CRAN (R 4.0.0)
sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.0)
speedyseq * 0.5.3.9018 2021-06-29 [1] Github (mikemc/speedyseq@ceb941f)
stringi 1.7.5 2021-10-04 [1] CRAN (R 4.1.1)
stringr * 1.4.0 2019-02-10 [1] CRAN (R 4.0.0)
survival 3.2-11 2021-04-26 [2] CRAN (R 4.1.1)
tibble * 3.1.5 2021-09-30 [1] CRAN (R 4.1.1)
tidyr * 1.1.4 2021-09-27 [1] CRAN (R 4.1.1)
tidyselect 1.1.1 2021-04-30 [1] CRAN (R 4.1.0)
tidyverse * 1.3.1 2021-04-15 [1] CRAN (R 4.1.0)
tzdb 0.1.2 2021-07-20 [1] CRAN (R 4.1.0)
useful 1.2.6 2018-10-08 [1] CRAN (R 4.0.0)
utf8 1.2.2 2021-07-24 [1] CRAN (R 4.1.0)
vctrs 0.3.8 2021-04-29 [1] CRAN (R 4.1.0)
vegan 2.5-7 2020-11-28 [1] CRAN (R 4.0.3)
vipor 0.4.5 2017-03-22 [1] CRAN (R 4.0.0)
withr 2.4.2 2021-04-18 [1] CRAN (R 4.0.5)
xfun 0.27 2021-10-18 [1] CRAN (R 4.1.1)
xml2 1.3.2 2020-04-23 [1] CRAN (R 4.0.0)
XVector 0.32.0 2021-05-19 [1] Bioconductor
yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.0)
zlibbioc 1.38.0 2021-05-19 [1] Bioconductor
[1] /home/michael/.local/lib/R/library
[2] /usr/lib/R/library