# library(here)
library(tidyverse)

library(cowplot)
library(patchwork)
theme_set(theme_cowplot(12))

library(ggdist)
library(ggbeeswarm)

library(speedyseq)
library(metacal)

set.seed(42)

Simulate data

n_species <- 10
n_samples <- 50

species <- str_c("Sp. ", seq(n_species))

Actual abundances

Log abundances are independent and follow a simple linear model,

log abundance = a_0 + a_1 * X + epsilon

n_samples <- 50
x <- c(
  rep(0, n_samples / 2),
  rep(1, n_samples / 2)
)
X <- rbind(1, x)

Sigma <- diag(1, nrow = n_species)
rownames(Sigma) <- colnames(Sigma) <- species
a_0 <- MASS::mvrnorm(
  mu = rep(0, n_species), 
  Sigma = diag(1, nrow = n_species)
)
a_1 <- MASS::mvrnorm(
  mu = rep(0, n_species), 
  Sigma = diag(1, nrow = n_species)
)
a <- cbind(a_0, a_1)
names(a_0) <- names(a_1) <- rownames(a) <- species

epsilon <- MASS::mvrnorm(
  n = n_samples,
  mu = rep(0, n_species), 
  Sigma = diag(0.5, nrow = n_species)
) %>%
  t

y <- a %*% X + epsilon
colnames(y) <- str_glue("sa{1:n_samples}")

For the data frame of true coefficients, convert from log-e to log-2.

true_coeffs <- list('(Intercept)' = a_0, x1 = a_1) %>%
  map_dfr(enframe, ".otu", "truth", .id = "term") %>%
  mutate(across(truth, ~ . / log(2))) %>%
  print

#> # A tibble: 20 × 3
#>    term        .otu     truth
#>    <chr>       <chr>    <dbl>
#>  1 (Intercept) Sp. 1  -0.0905
#>  2 (Intercept) Sp. 2   2.91  
#>  3 (Intercept) Sp. 3  -0.137 
#>  4 (Intercept) Sp. 4   2.18  
#>  5 (Intercept) Sp. 5  -0.153 
#>  6 (Intercept) Sp. 6   0.583 
#>  7 (Intercept) Sp. 7   0.913 
#>  8 (Intercept) Sp. 8   0.524 
#>  9 (Intercept) Sp. 9  -0.815 
#> 10 (Intercept) Sp. 10  1.98  
#> 11 x1          Sp. 1   1.90  
#> 12 x1          Sp. 2  -3.52  
#> 13 x1          Sp. 3  -3.83  
#> 14 x1          Sp. 4  -0.410 
#> 15 x1          Sp. 5   0.917 
#> 16 x1          Sp. 6  -0.192 
#> 17 x1          Sp. 7  -0.402 
#> 18 x1          Sp. 8  -2.00  
#> 19 x1          Sp. 9   3.30  
#> 20 x1          Sp. 10  1.88

sam <- tibble(.sample = colnames(y), x) %>%
  mutate(across(x, factor))
actual_abun <- phyloseq(
  otu_table(y, taxa_are_rows = TRUE) %>% transform_sample_counts(exp),
  sample_data(sam)
)

actual_abun %>%
  as_tibble %>%
  ggplot(aes(y = .otu, x = .abundance, fill = x)) +
  scale_x_log10() +
  stat_dots()

Taxonomic bias and measured abundances

Draw efficiencies from a log normal distribution; set relative to the geometric mean efficiency.

log_efficiency <- MASS::mvrnorm(
  mu = rep(0, n_species),
  Sigma = diag(3, nrow = n_species)
) %>%
  # {. - min(.)}
  {. - mean(.)}
efficiency <- exp(log_efficiency)
# names(efficiency) <- species

log_efficiency %>% summary

#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#> -3.7952 -1.4314  0.0123  0.0000  0.9551  5.0904

max(efficiency) / min(efficiency)

#> [1] 7226.731

efficiency %>% qplot + scale_x_log10()

To create an association of log mean efficiency and the covariate, I will set set it so that the species with the largest slope coefficient also has the largest efficiency. This scenario is inspired by the leopold2020host and brooks2015thet experiments.

idx_slope <- which.max(a_1)
idx_eff <- which.max(efficiency)
names(efficiency) <- species
names(efficiency)[idx_eff] <- species[idx_slope]
names(efficiency)[idx_slope] <- species[idx_eff]

Data frame with all species parameters,

species_params <- true_coeffs %>%
  pivot_wider(names_from = term, values_from = truth) %>%
  rename(intercept = '(Intercept)') %>%
  left_join(efficiency %>% enframe('.otu', 'efficiency'), by = '.otu') %>%
  mutate(log2_efficiency = log2(efficiency)) %>%
  print

#> # A tibble: 10 × 5
#>    .otu   intercept     x1 efficiency log2_efficiency
#>    <chr>      <dbl>  <dbl>      <dbl>           <dbl>
#>  1 Sp. 1    -0.0905  1.90      0.0225          -5.48 
#>  2 Sp. 2     2.91   -3.52      1.15             0.200
#>  3 Sp. 3    -0.137  -3.83      0.213           -2.23 
#>  4 Sp. 4     2.18   -0.410     0.0748          -3.74 
#>  5 Sp. 5    -0.153   0.917     2.98             1.58 
#>  6 Sp. 6     0.583  -0.192     0.339           -1.56 
#>  7 Sp. 7     0.913  -0.402     0.892           -0.165
#>  8 Sp. 8     0.524  -2.00      1.73             0.787
#>  9 Sp. 9    -0.815   3.30    162.               7.34 
#> 10 Sp. 10    1.98    1.88      9.62             3.27

species_params %>%
  ggplot(aes(x1, log2_efficiency)) +
  geom_text(aes(label = str_extract(.otu, '[0-9]+')))

Question: Is it true that the first species is driving the association?

The measured (i.e. estimated) proportions and abundances are given by perturbing the actual abundances by the efficiencies, and normalizing to proportions or to the original (correct) total.

measured_prop <- actual_abun %>% perturb(efficiency)
measured_abun <- actual_abun %>% perturb(efficiency, norm = "keep")

Compute mean efficiencies

mean_eff <- actual_abun %>%
  transform_sample_counts(close_elts) %>%
  perturb(efficiency, norm = "none") %>%
  sample_sums %>%
  enframe(".sample", "mean_efficiency") %>%
  left_join(sam, by = ".sample") %>%
  mutate(log2_mean_efficiency = log2(mean_efficiency))

mean_eff %>%
  ggplot(aes(y = as.factor(x), x = mean_efficiency)) +
  scale_x_log10() +
  stat_dotsinterval()

Plots

Estimated and actual abundances

lvls <- a_1 %>% sort(decreasing = TRUE) %>% names

abun <- bind_rows(
  Actual = actual_abun %>% as_tibble,
  Measured = measured_abun %>% as_tibble,
  .id = 'type'
) %>%
  rename(abundance = .abundance) %>%
  mutate(
    log2_abundance = log2(abundance),
    across(.otu, factor, levels = lvls)
  )

p_species_all <- abun %>%
  ggplot(aes(x, log2_abundance, color = type)) +
  labs(y = "Log efficiency", x = "Condition (x)") +
  facet_grid(.otu~type, scales = "fixed") +
  geom_quasirandom(alpha = 0.3, groupOnX = TRUE) +
  # stat_summary(fun.data = "mean_cl_boot", geom = "pointrange") +
  stat_summary(geom = "point", fun.data = mean_se) +
  stat_smooth(aes(x = as.integer(x)), formula = y~x,
    method = "lm", size = 0.9, fill = 'grey', se = FALSE
  ) +
  theme(legend.position = "none")
p_species_all

p_mean_eff <- mean_eff %>%
  ggplot(aes(x, log2(mean_efficiency))) +
  labs(y = "Log efficiency", x = "Condition (x)") +
  geom_quasirandom(alpha = 0.3, groupOnX = TRUE) +
  # stat_summary(fun.data = "mean_cl_boot", geom = "pointrange") +
  stat_summary(geom = "point", fun.data = mean_se) +
  stat_smooth(aes(x = as.integer(x)), formula = y~x,
    method = "lm", size = 0.9, color = 'black', fill = 'grey', se = FALSE
  )
p_mean_eff

p_mean_eff + p_species_all +
  plot_layout(ncol = 1, heights = c(0.2, 1)) &
  colorblindr::scale_color_OkabeIto() &
  colorblindr::scale_fill_OkabeIto()

Regression coefficients

lm_fit_mean_eff <- mean_eff %>%
  {lm(log2_mean_efficiency ~ 1 + x, .)}
lm_results_mean_eff <- lm_fit_mean_eff %>%
  broom::tidy(conf.int = TRUE)

lm_fits <- abun %>%
  group_by(.otu, type) %>%
  nest %>%
  mutate(
    fit = map(data, ~lm(log2_abundance ~ 1 + x, .))
  )
lm_results <- lm_fits %>%
  mutate(
    fit = map(fit, broom::tidy, conf.int = TRUE)
  ) %>%
  select(-data) %>%
  unnest(fit) %>%
  ungroup %>%
  left_join(true_coeffs, by = c(".otu", "term"))

lm_results_slope <- lm_results %>%
  filter(term == "x1") %>%
  mutate(
    across(.otu, fct_reorder, estimate)
  ) %>%
  arrange(.otu)

# params for arrows showing error
delta <- lm_results_mean_eff %>% filter(term == "x1") %>% pull(estimate)
start <- lm_results_slope %>% filter(type == 'Actual', .otu == 'Sp. 9') %>%
  pull(estimate)

p_coef_ci <- lm_results_slope %>% 
  ggplot(aes(y = .otu, x = estimate, color = type)) +
  labs(x = "Mean LFD", y = NULL, color = "Type") +
  geom_vline(xintercept = 0, color = "grey") +
  geom_pointinterval(aes(xmin = conf.low, xmax = conf.high)) +
  theme(legend.position = 'top') +
  guides(color = guide_legend(reverse = TRUE)) +
  annotate(
    geom = 'segment', color = "darkred",
    arrow = grid::arrow(length = unit(0.1, "inches")), 
    x = start, xend = start - delta, 
    y = 10.5, yend = 10.5,
    size = .7
  ) +
  annotate(
    geom = 'text', color = "darkred",
    label = 'Effect of bias',
    x = start - delta/2,
    y = 10.8,
  ) +
  coord_cartesian(clip = 'off')

p_coef_ci_with_true_coef <- p_coef_ci +
  geom_point(data = ~filter(., type == "Actual"),
    aes(x = truth),
    color = 'black', shape = '+', size = 4)

p_coef_ci_mean_eff <- lm_results_mean_eff %>% 
  filter(term == "x1") %>%
  ggplot(aes(y = "Mean efficiency", x = estimate)) +
  expand_limits(
    x = c(
      min(lm_results_slope$conf.low), 
      max(lm_results_slope$conf.high)
  )) +
  labs(x = "Mean LFD", y = NULL) +
  geom_vline(xintercept = 0, color = "grey") +
  geom_point() +
  annotate(
    geom = 'segment', color = "darkred",
    arrow = grid::arrow(length = unit(0.1, "inches")), 
    x = 0, xend = delta, 
    y = 1.0, yend = 1.0,
    size = .7
  )

p_coef_dot <- lm_results_slope %>% 
  ggplot(aes(y = type, x = estimate, fill = type)) +
  labs(x = "Mean LFD", y = "Type") +
  geom_vline(xintercept = 0, color = "grey") +
  stat_dots()

p_coef_ci_mean_eff / p_coef_ci_with_true_coef / p_coef_dot + 
  plot_layout(heights = c(0.2, 1, 0.3)) &
  colorblindr::scale_color_OkabeIto() &
  colorblindr::scale_fill_OkabeIto()

The true coefficients fall within the ‘Actual’ CIs, confirming that our simulations worked. For the manuscript figure, we won’t include the true data-generating coefficients, since our aim is to illustrate the relationship between coefficients estimated on the ‘Actual’ and ‘Measured’ measurements.

Manuscript figure

Chose species that show the full range of qualitative behaviors in terms of the effect of taxonomic bias on the estimated slope coefficient.

Species 9: Bias causes a substantial decrease in the magnitude of the slope
Species 10: Bias causes a substantial decrease in the magnitude of the slope, such that the estimate is no longer distinguishable from zero.
Species 5: Bias causes a sign error
Species 7: Bias cases a small negative slope that would not have been distinguishable from zero to become significantly (in a biological and statistical sense) negative
Species 2: Bias cases a substantially negative slope to become more negative (increased magnitude)

species_to_plot <- str_c("Sp. ", c('9', '10', '5', '7', '2'))
p_species_focal <- p_species_all
p_species_focal$data <- p_species_focal$data %>% 
  filter(.otu %in% species_to_plot)

# Bold these species in coeffcient CI plot
p_coef_ci1 <- p_coef_ci +
  theme(
    axis.text.y = element_text(
      face = ifelse(levels(p_coef_ci$data$.otu) %in% species_to_plot, 
        "bold", "plain")
    )
  )

# Set range of mean eff panel to match
rng <- p_species_focal$data %>%
  pull(log2_abundance) %>%
  {max(.) - min(.)}
m <- p_mean_eff$data %>% pull(mean_efficiency) %>% log2 %>% mean
p_mean_eff1 <- p_mean_eff + 
  expand_limits(y = c(m - rng/2, m + rng/2))

(p_mean_eff1 + ggtitle("Mean efficiency")) +
  (p_coef_ci_mean_eff + ggtitle("LFD in mean efficiency")) + 
  (p_species_focal + ggtitle("Actual and measured\nabundances of select species")) + 
  (p_coef_ci1 + ggtitle("Mean LFD estimated from\nactual or measured abundances") +
    theme(legend.box.margin = margin(b = -15))
  ) + 
  plot_layout(ncol = 2, heights = c(0.2, 1)) +
  plot_annotation(tag_levels = 'A') &
  colorblindr::scale_color_OkabeIto() &
  colorblindr::scale_fill_OkabeIto() &
  theme(
    plot.title = element_text(face = "plain")
  )

Standard errors

lm_results %>%
  mutate(across(.otu, factor, levels = lm_results_slope$.otu %>% levels)) %>%
  ggplot(aes(y = .otu, x = std.error, color = type)) +
  facet_wrap(~term) +
  geom_point() +
  colorblindr::scale_color_OkabeIto()

Can see that the standard error is generally increased in the measured abundance estimates except for Species 9, whose residual variation we expect to be correlated with the log mean efficiency and hence negatively correlated with the measurement error.

Let’s confirm that the relationship between the SEs in the measured response, the actual response, and the mean efficiency agree with our theoretical expectation. To do so, we must compute the sample covariance of the residual of the actual response and the mean efficiency for each species.

aug_mean_eff <- lm_fit_mean_eff %>% 
  broom::augment() %>% 
  add_column(.sample = mean_eff$.sample)
aug <- lm_fits %>%
  mutate(
    .sample = map(data, pull, '.sample'),
    augment = map(fit, broom::augment),
  ) %>%
  select(-data, -fit) %>%
  unnest(c(.sample, augment)) %>%
  ungroup %>%
  left_join(aug_mean_eff, by = c(".sample", "x"), suffix = c('.abun', '.mean_eff'))

cov_actual_with_mean_eff <- aug %>%
  filter(type == 'Actual') %>%
  with_groups(.otu, summarize,
    cov = cov(.resid.abun, .resid.mean_eff)
  ) %>%
  print

#> # A tibble: 10 × 2
#>    .otu       cov
#>    <fct>    <dbl>
#>  1 Sp. 9   0.565 
#>  2 Sp. 1  -0.111 
#>  3 Sp. 10 -0.163 
#>  4 Sp. 5  -0.0326
#>  5 Sp. 6  -0.0115
#>  6 Sp. 7  -0.0672
#>  7 Sp. 4  -0.172 
#>  8 Sp. 8  -0.105 
#>  9 Sp. 2   0.0449
#> 10 Sp. 3   0.0338

std_err_mean_eff <- lm_fit_mean_eff %>% broom::tidy() %>%
  filter(term == 'x1') %>% pull(std.error)
std_errs <- lm_results_slope %>%
  select(type, .otu, std.error) %>%
  pivot_wider(names_from = type, values_from = std.error, 
    names_prefix = "se_") %>%
  left_join(cov_actual_with_mean_eff, by = '.otu') %>%
  print

#> # A tibble: 10 × 4
#>    .otu   se_Actual se_Measured     cov
#>    <fct>      <dbl>       <dbl>   <dbl>
#>  1 Sp. 3      0.292       0.341  0.0338
#>  2 Sp. 2      0.311       0.354  0.0449
#>  3 Sp. 8      0.244       0.336 -0.105 
#>  4 Sp. 4      0.252       0.357 -0.172 
#>  5 Sp. 6      0.278       0.340 -0.0115
#>  6 Sp. 7      0.290       0.363 -0.0672
#>  7 Sp. 5      0.269       0.337 -0.0326
#>  8 Sp. 1      0.283       0.367 -0.111 
#>  9 Sp. 10     0.289       0.383 -0.163 
#> 10 Sp. 9      0.270       0.130  0.565

std_errs %>%
  mutate(.otu,
    se2_Measured = se_Measured^2,
    pred = se_Actual^2 + std_err_mean_eff^2 -
      cov * 2 / ((n_samples - 2) * var(x))
      # cov * 2 * (n_samples - 1) / (n_samples * (n_samples - 2) * var(x))
  )

#> # A tibble: 10 × 6
#>    .otu   se_Actual se_Measured     cov se2_Measured   pred
#>    <fct>      <dbl>       <dbl>   <dbl>        <dbl>  <dbl>
#>  1 Sp. 3      0.292       0.341  0.0338       0.116  0.116 
#>  2 Sp. 2      0.311       0.354  0.0449       0.125  0.125 
#>  3 Sp. 8      0.244       0.336 -0.105        0.113  0.113 
#>  4 Sp. 4      0.252       0.357 -0.172        0.128  0.128 
#>  5 Sp. 6      0.278       0.340 -0.0115       0.115  0.115 
#>  6 Sp. 7      0.290       0.363 -0.0672       0.132  0.132 
#>  7 Sp. 5      0.269       0.337 -0.0326       0.114  0.114 
#>  8 Sp. 1      0.283       0.367 -0.111        0.135  0.135 
#>  9 Sp. 10     0.289       0.383 -0.163        0.147  0.147 
#> 10 Sp. 9      0.270       0.130  0.565        0.0170 0.0170

The prediction for the squared standard error from the theoretical calculation agrees with what we observe.

TODO: Sort out why it is this version, and not the other, that gives agreement. Perhaps the standard errors being returned by R are using the MLE estimate of sigma instead of the OLS estimate?

2022-10-08 Modified main figure

New panel showing simulated efficiencies and mean LFDs, to replace the panel showing the change in mean efficiency (former panel B).

p_params <- species_params %>%
  ggplot(aes(x1, log2_efficiency)) +
  theme(axis.line = element_blank()) +
  # panel_border(remove = TRUE) +
  # theme_minimal_grid() +
  coord_cartesian(clip = 'off') +
  geom_vline(xintercept = 0, color = 'grey') +
  geom_hline(yintercept = 0, color = 'grey') +
  geom_text(
    aes(label = str_c('Sp. ', str_extract(.otu, '[0-9]+'))), 
    size = 4) +
  labs(
    x = 'Expected LFD',
    y = 'Log efficiency'
  )
p_params

Annotate mean efficiency plot,

# params for arrow showing change
arrow_params <- mean_eff %>%
  with_groups(x, summarize, across(log2_mean_efficiency, mean)) %>%
  pull(log2_mean_efficiency)

p_mean_eff2 <- p_mean_eff +
  annotate(
    geom = 'segment', color = "darkred",
    arrow = grid::arrow(length = unit(0.1, "inches")), 
    x = 2.5, xend = 2.5, 
    y = arrow_params[1], 
    yend = arrow_params[2], 
    size = .7
  )
p_mean_eff2

Assemble with improved titles,


(p_params + ggtitle("Simulated log efficiencies\nand expected LFDs")) +
  (p_mean_eff2 + ggtitle("Mean efficiency across\nsamples in each condition")) +
  # (p_coef_ci_mean_eff + ggtitle("LFD in mean efficiency")) + 
  (p_species_focal + ggtitle("Actual and measured\nabundances of select species")) + 
  (p_coef_ci1 + ggtitle("Mean LFD estimated from\nactual or measured abundances") +
    theme(legend.box.margin = margin(b = -15))
  ) + 
  plot_layout(ncol = 2, heights = c(0.4, 1)) +
  plot_annotation(tag_levels = 'A') &
  colorblindr::scale_color_OkabeIto() &
  colorblindr::scale_fill_OkabeIto() &
  theme(
    plot.title = element_text(face = "plain")
  )

Simulate regression example