MOMS Patient Exploration and Overall Summary

Jacob T. Nearing

07/10/2021

Set up

Load in required libaries for the analysis

Load in mock data

This code section is copied from MM’s original analysis

#Load in otu data
otu <- here("output", "stirrups-profiles", "abundance-matrix.csv.bz2") %>%
  read_csv(
    col_types = cols(.default = col_double(), sample_name = col_character())
  ) %>%
  otu_table(taxa_are_rows = FALSE)

## Assuming first column, `sample_name`, contains the sample names

#load in sample data
sam <- here("output", "stirrups-profiles", "sample-data.csv.bz2") %>%
  read_csv(col_types = "ccccccic") %>%
  mutate(across(host_visit_number, factor, ordered = TRUE)) %>%
  sample_data

## Assuming first column, `sample_name`, contains the sample names

#load in taxa profiles
tax <- here("output", "stirrups-profiles", "taxonomy.csv.bz2") %>%
  read_csv(col_types = cols(.default = col_character())) %>%
  tax_table %>%
  mutate_tax_table(
    species = case_when(!is.na(genus) ~ .otu)
  )

## Assuming first column, `otu`, contains the taxa names

ps <- phyloseq(otu, sam, tax)

taxa_names(ps) <- taxa_names(ps) %>% 
  str_replace("(?<=Lactobacillus_crispatus)_cluster", "")
taxa_names(ps) %>% str_subset("crispatus")

## [1] "Lactobacillus_crispatus"       "Lactobacillus_crispatus_type1"

Calibrate data

#grab data files from metacal package 
dr <- system.file("extdata", package = "metacal")


#table contains the expected abundances of the mock communties from
# brooks et al., 2015
actual <- file.path(dr, "brooks2015-actual.csv") %>%
  read.csv(row.names = "Sample") %>%
  as("matrix")

#table contains the observed abundances from 16S sequencing above mock communties
# data from brooks et al., 2015
observed <- file.path(dr, "brooks2015-observed.csv") %>%
  read.csv(row.names = "Sample") %>%
  subset(select = - Other) %>%
  as("matrix")

# Estimate bias with bootstrapping for error estimation
mc_fit <- estimate_bias(observed, actual, margin = 1, boot = TRUE)

## Zeroing 175 values in `observed`

summary(mc_fit)

## Summary of a metacal bias fit.
## 
## Estimated relative efficiencies:
## # A tibble: 7 × 4
##   taxon                    estimate gm_mean gm_se
##   <chr>                       <dbl>   <dbl> <dbl>
## 1 Atopobium_vaginae           0.285   0.285  1.04
## 2 Gardnerella_vaginalis       0.160   0.160  1.04
## 3 Lactobacillus_crispatus     2.29    2.28   1.03
## 4 Lactobacillus_iners         4.68    4.68   1.02
## 5 Prevotella_bivia            1.79    1.79   1.04
## 6 Sneathia_amnii              4.59    4.59   1.04
## 7 Streptococcus_agalactiae    0.250   0.250  1.03
## 
## Geometric standard error estimated from 1000 bootstrap replicates.

#to keep our workspace clean we will remove the tables we don;t need.
rm(actual, observed, dr)

control_species <- mc_fit %>% coef %>% names
stopifnot(all(control_species %in% taxa_names(ps)))

We’ll just use the point estimate, ignoring the uncertainty, since the standard errors are so small.

We don’t have a way to directly estimate the efficiencies of the other OTUs, so we’ll impute them as follows.

Set the efficiencies of the 7 control species to the directly estimated values
Compute efficiencies for the 6 control genera to the efficiency of the control species (if only one) or the geometric mean efficiency (if more than one; here this is just Lactobacillus)
Use these genera-level efficiencies for the efficiencies of OTUs that are not control species but are in the same genus as one
For other OTUs, use the geometric mean of the genera efficiencies; this is similar to using the mean of the 7 control species but gives Lactobacillus equal weight to the other genera.

#bias of all 
bias_species <- coef(mc_fit) %>% 
  enframe("species", "efficiency") %>%
  print

## # A tibble: 7 × 2
##   species                  efficiency
##   <chr>                         <dbl>
## 1 Atopobium_vaginae             0.285
## 2 Gardnerella_vaginalis         0.160
## 3 Lactobacillus_crispatus       2.29 
## 4 Lactobacillus_iners           4.68 
## 5 Prevotella_bivia              1.79 
## 6 Sneathia_amnii                4.59 
## 7 Streptococcus_agalactiae      0.250

bias_genus <- bias_species %>%
  mutate(genus = str_extract(species, "^[^_]+"), .before = 1) %>%
  with_groups(genus, summarize, across(efficiency, gm_mean)) %>%
  print

## # A tibble: 6 × 2
##   genus         efficiency
##   <chr>              <dbl>
## 1 Atopobium          0.285
## 2 Gardnerella        0.160
## 3 Lactobacillus      3.27 
## 4 Prevotella         1.79 
## 5 Sneathia           4.59 
## 6 Streptococcus      0.250

# Match on genus or species, depending on which is available; then set others
# to average genus efficiency
bias_all <- tax_table(ps) %>% as_tibble %>%
  left_join(bias_species, by = "species") %>%
  left_join(bias_genus, by = "genus") %>%
  mutate(
    efficiency = case_when(
      !is.na(efficiency.x) ~ efficiency.x,
      !is.na(efficiency.y) ~ efficiency.y,
      TRUE ~ gm_mean(bias_genus$efficiency)
    )
  ) %>%
  select(-efficiency.x, -efficiency.y) %>%
  # standardize to L. iners, the most efficiently measured
  mutate(
    across(efficiency, ~ . / max(.))
  ) %>%
  glimpse

## Rows: 360
## Columns: 10
## $ .otu       <chr> "Clostridiaceae_1_OTU17", "Lachnospiraceae_BVAB1", "Lachnos…
## $ domain     <chr> NA, NA, NA, NA, NA, NA, "Bacteria", "Bacteria", "Bacteria",…
## $ phylum     <chr> NA, NA, NA, NA, NA, NA, "Firmicutes", "Firmicutes", "Proteo…
## $ class      <chr> NA, NA, NA, NA, NA, NA, "Bacilli", "Clostridia", "Betaprote…
## $ order      <chr> NA, NA, NA, NA, NA, NA, "Lactobacillales", "Clostridiales",…
## $ family     <chr> NA, NA, NA, NA, NA, NA, "Aerococcaceae", "Veillonellaceae",…
## $ genus      <chr> NA, NA, NA, NA, NA, NA, "Abiotrophia", "Acidaminococcus", "…
## $ control    <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALS…
## $ species    <chr> NA, NA, NA, NA, NA, NA, "Abiotrophia_defectiva", "Acidamino…
## $ efficiency <dbl> 0.175212, 0.175212, 0.175212, 0.175212, 0.175212, 0.175212,…

bias_all_vec <- bias_all %>% select(.otu, efficiency) %>% deframe

ps.obs <- ps %>% 
  # prune_taxa(control_species, .) %>%
  transform_sample_counts(close_elts)

# note, calibrate() automatically subsets to just the focal taxa if we haven't
# already
ps.cal <- ps.obs %>% calibrate(bias_all_vec)


mean_eff <- perturb(ps.cal, bias_all_vec, norm = "none")%>% 
  sample_sums %>%
  enframe('.sample', 'mean_efficiency') %>%
  left_join(sam %>% as_tibble, by = '.sample')

Calculate fold-change between subsequent visits

#takes in as input 2 phyloseq objects (obs and calibrated phyloseq objects)
#the subject of interest (i.e. patient ID)
#whether you want to prune to only the taxa in the mock community (i.e. control species)
#a vector that contains the names of all the cntrl species (required when pruning)
get_taxa_fold_change <- function(cal, obs, subject, prune=F, cntrl_spec=control_species){
  
  
  #if we want to only keep taxa in the mocks prune set to T
  if(prune){
  
    #borrow Mike's code from above
    temp <- list(Calibrated = cal, Observed = obs) %>%
    map(filter_sample_data, host_subject_id == subject) %>%
    #remove taxa that are not control taxa
    map(~prune_taxa(cntrl_spec, .)) %>%
    #change to relative abundances are they not already in proportions though?
    #ohhh we need to recalculate the relative abundances as we removed some taxa!
      
    map(transform_sample_counts, fun = close_elts) %>%
    map(as_tibble)
    
  }else{
    temp <- list(Calibrated = cal, Observed = obs) %>%
    map(filter_sample_data, host_subject_id == subject) %>%
    map(as_tibble)
  }
  
  #set the type of overvation
  temp[["Calibrated"]]$Type <- "Calibrated"
  temp[["Observed"]]$Type <- "Observed"

  #list that will contain the returned info.
  ret_list_cal <- list()
  
  
  taxa_names <- unique(temp$Calibrated$.otu)
  #go through list and create a new dataframe of each taxa arrange by the their visit number
  for(i in taxa_names){
    ret_list_cal[[i]] <- filter(temp$Calibrated, .otu==i) %>% arrange(host_visit_number)
    
  }
  #okay for each of these taxa we now need to calculate the fold change in abundance between row i and row i + 1
  # i.e. each subsequent visit
  for(i in taxa_names){
    #loop through each taxa abundance for that sample
    for(j in 1:nrow(ret_list_cal[[i]])){
      #initalize the column as -1 (fold change value should never be negative)
      #if its the first sample we will just mark at is "START"
      if(j==1){
        ret_list_cal[[i]][j,"fold_change_from_last"] <- NA
        ret_list_cal[[i]][j,"change"] <- "START"
        next()
      }
      #get the previous sample index
      last_index <- j-1
      #divide current index abundance by last abundance to get FC change
      fc_change <- ret_list_cal[[i]][j,".abundance"]/ret_list_cal[[i]][last_index,".abundance"]
      ret_list_cal[[i]][j,"fold_change_from_last"] <- fc_change
      
      #sometimes NaN is retunred as abundance when all abundances are 0 during reclosing to RA
      #this is only an issue when pruning to control taxa.
      if(is.na(ret_list_cal[[i]][j, ".abundance"]) | is.na(ret_list_cal[[i]][last_index, ".abundance"])){
        ret_list_cal[[i]][j, "change"] <- "FAIL"
        next()
      }
      #set what the change was (went up down or stayed same between visits)
      if(ret_list_cal[[i]][j,".abundance"] > ret_list_cal[[i]][last_index, ".abundance"]){
        ret_list_cal[[i]][j,"change"] <- "UP"
      }
      if(ret_list_cal[[i]][j,".abundance"] < ret_list_cal[[i]][last_index, ".abundance"]){
        ret_list_cal[[i]][j,"change"] <- "DOWN"
      }
      if(ret_list_cal[[i]][j,".abundance"] == ret_list_cal[[i]][last_index, ".abundance"]){
        ret_list_cal[[i]][j,"change"] <- "SAME"
      }
      
    }
    
  }
  
  #same analysis as above but for observed values.
  ret_list_obs <- list()
  
  taxa_names <- unique(temp$Observed$.otu)
  for(i in taxa_names){
    ret_list_obs[[i]] <- filter(temp$Observed, .otu==i) %>% arrange(host_visit_number)
    
  }
  #okay for each of these taxa we now need to calculate the fold change in abundance between row i and row i + 1
  for(i in taxa_names){
    for(j in 1:nrow(ret_list_obs[[i]])){
      #initalize the column as -1 (fold change value should never be negative)
      if(j==1){
        ret_list_obs[[i]][j,"fold_change_from_last"] <- NA
        ret_list_obs[[i]][j,"change"] <- "START"
        next()
      }
      last_index <- j-1
      #divide current index abundance by last abundance to get FC change
      fc_change <- ret_list_obs[[i]][j,".abundance"]/ret_list_obs[[i]][last_index,".abundance"]
      ret_list_obs[[i]][j,"fold_change_from_last"] <- fc_change
      
      #sometimes NaN is retunred as abundance when all abundances are 0 during reclosing to RA
      if(is.na(ret_list_obs[[i]][j, ".abundance"]) | is.na(ret_list_obs[[i]][last_index, ".abundance"])){
        ret_list_obs[[i]][j, "change"] <- "FAIL"
        next()
      }
      
      if(ret_list_obs[[i]][j,".abundance"] > ret_list_obs[[i]][last_index, ".abundance"]){
        ret_list_obs[[i]][j,"change"] <- "UP"
      }
      if(ret_list_obs[[i]][j,".abundance"] < ret_list_obs[[i]][last_index, ".abundance"]){
        ret_list_obs[[i]][j,"change"] <- "DOWN"
      }
      if(ret_list_obs[[i]][j,".abundance"] == ret_list_obs[[i]][last_index, ".abundance"]){
        ret_list_obs[[i]][j,"change"] <- "SAME"
      }
    }
    
  }
  
  return(list(ret_list_cal,ret_list_obs))
  
}

### Function that goes through the list that is returned from get_taxa_fold_change and finds taxa/samples that have a sign change between obs and calibrated
## for now we will return a DF with two rows for each occurance of a fold change difference between calibrated and observed.
find_sign_change <- function(taxa_list){
  
  taxa_names <- names(taxa_list[[1]])
  
  found_hit=F
  ret_df <- NULL
  #go through each taxa
  for(taxa in taxa_names){
    #1 is calibrated list
    for(j in 1:nrow(taxa_list[[1]][[taxa]])){
      if(taxa_list[[1]][[taxa]][j,"change"] != taxa_list[[2]][[taxa]][j,"change"]){
        message("found diff")
        if(found_hit==F){
          ret_df <- rbind(taxa_list[[1]][[taxa]][j,], taxa_list[[2]][[taxa]][j,])
          
          val1 <- ret_df[1,"fold_change_from_last"]/ret_df[2,"fold_change_from_last"]
          val2 <- ret_df[2,"fold_change_from_last"]/ret_df[1,"fold_change_from_last"]
          vals <- c(val1,val2)
          
          fc_diff <- vals[which.max(vals)]
          message(fc_diff)
          ret_df$FC_Diff <- fc_diff

          

          found_hit=T
        }else{
          temp_df <- rbind(taxa_list[[1]][[taxa]][j,], taxa_list[[2]][[taxa]][j,])
          
          val1 <- temp_df[1,"fold_change_from_last"]/temp_df[2,"fold_change_from_last"]
          val2 <- temp_df[2,"fold_change_from_last"]/temp_df[1,"fold_change_from_last"]
          vals <- c(val1,val2)
          
          fc_diff <- vals[which.max(vals)]
          temp_df$FC_Diff <- fc_diff
          
          ret_df <- rbind(ret_df, temp_df)
        }

      }

    }
  }
  return(ret_df)
  
}

Get_total_fc <- function(taxa_list){
  
  taxa_names <- names(taxa_list[[1]])
  found_hit=F
  ret_df <- NULL
  #go through each taxa
  for(taxa in taxa_names){
    #1 is calibrated list
    for(j in 1:nrow(taxa_list[[1]][[taxa]])){
        if(found_hit==F){
          ret_df <- rbind(taxa_list[[1]][[taxa]][j,], taxa_list[[2]][[taxa]][j,])
          found_hit=T
        }else{
          temp_df <- rbind(taxa_list[[1]][[taxa]][j,], taxa_list[[2]][[taxa]][j,])
          ret_df <- rbind(ret_df, temp_df)
        }

      }

    }
  
  return(ret_df)
}

Generate FC change values

For the purposes of this analysis we will only focus on the values that also include the imputed bias for non-control taxa

This takes awhile to run so we will save the results as an RDS file.

#get a list of subjects in the dataset
Subject_names <- unique(sample_data(ps.cal)$host_subject_id)

#remove subjects with only 1 samples
one_sample_subjects <- names(which(table(sample_data(ps.cal)$host_subject_id) ==1 ))
Subject_names <- Subject_names[! Subject_names %in% one_sample_subjects]

#list to contain FC_changes
nonpruned_fc_changes <- list()
#list to contain samples/taxa that there is FC dif between observed and calibrated
nonpruned_sign_dif <- list()

#Go through each subject and calulate fold changes for every taxa
for(i in Subject_names){
  message(i)
  nonpruned_fc_changes[[i]] <- get_taxa_fold_change(cal=ps.cal, obs = ps.obs, subject=i,
                                                 prune=F, cntrl_spec = control_species)
  nonpruned_sign_dif[[i]] <- find_sign_change(nonpruned_fc_changes[[i]])
}

#bind all sign_difs for each patient into a single dataframe.
all_together_nonprune_dif <- do.call(rbind, nonpruned_sign_dif)
#save it
saveRDS(all_together_nonprune_dif, "~/GitHub_Repos/MOMSPI/momspi/data/FC_data/FC_changes_dif_nonprune.RDS")

#save the fc_changes
saveRDS(nonpruned_fc_changes, "~/GitHub_Repos/MOMSPI/momspi/data/FC_data/FC_changes_all_nonprune.RDS")


### To make this simplier lets only look at control species

all_together_control <- all_together[which(all_together$control==T),]
which.max(all_together_control$FC_Diff)

Explore FC differences of control taxa

While we imputed the bias for all taxon lets just focus on the taxa that were included in the mock communnities.

all_together_dif <- readRDS("~/GitHub_Repos/MOMSPI/momspi/data/FC_data/FC_changes_dif_nonprune.RDS")
all_together_dif$FC_Diff <- unlist(all_together_dif$FC_Diff)


#only keep taxa that were included in the mock
all_together_dif_control <- all_together_dif[which(all_together_dif$control==T),]

#lets see what the maximum FC difference is
max_diff_index <- which.max(all_together_dif_control$FC_Diff)

#list of metadata to look at
metadata_of_int <- c(".sample", "host_subject_id", ".otu", "host_visit_number", "FC_Diff")


all_together_dif_control[max_diff_index,metadata_of_int]

## # A tibble: 1 × 5
##   .sample           host_subject_id .otu              host_visit_number FC_Diff
##   <chr>             <chr>           <chr>             <ord>               <dbl>
## 1 EP229454_K20_MV1D EP229454        Atopobium_vaginae 2                    17.1

#Patient EP229454 has the largest fold change difference between the different individuals

#fairly large FC change for this individual as well mostly in Prevotella bivia
all_together_dif_control[which(all_together_dif_control$host_subject_id=="EP972345"), metadata_of_int]

## # A tibble: 6 × 5
##   .sample           host_subject_id .otu                host_visit_numb… FC_Diff
##   <chr>             <chr>           <chr>               <ord>              <dbl>
## 1 EP972345_K20_MV1D EP972345        Gardnerella_vagina… 2                   1.19
## 2 EP972345_K20_MV1D EP972345        Gardnerella_vagina… 2                   1.19
## 3 EP972345_K20_MV1D EP972345        Lactobacillus_cris… 2                   1.19
## 4 EP972345_K20_MV1D EP972345        Lactobacillus_cris… 2                   1.19
## 5 EP972345_K40_MV1D EP972345        Prevotella_bivia    4                  13.1 
## 6 EP972345_K40_MV1D EP972345        Prevotella_bivia    4                  13.1

#lets do the same as above but filter on FC_diff being larger than 5.
all_together_dif_control_large <- all_together_dif_control[which(all_together_dif_control$FC_Diff>=5),]
tail(sort(table(all_together_dif_control_large$host_subject_id)))

## 
## EP694022 EP740843 EP891541 EP974070 EP218253 EP587032 
##        4        4        4        4        6        6

Patient EP229454 has the largest FC difference between calibrated and observed individuals and it is for the taxon Atopobium_vaginae.

Patient EP972345 is also a good example of a patient with a large FC difference between the calibrated and observed individuals.

Patient EP587032 has multiple instances of fairly large FC differences between the calibrated and observed individuals

We will inspect each of these patients individually

x1 <- list(Calibrated = ps.cal, Observed = ps.obs) %>%
  map(~prune_taxa(control_species, .)) %>%
  map_dfr(as_tibble, .id = "type") %>%
  mutate(otu_abbrev = .otu %>% str_replace("_", " ") %>% abbreviate)

Patient EP229454

EP229454_lines <-  x1 %>%
  filter(host_subject_id == "EP229454") %>%
  ggplot(aes(host_visit_number, .abundance, color = type)) +
  scale_y_log10() +
  facet_grid(otu_abbrev~., scales = 'fixed') +
  geom_line(aes(group = type)) +
  ylab("Log10 relative abundance") +
  xlab("Visit Number") +
  labs(color="Observation Type")


EP229454_me <- mean_eff %>%
  filter(host_subject_id == "EP154266") %>%
  ggplot(aes(host_visit_number, mean_efficiency)) +
  #scale_y_log10() +
  scale_y_log10(limits = c(1e-2, 1e1)) +
  geom_line(aes(group = 1)) +
  ylab("Log10 Mean Efficiency") +
  xlab("Visit Number")

EP229454_lines / EP229454_me + plot_layout(heights = c(1, 0.1))

## Warning: Transformation introduced infinite values in continuous y-axis

We can see an example of a sign change between the two Observation types from visits 1 - 2 in Atopobium vaginae

Patient EP972345

EP972345_lines <- x1 %>%
  filter(host_subject_id == "EP972345") %>%
  ggplot(aes(host_visit_number, .abundance, color = type)) +
  scale_y_log10() +
  facet_grid(otu_abbrev~., scales = 'fixed') +
  geom_line(aes(group = type)) +
  ylab("Log10 relative abundance") +
  xlab("Visit Number") +
  labs(color="Observation Types")

EP972345_me <- mean_eff %>%
  filter(host_subject_id == "EP972345") %>%
  ggplot(aes(host_visit_number, mean_efficiency)) +
  #scale_y_log10() +
  scale_y_log10(limits = c(1e-2, 1e1)) +
  geom_line(aes(group = 1)) +
  ylab("Log10 Mean Efficiency") +
  xlab("Visit Number")


EP972345_lines / EP972345_me + plot_layout(heights = c(1, 0.1))

## Warning: Transformation introduced infinite values in continuous y-axis

Here we see that there is a sign change between the observation types from visits 2-4. This is accompanied by a decrease in mean effiency due to an increase in the abundance of Gardnerella vaginalis

Patient EP587032

EP587032_lines <- x1 %>%
  filter(host_subject_id == "EP587032") %>%
  ggplot(aes(host_visit_number, .abundance, color = type)) +
  scale_y_log10() +
  facet_grid(otu_abbrev~., scales = 'fixed') +
  geom_line(aes(group = type)) +
  ylab("Log10 relative abundance") +
  xlab("Visit Number") +
  labs(color="Observation Types")

EP587032_me <- mean_eff %>%
  filter(host_subject_id == "EP587032") %>%
  ggplot(aes(host_visit_number, mean_efficiency)) +
  #scale_y_log10() +
  scale_y_log10(limits = c(1e-2, 1e1)) +
  geom_line(aes(group = 1)) +
  ylab("Log10 Mean Efficiency") +
  xlab("Visit Number")


EP587032_lines / EP587032_me + plot_layout(heights = c(1, 0.1))

## Warning: Transformation introduced infinite values in continuous y-axis

This patient is a very interesting example as there is sign changes in almost all of the taxa being measured. This is also accompanied by a decrease in the mean effiency due to an increase in Streptococcus aglactiae.

Overall fold changes

#get a list of subjects in the dataset
Subject_names <- unique(sample_data(ps.cal)$host_subject_id)

#remove subjects with only 1 samples
one_sample_subjects <- names(which(table(sample_data(ps.cal)$host_subject_id) ==1 ))
Subject_names <- Subject_names[! Subject_names %in% one_sample_subjects]

nonpruned_fc_changes <- readRDS("~/GitHub_Repos/MOMSPI/momspi/data/FC_data/FC_changes_all_nonprune.RDS")
All_fc_change <- list()
for(i in Subject_names){
  message(i)
  All_fc_change[[i]] <- Get_total_fc(nonpruned_fc_changes[[i]])
  
}

View(All_fc_change)
ALL_fc_change_df <- do.call(rbind, All_fc_change)
saveRDS(ALL_fc_change_df, "~/GitHub_Repos/MOMSPI/momspi/data/FC_data/FC_changes_all_nonprune_df.RDS")

All Fold Change Comparisons

All_fc_change_df <- readRDS("~/GitHub_Repos/MOMSPI/momspi/data/FC_data/FC_changes_all_nonprune_df.RDS")


### First lets contrict to just control taxa
All_fc_change_df <- All_fc_change_df %>% filter(.otu %in% control_species)
### remove start samples
All_fc_change_df <- All_fc_change_df %>% filter(change != "START")

#remove when FC was the same usually this is when FC is NaN
#this is not a completely "fair" way to filter the data most likely...
All_fc_change_df <- All_fc_change_df %>% filter(change != "SAME")

split_dfs <- split(All_fc_change_df, All_fc_change_df$Type)

identical(split_dfs[[1]]$.otu, split_dfs[[2]]$.otu)

## [1] TRUE

identical(split_dfs[[1]]$.sample, split_dfs[[2]]$.sample)

## [1] TRUE

split_dfs[[1]]$obs_val <- split_dfs[[2]]$fold_change_from_last
split_dfs[[1]]$obs_change <- split_dfs[[2]]$change

test_plot <- ggplot(split_dfs[[1]], aes(y=log(fold_change_from_last), x=log(obs_val), color=.otu)) + geom_point() +
  xlab("log(Observed Fold Changes)") + ylab("log(Calibrated Fold Changes)") + geom_abline(intercept=0, slope=1)
test_plot

Lets examine these results looking at specific taxa

taxa_split_dfs <- split(split_dfs[[1]], split_dfs[[1]]$.otu)

fc_change_perc <- c()
for(i in control_species){
  
  agg_occ <- table(taxa_split_dfs[[i]]$change, taxa_split_dfs[[i]]$obs_change)
  total_dis <- sum(agg_occ[1,2], agg_occ[2,1])
  fc_change_perc[i] <- total_dis/dim(taxa_split_dfs[[i]])[1]*100
}

fc_change_perc

##        Atopobium_vaginae    Gardnerella_vaginalis  Lactobacillus_crispatus 
##                 5.075758                 5.708391                 6.556246 
##      Lactobacillus_iners         Prevotella_bivia           Sneathia_amnii 
##                 6.177868                 3.516820                 4.303087 
## Streptococcus_agalactiae 
##                 2.127660

agreeance_plots <- list()
for(i in control_species){
  
  agreeance_plots[[i]] <- ggplot(taxa_split_dfs[[i]], aes(y=log(fold_change_from_last), x=log(obs_val), color=.otu)) + geom_point() +
  xlab("log(Observed Fold Changes)") + ylab("log(Calibrated Fold Changes)") + geom_abline(intercept=0, slope=1)
}

Atopobium vaginae

agreeance_plots[1]

## $Atopobium_vaginae

Gardnerella vaginalis

agreeance_plots[2]

## $Gardnerella_vaginalis

Lactobacillus crispatus

agreeance_plots[3]

## $Lactobacillus_crispatus

Lactobacillus iners

agreeance_plots[4]

## $Lactobacillus_iners

Prevotella bivia

agreeance_plots[5]

## $Prevotella_bivia

Sneathia amnii

agreeance_plots[6]

## $Sneathia_amnii

Steptococcus agalactiae

agreeance_plots[7]

## $Streptococcus_agalactiae

Percent occurences of fold change differences

#table showing total occurences of when things don't agree
agreeance_occurances <- table(split_dfs[[1]]$change, split_dfs[[1]]$obs_change)

total_disagreeance <- sum(agreeance_occurances[1,2],agreeance_occurances[2,1])
total_disagreeance/dim(split_dfs[[1]])[1]*100

## [1] 5.160374

### in total it looks like it occurs roughly 5% of the time regardless of taxa

## Note this is after filtering out start samples and samples that didn't have a fold change from their last visit.