Capstone Code

# df with proportions
dat_prop <- dat1 %>% 
  mutate(Diatoms_prop = Diatoms/TotalCells,
        Dinoflagellates_prop = Dinoflagellates/TotalCells,
        Cyanobacteria_prop = Cyanobacteria/TotalCells,
        Chlorophyta_prop = Chlorophyta/TotalCells,
        Chrysophyta_prop = Chrysophyta/TotalCells,
        uniqueID = paste(Date, Site),
        Date = as.character(Date),
        Month = str_split_fixed(Date, "/", 2)[ ,1],
        Day = str_split_fixed(Date, "/", 2)[ ,2],
        Date = dmy(paste(Day, Month, 2020)))

# df for pie chart 
dat_pie_prop <- dat_prop %>% 
  select(-Diatoms, -Dinoflagellates, -Cyanobacteria, -Chlorophyta, -Chrysophyta, -TotalCells) %>% 
  pivot_longer(cols = 4:8, names_to = "phyto_type") %>% 
  group_by(ID) 

# pie chart of proportions for each sample
ggplot(dat_pie_prop, aes(x="", y=value, fill=phyto_type)) +
  facet_wrap(~ ID, ncol = 10) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y") + 
  scale_fill_manual(values=c("#24AB55", "#FF7A32", "#FFC433", 
                             "#308DF5","#D83535"),
                    labels=c("Chlorophytes", "Chrysophytes", "Cyanobacteria", 
                             "Diatoms", "Dinoflagellates"),
                    name="Phytoplankton taxonmic group") +
  ylab("Site") + xlab("Date") +
  theme(axis.text.x=element_blank()) +
  ggtitle("Proportion of phytoplankton taxonomic groups in each sample")

# line plot proportions for each site over time
dat_line_prop <- dat_prop %>% 
  select(-Diatoms, -Dinoflagellates, -Cyanobacteria, -Chlorophyta, -Chrysophyta, -TotalCells) %>% 
  pivot_longer(cols = 4:8, names_to = "phyto_type") %>% 
  group_by(Site)

ggplot(dat_line_prop, aes(x=Date, y=value, color=phyto_type)) +
  facet_wrap(~ Site, nrow =2) + geom_line() + 
  scale_color_manual(values=c("#24AB55", "#FF7A32", "#FFC433", 
                               "#308DF5","#D83535", "black"),
                     labels=c("Chlorophytes", "Chrysophytes", "Cyanobacteria", 
                              "Diatoms", "Dinoflagellates"),
                     name="Phytoplankton taxonomic group") +
  theme_classic() + ylab("Proportion") +
  theme(legend.position="bottom") +
  guides(col = guide_legend(nrow = 1, byrow = TRUE)) +
  theme(legend.text=element_text(size=12))

# 5 taxonomic groups averaged by date over time
dateavg_datp <- dat_prop %>% 
  group_by(Date) %>% 
  summarise(Diatoms_mean = mean(Diatoms_prop),
            Dinoflagellates_mean = mean(Dinoflagellates_prop),
            Cyanobacteria_mean = mean(Cyanobacteria_prop),
            Chlorophyta_mean = mean(Chlorophyta_prop),
            Chrysophyta_mean = mean(Chrysophyta_prop)) %>% 
  pivot_longer(cols = -1, names_to = "phyto_type")

# Create date vector (IMPORTANT FOR ALL PLOTS FOLLOWING)
dates <- unique(dateavg_datp$Date)

# most important capstone plot: phyto community structure over time
ggplot(dateavg_datp, aes(x=Date, y=value, color=phyto_type)) + geom_line() + geom_point() +
  scale_color_manual(values=c("#24AB55", "#FF7A32", "#FFC433", 
                              "#308DF5","#D83535", "black"),
                     labels=c("Chlorophytes", "Chrysophytes", "Cyanobacteria", 
                              "Diatoms", "Dinoflagellates"),
                     name="Phytoplankton taxonomic group") +
  theme_classic() + ylab("Proportion") + 
  scale_x_date(breaks = dates, date_labels = "%b %d") +
  theme(legend.position="bottom") +
  guides(col = guide_legend(nrow = 3, byrow = TRUE))

# Simpson and Shannon Diversity indices plots

phytoData2 <- read.csv("sites_data.csv") #load phytoplankton data

phytoData3 <- phytoData2 %>% 
  mutate(Date = as.character(Date),
         Month = str_split_fixed(Date, "/", 2)[ ,1],
         Day = str_split_fixed(Date, "/", 2)[ ,2],
         Date = dmy(paste(Day, Month, 2020)))

## Warning: 930 failed to parse.

phytoData3$simpDiversity <- diversity(phytoData2[,3:7], "simpson") #simpson index
phytoData3$shanDiversity <- diversity(phytoData2[,3:7], "shannon") #shannon index

# simpson boxplot
ggplot(phytoData3, aes(x = Date, y = simpDiversity, group = Date)) + 
  geom_boxplot() + ylab("Simpson Diversity Index") +
  scale_x_date(breaks = dates, date_labels = "%b %d") + theme_classic()

## Warning: Removed 930 rows containing missing values (stat_boxplot).

# shannon boxplot
ggplot(phytoData3, aes(x = Date, y = shanDiversity, group = Date)) + 
  geom_boxplot(fill = "bisque") + ylab("Shannon Diversity Index")

## Warning: Removed 930 rows containing missing values (stat_boxplot).

# Boxplot and ANOVA tests
# boxplots
ggplot(dat_prop, aes(x = Date, y = Cyanobacteria_prop, group = Date)) + 
  geom_boxplot(fill = "bisque") + ylab("Proportion of Cyanobacteria") +
  theme_classic() + scale_x_date(breaks = dates, date_labels = "%b %d")

# anova
anova.test <- anova(lm(Cyanobacteria_prop ~ Date, dat_prop))

# anova test for difference in date
as.factor(dat_prop$Date) -> dat_prop$Date
aov(Cyanobacteria_prop ~ Date, dat_prop) -> aov.test
summary(aov.test)

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## Date         5  1.678  0.3356   12.13 6.75e-08 ***
## Residuals   54  1.494  0.0277                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

TukeyHSD(aov.test)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Cyanobacteria_prop ~ Date, data = dat_prop)
## 
## $Date
##                              diff         lwr         upr     p adj
## 2020-09-15-2020-09-08  0.06438642 -0.15536964  0.28414247 0.9530198
## 2020-09-22-2020-09-08  0.39724737  0.17749131  0.61700342 0.0000272
## 2020-09-29-2020-09-08  0.41949284  0.19973678  0.63924889 0.0000093
## 2020-10-06-2020-09-08  0.36916506  0.14940901  0.58892112 0.0001030
## 2020-10-20-2020-09-08  0.16352612 -0.05622994  0.38328217 0.2554076
## 2020-09-22-2020-09-15  0.33286095  0.11310490  0.55261701 0.0005456
## 2020-09-29-2020-09-15  0.35510642  0.13535037  0.57486248 0.0001982
## 2020-10-06-2020-09-15  0.30477864  0.08502259  0.52453470 0.0018714
## 2020-10-20-2020-09-15  0.09913970 -0.12061635  0.31889576 0.7655872
## 2020-09-29-2020-09-22  0.02224547 -0.19751059  0.24200152 0.9996603
## 2020-10-06-2020-09-22 -0.02808231 -0.24783836  0.19167375 0.9989451
## 2020-10-20-2020-09-22 -0.23372125 -0.45347730 -0.01396519 0.0308790
## 2020-10-06-2020-09-29 -0.05032778 -0.27008383  0.16942828 0.9838049
## 2020-10-20-2020-09-29 -0.25596672 -0.47572277 -0.03621066 0.0136050
## 2020-10-20-2020-10-06 -0.20563894 -0.42539500  0.01411711 0.0791028

# anova test for difference in site
anova.test_cyano_site <- anova(lm(Cyanobacteria_prop ~ Site, dat_prop))
aov.test_cyano_site <- aov(Cyanobacteria_prop ~ Site, dat_prop)
summary(aov.test_cyano_site)

##             Df Sum Sq Mean Sq F value Pr(>F)
## Site         9 0.3115 0.03461   0.605  0.787
## Residuals   50 2.8603 0.05721

TukeyHSD(aov.test_cyano_site)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Cyanobacteria_prop ~ Site, data = dat_prop)
## 
## $Site
##             diff        lwr       upr     p adj
## B-A  0.099133696 -0.3579798 0.5562472 0.9992646
## C-A -0.019027993 -0.4761415 0.4380855 1.0000000
## D-A  0.169575191 -0.2875383 0.6266887 0.9640971
## E-A  0.185667187 -0.2714464 0.6427807 0.9377740
## F-A  0.101288079 -0.3558255 0.5584016 0.9991276
## G-A  0.045459192 -0.4116543 0.5025727 0.9999990
## H-A  0.199222276 -0.2578913 0.6563358 0.9073106
## I-A  0.103293776 -0.3538198 0.5604073 0.9989814
## J-A  0.153669321 -0.3034442 0.6107829 0.9810691
## C-B -0.118161688 -0.5752752 0.3389518 0.9971190
## D-B  0.070441495 -0.3866720 0.5275550 0.9999560
## E-B  0.086533492 -0.3705800 0.5436470 0.9997551
## F-B  0.002154383 -0.4549592 0.4592679 1.0000000
## G-B -0.053674504 -0.5107880 0.4034390 0.9999957
## H-B  0.100088581 -0.3570250 0.5572021 0.9992063
## I-B  0.004160080 -0.4529535 0.4612736 1.0000000
## J-B  0.054535626 -0.4025779 0.5116492 0.9999951
## D-C  0.188603184 -0.2685104 0.6457167 0.9318450
## E-C  0.204695180 -0.2524184 0.6618087 0.8927411
## F-C  0.120316071 -0.3367975 0.5774296 0.9966983
## G-C  0.064487185 -0.3926264 0.5216007 0.9999792
## H-C  0.218250269 -0.2388633 0.6753638 0.8510451
## I-C  0.122321769 -0.3347918 0.5794353 0.9962627
## J-C  0.172697314 -0.2844162 0.6298109 0.9597679
## E-D  0.016091996 -0.4410215 0.4732055 1.0000000
## F-D -0.068287113 -0.5254007 0.3888264 0.9999662
## G-D -0.124115999 -0.5812295 0.3329975 0.9958344
## H-D  0.029647085 -0.4274665 0.4867606 1.0000000
## I-D -0.066281415 -0.5233950 0.3908321 0.9999738
## J-D -0.015905870 -0.4730194 0.4412077 1.0000000
## F-E -0.084379109 -0.5414926 0.3727344 0.9998010
## G-E -0.140207995 -0.5973215 0.3169055 0.9899242
## H-E  0.013555089 -0.4435584 0.4706686 1.0000000
## I-E -0.082373411 -0.5394869 0.3747401 0.9998368
## J-E -0.031997866 -0.4891114 0.4251157 1.0000000
## G-F -0.055828886 -0.5129424 0.4012847 0.9999940
## H-F  0.097934198 -0.3591793 0.5550477 0.9993327
## I-F  0.002005698 -0.4551078 0.4591192 1.0000000
## J-F  0.052381243 -0.4047323 0.5094948 0.9999965
## H-G  0.153763084 -0.3033505 0.6108766 0.9809916
## I-G  0.057834584 -0.3992790 0.5149481 0.9999919
## J-G  0.108210129 -0.3489034 0.5653237 0.9985338
## I-H -0.095928500 -0.5530420 0.3611850 0.9994346
## J-H -0.045552955 -0.5026665 0.4115606 0.9999990
## J-I  0.050375545 -0.4067380 0.5074891 0.9999975

# Environmental Variables plots

# import the data
env_dat <- read.csv("environmental_variables_data.csv")

env_dat2 <- env_dat %>% 
  mutate(Date = as.character(Date),
         Month = str_split_fixed(Date, "/", 2)[ ,1],
         Day = str_split_fixed(Date, "/", 2)[ ,2],
         Date = dmy(paste(Day, Month, 2020)))

# Chl-a and cyano final plot
coeff <- 100
# create plot 
ggplot(env_dat2, aes(x=Date)) +
  geom_line(aes(y=Prop_cyano, color = "Proportion of Cyanobacteria")) + 
  geom_point(aes(y=Prop_cyano, color = "Proportion of Cyanobacteria")) +
  geom_line(aes(y=Chl_a_conc / coeff, color = "Chl-a concentration")) + 
  geom_point(aes(y=Chl_a_conc / coeff, color = "Chl-a concentration")) +
  scale_y_continuous(name = "Proportion of Cyanobacteria",
    sec.axis = sec_axis(~.*coeff, name="Chl-a concentration")) +
  theme_classic() + scale_x_date(breaks = dates, date_labels = "%b %d") +
  theme(legend.position="bottom") +
  guides(col = guide_legend(nrow = 1, byrow = TRUE)) +
  theme(legend.text=element_text(size=12), 
        axis.text=element_text(size=12),
        axis.title =element_text(size =14))

# load the nutrient data
nut_datb <- read.csv("nutrientdat copy.csv")

# correlation: cyano vs nutrient ratio 
ggplot(nut_datb, aes(x = din_srp_ratio, y=cyano_prop)) + geom_point() +
  ylab("Proportion of Cyanobacteria") + xlab("DIN:SRP ratio") +
  theme_classic() + stat_smooth(method ="lm")

regression1 <- lm(cyano_prop ~ din_srp_ratio, dat = nut_datb)
summary(regression1)

## 
## Call:
## lm(formula = cyano_prop ~ din_srp_ratio, data = nut_datb)
## 
## Residuals:
##        1        2        3        4        5 
## -0.28231 -0.14265  0.17243  0.09463  0.15791 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   6.668e-01  1.686e-01   3.955   0.0289 *
## din_srp_ratio 2.189e-05  5.211e-05   0.420   0.7028  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2336 on 3 degrees of freedom
## Multiple R-squared:  0.05553,    Adjusted R-squared:  -0.2593 
## F-statistic: 0.1764 on 1 and 3 DF,  p-value: 0.7028

# Abiotic factors plots
se_samp_dat <- read.csv("sites_environmental_data.csv")

se_samp_dat2 <- se_samp_dat %>% 
  mutate(Date = as.character(Date),
         Month = str_split_fixed(Date, "/", 2)[ ,1],
         Day = str_split_fixed(Date, "/", 2)[ ,2],
         Date = dmy(paste(Day, Month, 2020))) %>% 
  select(-Month, -Day)

# average by date
se_samp_dat_sum <- se_samp_dat2 %>%   
  group_by(Date) %>% 
  summarise(avg_s.m = mean(S.m),
            avg_temp = mean(Temp),
            avg_secchi = mean(Secchi))

# errors table
error_se_samp_dat_sum <- se_samp_dat2 %>%   
  group_by(Date) %>% 
  summarise(s.m_se = mean(S.m_se),
            temp_se = mean(Temp_se),
            secchi_se = mean(Secchi_se))
# flip errors table
error_flip <- error_se_samp_dat_sum %>% 
  rename("Mean Secchi Depth (m)" = secchi_se,
         "Mean Temperature (˚C)" = temp_se,
         "Mean Conductivity (S/m)" = s.m_se) %>% 
  pivot_longer(cols = 2:4, names_to = "Variable") %>% 
  arrange(Date, Variable) %>% 
  mutate(unique_id = c(1:18)) %>% 
  rename("se" = value)


# plot df
se_samp_dat_flip <- se_samp_dat_sum %>% 
  rename("Mean Secchi Depth (m)" = avg_secchi,
         "Mean Temperature (˚C)" = avg_temp,
         "Mean Conductivity (S/m)" = avg_s.m) %>% 
  pivot_longer(cols = 2:4, names_to = "Variable") %>% 
  arrange(Date, Variable) %>% 
  mutate(unique_id = c(1:18))

# join the dfs
se_samp_dat_plot <- left_join(se_samp_dat_flip, error_flip, by = "unique_id")

se_samp_dat_plot2 <- se_samp_dat_plot %>% 
  rename("Date" = Date.x,
         "Variable" = Variable.x) %>% 
  select(Date, Variable, value, se)

# 3 tier env variable plot for our SAMPLE data:
ggplot(se_samp_dat_plot2, aes(x=Date, y=value, group = 1, col = Variable)) +  
  geom_line() + geom_point() +
  geom_errorbar(aes(ymin=value-se, ymax=value+se), width=.2) +
  facet_grid(Variable ~ ., scales = "free") + theme_classic() +
  scale_x_date(breaks = dates, date_labels = "%b %d")

Capstone Code

Kirsten Miller

12/07/2020