The DAPC process

Load adegenet

library(adegenet)

Read your data

trout_data <- read.genepop("../data/trout_riv.gen",
                           missing = "mean")

 Converting data from a Genepop .gen file to a genind object... 


File description:  small_scale_trout_data 

...done.

Find the number of clusters to explain your data

grps <- find.clusters(trout_data, max.n.clust = 20,
                      choose.n.clust = FALSE,
                      criterion = "diffNgroup",
                      n.pca = 500)

Run an initial DAPC

trout_dapc1 <- dapc(trout_data, grp = grps$grp, n.pca = 100, n.da = 2,
                    scannf = FALSE)

plot dapc result

scatter(trout_dapc1)

Find the optimal number of PCs to retain

a_optim <- optim.a.score(trout_dapc1)

Re-run DAPC for the optimal PC number

trout_dapc_final <- dapc(trout_data, grp = grps$grp,
                         n.pca = a_optim$best, n.da = 2)

re-plot results

scatter(trout_dapc_final)

define a function to subset the data

subsetter <- function(x){
  ps <- length(x)
  n <- round((ps/100)*20)
  return(sample(x, n))
}

split the trout_data into training inds and supp inds

sup_ind  <- unlist(tapply(1:nInd(trout_data),
                          pop(trout_data),
                          subsetter))

define a training data set

trn_data <- trout_data[-sup_ind]

define a supplementary dataset

sup_data <- trout_data[sup_ind]

re-run find.clusters process

trn_grp <- find.clusters(trn_data, max.n.clust = 20,
                         choose.n.clust = FALSE,
                         criterion = "diffNgroup",
                         n.pca = 500)

Now we run a dapc on the training data

trn_dapc1 <- dapc(trn_data, grp = trn_grp$grp, n.pca = 100, n.da = 2)

plot the initial dapc of the training set

scatter(trn_dapc1)

choose the optimal number of PCs

trn_a_optim <- optim.a.score(trn_dapc1)

Run a dapc using the optimised number of PCs

trn_dapc_final <- dapc(trn_data, grp = trn_grp$grp,
                       n.pca = trn_a_optim$best, n.da = 2)

Next we see how well the trn_dapc_final model predicts the supplementary individuals

pred_mod <- predict.dapc(trn_dapc_final,
                         newdata = sup_data)

visualise the model performance

convert training data to a dataframe

trn_points <- data.frame(x = trn_dapc_final$ind.coord[,1],
                         y = trn_dapc_final$ind.coord[,2],
                         t_points = trn_dapc_final$grp)

convert supplementary data to a dataframe

sup_points <- data.frame(x = pred_mod$ind.scores[,1],
                         y = pred_mod$ind.scores[,2],
                         s_points = pred_mod$assign)

plot the relationship between training individuals and supplementary individuals

library(ggplot2)
ggplot() +
  geom_point(data = trn_points, aes(x = x, y = y, colour = t_points),
             size = 10, alpha = 0.3) +
  geom_point(data = sup_points, aes(x = x, y = y, colour = s_points),
             pch = 15, size = 10) +
  theme_bw()

Claculate the mean accuracy of assignment

mean((as.character(pred_mod$assign) == as.character(pop(sup_data))))
[1] 0.9772727