The DAPC process
Load adegenet
library(adegenet)
Read your data
trout_data <- read.genepop("../data/trout_riv.gen",
missing = "mean")
Converting data from a Genepop .gen file to a genind object...
File description: small_scale_trout_data
...done.
Find the number of clusters to explain your data
grps <- find.clusters(trout_data, max.n.clust = 20,
choose.n.clust = FALSE,
criterion = "diffNgroup",
n.pca = 500)
Run an initial DAPC
trout_dapc1 <- dapc(trout_data, grp = grps$grp, n.pca = 100, n.da = 2,
scannf = FALSE)
plot dapc result
scatter(trout_dapc1)
Find the optimal number of PCs to retain
a_optim <- optim.a.score(trout_dapc1)
Re-run DAPC for the optimal PC number
trout_dapc_final <- dapc(trout_data, grp = grps$grp,
n.pca = a_optim$best, n.da = 2)
re-plot results
scatter(trout_dapc_final)
define a function to subset the data
subsetter <- function(x){
ps <- length(x)
n <- round((ps/100)*20)
return(sample(x, n))
}
split the trout_data into training inds and supp inds
sup_ind <- unlist(tapply(1:nInd(trout_data),
pop(trout_data),
subsetter))
define a training data set
trn_data <- trout_data[-sup_ind]
define a supplementary dataset
sup_data <- trout_data[sup_ind]
re-run find.clusters process
trn_grp <- find.clusters(trn_data, max.n.clust = 20,
choose.n.clust = FALSE,
criterion = "diffNgroup",
n.pca = 500)
Now we run a dapc on the training data
trn_dapc1 <- dapc(trn_data, grp = trn_grp$grp, n.pca = 100, n.da = 2)
plot the initial dapc of the training set
scatter(trn_dapc1)
choose the optimal number of PCs
trn_a_optim <- optim.a.score(trn_dapc1)
Run a dapc using the optimised number of PCs
trn_dapc_final <- dapc(trn_data, grp = trn_grp$grp,
n.pca = trn_a_optim$best, n.da = 2)
Next we see how well the trn_dapc_final model predicts the supplementary individuals
pred_mod <- predict.dapc(trn_dapc_final,
newdata = sup_data)
visualise the model performance
convert training data to a dataframe
trn_points <- data.frame(x = trn_dapc_final$ind.coord[,1],
y = trn_dapc_final$ind.coord[,2],
t_points = trn_dapc_final$grp)
convert supplementary data to a dataframe
sup_points <- data.frame(x = pred_mod$ind.scores[,1],
y = pred_mod$ind.scores[,2],
s_points = pred_mod$assign)
plot the relationship between training individuals and supplementary individuals
library(ggplot2)
ggplot() +
geom_point(data = trn_points, aes(x = x, y = y, colour = t_points),
size = 10, alpha = 0.3) +
geom_point(data = sup_points, aes(x = x, y = y, colour = s_points),
pch = 15, size = 10) +
theme_bw()
Claculate the mean accuracy of assignment
mean((as.character(pred_mod$assign) == as.character(pop(sup_data))))
[1] 0.9772727