Project for Evolution 2013 meeting: predict if diversification of morphology = diversification of species.
Collected over 600 measurements from AMNH of wing length, tail length, bill length, bill width, bill depth, tarsus length, and hallux. I still need to combine the data collected from the museums in Vietnam.
First step is to impute missing data by taking the mean of that character from other measurements of the same species.
The following is the R script I wrote to do this:
#impute missing values by taking mean of all other values of of other species ofthat character
i<-1 #set the index
species.removed<-c()
#sift through all babbler species
while (i < length(babblers$UniqueID_1)){
#missing wing lengths; if the Wing Length is NA for that value
if(is.na(babblers$Wing.Length[i]) == TRUE){
#select the wing lengths of all birds which match this species with missing wing length
wing.species.vector<-babblers$Wing.Length[which(babblers$Species[i] == babblers$Species)]
#make a new vector without the missing value
wing.species.filler<-wing.species.vector[which(is.na(wing.species.vector) == FALSE)]
#calculate the mean
wing.impute<-mean(wing.species.filler)
#fill in missing value with the mean for that bird
babblers$Wing.Length[i]<-wing.impute
}
#repeat the above method for all characters
if(is.na(babblers$Tail.Length[i]) == TRUE){
tail.species.vector<-babblers$Tail.Length[which(babblers$Species[i] == babblers$Species)]
tail.species.filler<-tail.species.vector[which(is.na(tail.species.vector) == FALSE)]
tail.impute<-mean(tail.species.filler)
babblers$Tail.Length[i]<-tail.impute
}
if(is.na(babblers$Bill.Length[i]) == TRUE){
BL.species.vector<-babblers$Bill.Length[which(babblers$Species[i] == babblers$Species)]
BL.species.filler<-BL.species.vector[which(is.na(BL.species.vector) == FALSE)]
BL.impute<-mean(BL.species.filler)
babblers$Bill.Length[i]<-BL.impute
}
if(is.na(babblers$Bill.Width[i]) == TRUE){
BW.species.vector<-babblers$Bill.Width[which(babblers$Species[i] == babblers$Species)]
BW.species.filler<-BW.species.vector[which(is.na(BW.species.vector) == FALSE)]
BW.impute<-mean(BW.species.filler)
babblers$Bill.Width[i]<-BW.impute
}
if(is.na(babblers$Bill.Depth[i]) == TRUE){
BD.species.vector<-babblers$Bill.Depth[which(babblers$Species[i] == babblers$Species)]
BD.species.filler<-BD.species.vector[which(is.na(BD.species.vector) == FALSE)]
BD.impute<-mean(BD.species.filler)
babblers$Bill.Depth[i]<-BD.impute
}
if(is.na(babblers$Tarsus.Length[i]) == TRUE){
TL.species.vector<-babblers$Tarsus.Length[which(babblers$Species[i] == babblers$Species)]
TL.species.filler<-TL.species.vector[which(is.na(TL.species.vector) == FALSE)]
TL.impute<-mean(BD.species.filler)
babblers$Tarsus.Length[i]<-TL.impute
}
if(is.na(babblers$Hallux[i]) == TRUE){
hallux.species.vector<-babblers$Hallux[which(babblers$Species[i] == babblers$Species)]
hallux.species.filler<-hallux.species.vector[which(is.na(hallux.species.vector) == FALSE)]
hallux.impute<-mean(hallux.species.filler)
babblers$Hallux[i]<-hallux.impute
}
#if there are any values that canont be computed, remove them from the data set
if(is.nan(babblers$Wing.Length[i]) == TRUE){
babblers<-babblers[-i,]
append(species.removed, babblers$UniqueID_1[i])
}
i<-i+1
}
cat("The following species were removed from the analysis:", species.removed, "\n")
I have then scaled each point to take size into account by dividing each by the geometric mean of all characters for each species. The following is the R script I wrote in order to do that:
#Size-adjust the raw data by dividing each individual by the geometric mean
j<-1
could.not.scale<-c()
while(j<length(babblers$UniqueID_1)){
#grab all the data of that species and put in vector
temp<-c(babblers$Wing.Length[j], babblers$Tail.Length[j], babblers$Bill.Length[j],
babblers$Bill.Width[j], babblers$Bill.Depth[j], babblers$Tarsus.Length[j],
babblers$Hallux[j])
#calculate the geometric mean of meausrements for that species
geo.mean<-exp(mean(log(temp)))
if(is.nan(geo.mean)==FALSE){
#divide all values for that species by the geometric mean
geo.wing<-(babblers$Wing.Length[j])/geo.mean
babblers$Wing.Length[j]<-geo.wing
geo.tail<-babblers$Tail.Length[j]/geo.mean
babblers$Tail.Length[j]<-geo.tail
geo.BL<-babblers$Bill.Length[j]/geo.mean
babblers$Bill.Length[j]<-geo.BL
geo.BW<-babblers$Bill.Width[j]/geo.mean
babblers$Bill.Width[j]<-geo.BW
geo.BD<-babblers$Bill.Depth[j]/geo.mean
babblers$Bill.Depth[j]<-geo.BD
geo.tarsus<-babblers$Tarsus.Length[j]/geo.mean
babblers$Tarsus.Length[j]<-geo.tarsus
geo.hallux<-babblers$Hallux[j]/geo.mean
babblers$Hallux[j]<-geo.hallux
}
else{
append(could.not.scale, babblers$UniqueID_1[j])
}
#reset the temp
temp<-c()
#increment to next species
j<-j+1
}
cat("The following species could not be scaled:", could.not.scale, "\n")
Now I can run a PCA on the 7 measurements to determine what holds the most variance:
Collected over 600 measurements from AMNH of wing length, tail length, bill length, bill width, bill depth, tarsus length, and hallux. I still need to combine the data collected from the museums in Vietnam.
First step is to impute missing data by taking the mean of that character from other measurements of the same species.
The following is the R script I wrote to do this:
#impute missing values by taking mean of all other values of of other species ofthat character
i<-1 #set the index
species.removed<-c()
#sift through all babbler species
while (i < length(babblers$UniqueID_1)){
#missing wing lengths; if the Wing Length is NA for that value
if(is.na(babblers$Wing.Length[i]) == TRUE){
#select the wing lengths of all birds which match this species with missing wing length
wing.species.vector<-babblers$Wing.Length[which(babblers$Species[i] == babblers$Species)]
#make a new vector without the missing value
wing.species.filler<-wing.species.vector[which(is.na(wing.species.vector) == FALSE)]
#calculate the mean
wing.impute<-mean(wing.species.filler)
#fill in missing value with the mean for that bird
babblers$Wing.Length[i]<-wing.impute
}
#repeat the above method for all characters
if(is.na(babblers$Tail.Length[i]) == TRUE){
tail.species.vector<-babblers$Tail.Length[which(babblers$Species[i] == babblers$Species)]
tail.species.filler<-tail.species.vector[which(is.na(tail.species.vector) == FALSE)]
tail.impute<-mean(tail.species.filler)
babblers$Tail.Length[i]<-tail.impute
}
if(is.na(babblers$Bill.Length[i]) == TRUE){
BL.species.vector<-babblers$Bill.Length[which(babblers$Species[i] == babblers$Species)]
BL.species.filler<-BL.species.vector[which(is.na(BL.species.vector) == FALSE)]
BL.impute<-mean(BL.species.filler)
babblers$Bill.Length[i]<-BL.impute
}
if(is.na(babblers$Bill.Width[i]) == TRUE){
BW.species.vector<-babblers$Bill.Width[which(babblers$Species[i] == babblers$Species)]
BW.species.filler<-BW.species.vector[which(is.na(BW.species.vector) == FALSE)]
BW.impute<-mean(BW.species.filler)
babblers$Bill.Width[i]<-BW.impute
}
if(is.na(babblers$Bill.Depth[i]) == TRUE){
BD.species.vector<-babblers$Bill.Depth[which(babblers$Species[i] == babblers$Species)]
BD.species.filler<-BD.species.vector[which(is.na(BD.species.vector) == FALSE)]
BD.impute<-mean(BD.species.filler)
babblers$Bill.Depth[i]<-BD.impute
}
if(is.na(babblers$Tarsus.Length[i]) == TRUE){
TL.species.vector<-babblers$Tarsus.Length[which(babblers$Species[i] == babblers$Species)]
TL.species.filler<-TL.species.vector[which(is.na(TL.species.vector) == FALSE)]
TL.impute<-mean(BD.species.filler)
babblers$Tarsus.Length[i]<-TL.impute
}
if(is.na(babblers$Hallux[i]) == TRUE){
hallux.species.vector<-babblers$Hallux[which(babblers$Species[i] == babblers$Species)]
hallux.species.filler<-hallux.species.vector[which(is.na(hallux.species.vector) == FALSE)]
hallux.impute<-mean(hallux.species.filler)
babblers$Hallux[i]<-hallux.impute
}
#if there are any values that canont be computed, remove them from the data set
if(is.nan(babblers$Wing.Length[i]) == TRUE){
babblers<-babblers[-i,]
append(species.removed, babblers$UniqueID_1[i])
}
i<-i+1
}
cat("The following species were removed from the analysis:", species.removed, "\n")
I have then scaled each point to take size into account by dividing each by the geometric mean of all characters for each species. The following is the R script I wrote in order to do that:
#Size-adjust the raw data by dividing each individual by the geometric mean
j<-1
could.not.scale<-c()
while(j<length(babblers$UniqueID_1)){
#grab all the data of that species and put in vector
temp<-c(babblers$Wing.Length[j], babblers$Tail.Length[j], babblers$Bill.Length[j],
babblers$Bill.Width[j], babblers$Bill.Depth[j], babblers$Tarsus.Length[j],
babblers$Hallux[j])
#calculate the geometric mean of meausrements for that species
geo.mean<-exp(mean(log(temp)))
if(is.nan(geo.mean)==FALSE){
#divide all values for that species by the geometric mean
geo.wing<-(babblers$Wing.Length[j])/geo.mean
babblers$Wing.Length[j]<-geo.wing
geo.tail<-babblers$Tail.Length[j]/geo.mean
babblers$Tail.Length[j]<-geo.tail
geo.BL<-babblers$Bill.Length[j]/geo.mean
babblers$Bill.Length[j]<-geo.BL
geo.BW<-babblers$Bill.Width[j]/geo.mean
babblers$Bill.Width[j]<-geo.BW
geo.BD<-babblers$Bill.Depth[j]/geo.mean
babblers$Bill.Depth[j]<-geo.BD
geo.tarsus<-babblers$Tarsus.Length[j]/geo.mean
babblers$Tarsus.Length[j]<-geo.tarsus
geo.hallux<-babblers$Hallux[j]/geo.mean
babblers$Hallux[j]<-geo.hallux
}
else{
append(could.not.scale, babblers$UniqueID_1[j])
}
#reset the temp
temp<-c()
#increment to next species
j<-j+1
}
cat("The following species could not be scaled:", could.not.scale, "\n")
Now I can run a PCA on the 7 measurements to determine what holds the most variance:
#PCA for all measurements
pca.result <- prcomp(~babblers$Wing.Length + babblers$Tail.Length + babblers$Bill.Length + babblers$Bill.Width + babblers$Bill.Depth + babblers$Tarsus.Length + babblers$Hallux, data = babblers, retx = T)
No comments:
Post a Comment