Showing posts with label geometric mean. Show all posts
Showing posts with label geometric mean. Show all posts

Monday, September 9, 2013

To see if the geometric mean could be used to predict body size, I performed a phylogenetic regression.

Here is the following script used to generate the data for the paper:
(start with body_size.csv).

library (caper)
#make a comparative data frame
data<-comparative.data(phy=tree,data=collapsed.babbler,names.col=matched.tip )

#make two different models; one using maximum likelihood and one using browning motion
mod1<-pgls(log(mean.mass)~log(mean.gm),data, lambda = "ML")
mod2<-pgls(log(mean.mass)~log(mean.gm),data)

#compare the two models using log likelihood
logLik(mod1)
logLik(mod2)

#results of ML model

#results of brownian motion model


#when comparing the two models, compare the LL--since very close, use simpler model (Brownian motion)

Here is the ggplot2 code to generate the scatterplot:
library(ggplot2)
qplot(log(mean.gm),log(mean.mass), data=new,xlab = "Geometric Mean (log)", ylab = "Body Mass (log)",  size = I(7))+theme_bw()+theme(text = element_text(size=30))+ geom_abline(intercept=-1.82487, slope=2.28979)

Thursday, June 6, 2013

PCAs and reweights of the values for each species is finally done and in a format that can be easily used to run the evol.rate.mcmc()


Script to impute data:
#impute missing values by taking mean of all other values of of other species ofthat character 

i<-1 #set the index
species.removed<-c()
#sift through all babbler species
while (i <= length(babblers$UniqueID_1)){
  #missing wing lengths; if the Wing Length is NA for that value
  if(is.na(babblers$Wing.Length[i]) == TRUE){
    #select the wing lengths of all birds which match this species with missing wing length
    wing.species.vector<-babblers$Wing.Length[which(babblers$Species[i] == babblers$Species)]
    #make a new vector without the missing value
    wing.species.filler<-wing.species.vector[which(is.na(wing.species.vector) == FALSE)]
    #calculate the mean
    wing.impute<-mean(wing.species.filler)
    #fill in missing value with the mean for that bird
    babblers$Wing.Length[i]<-wing.impute
  }
  
  #repeat the above method for all characters
  if(is.na(babblers$Tail.Length[i]) == TRUE){
    tail.species.vector<-babblers$Tail.Length[which(babblers$Species[i] == babblers$Species)]
    tail.species.filler<-tail.species.vector[which(is.na(tail.species.vector) == FALSE)]
    tail.impute<-mean(tail.species.filler)
    babblers$Tail.Length[i]<-tail.impute
  }
  
  
  
  if(is.na(babblers$Bill.Length[i]) == TRUE){
    BL.species.vector<-babblers$Bill.Length[which(babblers$Species[i] == babblers$Species)]
    BL.species.filler<-BL.species.vector[which(is.na(BL.species.vector) == FALSE)]
    BL.impute<-mean(BL.species.filler)
    babblers$Bill.Length[i]<-BL.impute
  }
  
  if(is.na(babblers$Bill.Width[i]) == TRUE){
    BW.species.vector<-babblers$Bill.Width[which(babblers$Species[i] == babblers$Species)]
    BW.species.filler<-BW.species.vector[which(is.na(BW.species.vector) == FALSE)]
    BW.impute<-mean(BW.species.filler)
    babblers$Bill.Width[i]<-BW.impute
  }
  
  if(is.na(babblers$Bill.Depth[i]) == TRUE){
    BD.species.vector<-babblers$Bill.Depth[which(babblers$Species[i] == babblers$Species)]
    BD.species.filler<-BD.species.vector[which(is.na(BD.species.vector) == FALSE)]
    BD.impute<-mean(BD.species.filler)
    babblers$Bill.Depth[i]<-BD.impute
  }
  
  if(is.na(babblers$Tarsus.Length[i]) == TRUE){
    TL.species.vector<-babblers$Tarsus.Length[which(babblers$Species[i] == babblers$Species)]
    TL.species.filler<-TL.species.vector[which(is.na(TL.species.vector) == FALSE)]
    TL.impute<-mean(BD.species.filler)
    babblers$Tarsus.Length[i]<-TL.impute
  }
  
  if(is.na(babblers$Hallux[i]) == TRUE){
    hallux.species.vector<-babblers$Hallux[which(babblers$Species[i] == babblers$Species)]
    hallux.species.filler<-hallux.species.vector[which(is.na(hallux.species.vector) == FALSE)]
    hallux.impute<-mean(hallux.species.filler)
    babblers$Hallux[i]<-hallux.impute
  }

  i<-i+1
}

Script to scale by geometric mean:
#make empty columns for geometric mean scale
babblers$gm.wing<-0
babblers$gm.tail<-0
babblers$gm.BL<-0
babblers$gm.BW<-0
babblers$gm.BD<-0
babblers$gm.tarsus<-0
babblers$gm.hallux<-0

#Size-adjust the raw data by dividing each individual by the geometric mean 

j<-1
could.not.scale<-c()
while(j<=length(babblers$UniqueID_1)){
  #grab all the data of that species and put in vector
  temp<-c(babblers$Wing.Length[j], babblers$Tail.Length[j], babblers$Bill.Length[j],
          babblers$Bill.Width[j], babblers$Bill.Depth[j], babblers$Tarsus.Length[j], 
          babblers$Hallux[j])
  temp<-na.omit(temp)
  #calculate the geometric mean of meausrements for that species
  geo.mean<-exp(mean(log(temp)))
  
  if(is.nan(geo.mean)==FALSE){
  #divide all values for that species by the geometric mean
  geo.wing<-(babblers$Wing.Length[j])/geo.mean
  babblers$gm.wing[j]<-geo.wing
  
  geo.tail<-babblers$Tail.Length[j]/geo.mean
  babblers$gm.tail[j]<-geo.tail
  
  geo.BL<-babblers$Bill.Length[j]/geo.mean
  babblers$gm.BL[j]<-geo.BL
  
  geo.BW<-babblers$Bill.Width[j]/geo.mean
  babblers$gm.BW[j]<-geo.BW
  
  geo.BD<-babblers$Bill.Depth[j]/geo.mean
  babblers$gm.BD[j]<-geo.BD
  
  geo.tarsus<-babblers$Tarsus.Length[j]/geo.mean
  babblers$gm.tarsus[j]<-geo.tarsus
  
  geo.hallux<-babblers$Hallux[j]/geo.mean
  babblers$gm.hallux[j]<-geo.hallux
 }
 else{
    append(could.not.scale, babblers$UniqueID_1[j])
  }
  #reset the temp
  temp<-c()
  
  #increment to next species
  j<-j+1
}

cat("The following species could not be scaled:", could.not.scale, "\n")

Script to do PCA with geometric mean values and put rescaled scores into new data frame:
#PCA for all measurements
pca.result <- prcomp(~babblers$gm.wing + babblers$gm.tail + babblers$gm.BL + babblers$gm.BW + babblers$gm.BD + 
       babblers$gm.tarsus + babblers$gm.hallux, data = babblers, retx = T)

#standard deviation of PCA
sd <- pca.result$sdev
#make vector of weights
loadings <- pca.result$rotation
#value of the rotated data (scores)
scores <- pca.result$x

#make a new data frame
new.babblers<-as.data.frame(babblers)

#make empty columns in the data frame
new.babblers$pca1<-0
new.babblers$pca2<-0
new.babblers$pca3<-0
new.babblers$pca4<-0
new.babblers$pca5<-0
new.babblers$pca6<-0
new.babblers$pca7<-0

#put PCA Scores into the data frame
new.babblers$pca1<-scores[,1]
new.babblers$pca2<-scores[,2]
new.babblers$pca3<-scores[,3]
new.babblers$pca4<-scores[,4]
new.babblers$pca5<-scores[,5]
new.babblers$pca6<-scores[,6]
new.babblers$pca7<-scores[,7]

new.babblers$pca# are the traits that will be used in the Bayesian analysis.
This site was really useful for understanding PCA:
http://strata.uga.edu/software/pdf/pcaTutorial.pdf

Tuesday, May 21, 2013

Project for Evolution 2013 meeting: predict if diversification of morphology = diversification of species.

Collected over 600 measurements from AMNH of wing length, tail length, bill length, bill width, bill depth, tarsus length, and hallux. I still need to combine the data collected from the museums in Vietnam.

First step is to impute missing data by taking the mean of that character from other measurements of the same species.

The following is the R script I wrote to do this:


#impute missing values by taking mean of all other values of of other species ofthat character 

i<-1 #set the index
species.removed<-c()
#sift through all babbler species
while (i < length(babblers$UniqueID_1)){
  #missing wing lengths; if the Wing Length is NA for that value
  if(is.na(babblers$Wing.Length[i]) == TRUE){
    #select the wing lengths of all birds which match this species with missing wing length
    wing.species.vector<-babblers$Wing.Length[which(babblers$Species[i] == babblers$Species)]
    #make a new vector without the missing value
    wing.species.filler<-wing.species.vector[which(is.na(wing.species.vector) == FALSE)]
    #calculate the mean
    wing.impute<-mean(wing.species.filler)
    #fill in missing value with the mean for that bird
    babblers$Wing.Length[i]<-wing.impute
  }
  
  #repeat the above method for all characters
  if(is.na(babblers$Tail.Length[i]) == TRUE){
    tail.species.vector<-babblers$Tail.Length[which(babblers$Species[i] == babblers$Species)]
    tail.species.filler<-tail.species.vector[which(is.na(tail.species.vector) == FALSE)]
    tail.impute<-mean(tail.species.filler)
    babblers$Tail.Length[i]<-tail.impute
  }
  
  if(is.na(babblers$Bill.Length[i]) == TRUE){
    BL.species.vector<-babblers$Bill.Length[which(babblers$Species[i] == babblers$Species)]
    BL.species.filler<-BL.species.vector[which(is.na(BL.species.vector) == FALSE)]
    BL.impute<-mean(BL.species.filler)
    babblers$Bill.Length[i]<-BL.impute
  }
  
  if(is.na(babblers$Bill.Width[i]) == TRUE){
    BW.species.vector<-babblers$Bill.Width[which(babblers$Species[i] == babblers$Species)]
    BW.species.filler<-BW.species.vector[which(is.na(BW.species.vector) == FALSE)]
    BW.impute<-mean(BW.species.filler)
    babblers$Bill.Width[i]<-BW.impute
  }
  
  if(is.na(babblers$Bill.Depth[i]) == TRUE){
    BD.species.vector<-babblers$Bill.Depth[which(babblers$Species[i] == babblers$Species)]
    BD.species.filler<-BD.species.vector[which(is.na(BD.species.vector) == FALSE)]
    BD.impute<-mean(BD.species.filler)
    babblers$Bill.Depth[i]<-BD.impute
  }
  
  if(is.na(babblers$Tarsus.Length[i]) == TRUE){
    TL.species.vector<-babblers$Tarsus.Length[which(babblers$Species[i] == babblers$Species)]
    TL.species.filler<-TL.species.vector[which(is.na(TL.species.vector) == FALSE)]
    TL.impute<-mean(BD.species.filler)
    babblers$Tarsus.Length[i]<-TL.impute
  }
  
  if(is.na(babblers$Hallux[i]) == TRUE){
    hallux.species.vector<-babblers$Hallux[which(babblers$Species[i] == babblers$Species)]
    hallux.species.filler<-hallux.species.vector[which(is.na(hallux.species.vector) == FALSE)]
    hallux.impute<-mean(hallux.species.filler)
    babblers$Hallux[i]<-hallux.impute
  }
  #if there are any values that canont be computed, remove them from the data set
  if(is.nan(babblers$Wing.Length[i]) == TRUE){
    babblers<-babblers[-i,]
    append(species.removed, babblers$UniqueID_1[i])
  }
    
  i<-i+1
}

cat("The following species were removed from the analysis:", species.removed, "\n")

I have then scaled each point to take size into account by dividing each by the geometric mean of all characters for each species. The following is the R script I wrote in order to do that:


#Size-adjust the raw data by dividing each individual by the geometric mean 
j<-1
could.not.scale<-c()
while(j<length(babblers$UniqueID_1)){
  #grab all the data of that species and put in vector
  temp<-c(babblers$Wing.Length[j], babblers$Tail.Length[j], babblers$Bill.Length[j],
          babblers$Bill.Width[j], babblers$Bill.Depth[j], babblers$Tarsus.Length[j], 
          babblers$Hallux[j])

  #calculate the geometric mean of meausrements for that species
  geo.mean<-exp(mean(log(temp)))
  
  if(is.nan(geo.mean)==FALSE){
   #divide all values for that species by the geometric mean
   geo.wing<-(babblers$Wing.Length[j])/geo.mean
   babblers$Wing.Length[j]<-geo.wing
  
   geo.tail<-babblers$Tail.Length[j]/geo.mean
   babblers$Tail.Length[j]<-geo.tail
   
   geo.BL<-babblers$Bill.Length[j]/geo.mean
   babblers$Bill.Length[j]<-geo.BL
  
   geo.BW<-babblers$Bill.Width[j]/geo.mean
   babblers$Bill.Width[j]<-geo.BW
   
   geo.BD<-babblers$Bill.Depth[j]/geo.mean
   babblers$Bill.Depth[j]<-geo.BD
   
   geo.tarsus<-babblers$Tarsus.Length[j]/geo.mean
   babblers$Tarsus.Length[j]<-geo.tarsus
  
   geo.hallux<-babblers$Hallux[j]/geo.mean
   babblers$Hallux[j]<-geo.hallux
  }
  else{
    append(could.not.scale, babblers$UniqueID_1[j])
  }
  #reset the temp
  temp<-c()
  
  #increment to next species
  j<-j+1
}

cat("The following species could not be scaled:", could.not.scale, "\n")

Now I can run a PCA on the 7 measurements to determine what holds the most variance:

#PCA for all measurements
pca.result <- prcomp(~babblers$Wing.Length + babblers$Tail.Length + babblers$Bill.Length + babblers$Bill.Width + babblers$Bill.Depth + babblers$Tarsus.Length + babblers$Hallux, data = babblers, retx = T)