Laurel's Lab Notebook

Thursday, June 6, 2013

Had a minor panic attack because I realized my missing tarsus values were being imputed with mean of the bill length which of course would lead to extreme variance in the trait. Anyways, fixed it right up and still got somewhat similar results. Whew.
New PCA result:

Fixed script:
#impute missing values by taking mean of all other values of of other species ofthat character

i<-1 #set the index
species.removed<-c()
#sift through all babbler species
while (i <= length(babblers$UniqueID_1)){
#missing wing lengths; if the Wing Length is NA for that value
if(is.na(babblers$Wing.Length[i]) == TRUE){
#select the wing lengths of all birds which match this species with missing wing length
wing.species.vector<-babblers$Wing.Length[which(babblers$Species[i] == babblers$Species)]
#make a new vector without the missing value
wing.species.filler<-wing.species.vector[which(is.na(wing.species.vector) == FALSE)]
#calculate the mean
wing.impute<-mean(wing.species.filler)
#fill in missing value with the mean for that bird
babblers$Wing.Length[i]<-wing.impute
}

#repeat the above method for all characters
if(is.na(babblers$Tail.Length[i]) == TRUE){
tail.species.vector<-babblers$Tail.Length[which(babblers$Species[i] == babblers$Species)]
tail.species.filler<-tail.species.vector[which(is.na(tail.species.vector) == FALSE)]
tail.impute<-mean(tail.species.filler)
babblers$Tail.Length[i]<-tail.impute
}



if(is.na(babblers$Bill.Length[i]) == TRUE){
BL.species.vector<-babblers$Bill.Length[which(babblers$Species[i] == babblers$Species)]
BL.species.filler<-BL.species.vector[which(is.na(BL.species.vector) == FALSE)]
BL.impute<-mean(BL.species.filler)
babblers$Bill.Length[i]<-BL.impute
}

if(is.na(babblers$Bill.Width[i]) == TRUE){
BW.species.vector<-babblers$Bill.Width[which(babblers$Species[i] == babblers$Species)]
BW.species.filler<-BW.species.vector[which(is.na(BW.species.vector) == FALSE)]
BW.impute<-mean(BW.species.filler)
babblers$Bill.Width[i]<-BW.impute
}

if(is.na(babblers$Bill.Depth[i]) == TRUE){
BD.species.vector<-babblers$Bill.Depth[which(babblers$Species[i] == babblers$Species)]
BD.species.filler<-BD.species.vector[which(is.na(BD.species.vector) == FALSE)]
BD.impute<-mean(BD.species.filler)
babblers$Bill.Depth[i]<-BD.impute
}

if(is.na(babblers$Tarsus.Length[i]) == TRUE){
TL.species.vector<-babblers$Tarsus.Length[which(babblers$Species[i] == babblers$Species)]
TL.species.filler<-TL.species.vector[which(is.na(TL.species.vector) == FALSE)]
TL.impute<-mean(TL.species.filler)
babblers$Tarsus.Length[i]<-TL.impute
}

if(is.na(babblers$Hallux[i]) == TRUE){
hallux.species.vector<-babblers$Hallux[which(babblers$Species[i] == babblers$Species)]
hallux.species.filler<-hallux.species.vector[which(is.na(hallux.species.vector) == FALSE)]
hallux.impute<-mean(hallux.species.filler)
babblers$Hallux[i]<-hallux.impute
}

i<-i+1
}

PCAs and reweights of the values for each species is finally done and in a format that can be easily used to run the evol.rate.mcmc()

Script to impute data:

#impute missing values by taking mean of all other values of of other species ofthat character

i<-1 #set the index

species.removed<-c()

#sift through all babbler species

while (i <= length(babblers$UniqueID_1)){

#missing wing lengths; if the Wing Length is NA for that value

if(is.na(babblers$Wing.Length[i]) == TRUE){

#select the wing lengths of all birds which match this species with missing wing length

wing.species.vector<-babblers$Wing.Length[which(babblers$Species[i] == babblers$Species)]

#make a new vector without the missing value

wing.species.filler<-wing.species.vector[which(is.na(wing.species.vector) == FALSE)]

#calculate the mean

wing.impute<-mean(wing.species.filler)

#fill in missing value with the mean for that bird

babblers$Wing.Length[i]<-wing.impute

}

#repeat the above method for all characters

if(is.na(babblers$Tail.Length[i]) == TRUE){

tail.species.vector<-babblers$Tail.Length[which(babblers$Species[i] == babblers$Species)]

tail.species.filler<-tail.species.vector[which(is.na(tail.species.vector) == FALSE)]

tail.impute<-mean(tail.species.filler)

babblers$Tail.Length[i]<-tail.impute

}

if(is.na(babblers$Bill.Length[i]) == TRUE){

BL.species.vector<-babblers$Bill.Length[which(babblers$Species[i] == babblers$Species)]

BL.species.filler<-BL.species.vector[which(is.na(BL.species.vector) == FALSE)]

BL.impute<-mean(BL.species.filler)

babblers$Bill.Length[i]<-BL.impute

}

if(is.na(babblers$Bill.Width[i]) == TRUE){

BW.species.vector<-babblers$Bill.Width[which(babblers$Species[i] == babblers$Species)]

BW.species.filler<-BW.species.vector[which(is.na(BW.species.vector) == FALSE)]

BW.impute<-mean(BW.species.filler)

babblers$Bill.Width[i]<-BW.impute

}

if(is.na(babblers$Bill.Depth[i]) == TRUE){

BD.species.vector<-babblers$Bill.Depth[which(babblers$Species[i] == babblers$Species)]

BD.species.filler<-BD.species.vector[which(is.na(BD.species.vector) == FALSE)]

BD.impute<-mean(BD.species.filler)

babblers$Bill.Depth[i]<-BD.impute

}

if(is.na(babblers$Tarsus.Length[i]) == TRUE){

TL.species.vector<-babblers$Tarsus.Length[which(babblers$Species[i] == babblers$Species)]

TL.species.filler<-TL.species.vector[which(is.na(TL.species.vector) == FALSE)]

TL.impute<-mean(BD.species.filler)

babblers$Tarsus.Length[i]<-TL.impute

}

if(is.na(babblers$Hallux[i]) == TRUE){

hallux.species.vector<-babblers$Hallux[which(babblers$Species[i] == babblers$Species)]

hallux.species.filler<-hallux.species.vector[which(is.na(hallux.species.vector) == FALSE)]

hallux.impute<-mean(hallux.species.filler)

babblers$Hallux[i]<-hallux.impute

}

i<-i+1

}

Script to scale by geometric mean:

#make empty columns for geometric mean scale

babblers$gm.wing<-0

babblers$gm.tail<-0

babblers$gm.BL<-0

babblers$gm.BW<-0

babblers$gm.BD<-0

babblers$gm.tarsus<-0

babblers$gm.hallux<-0

#Size-adjust the raw data by dividing each individual by the geometric mean

j<-1

could.not.scale<-c()

while(j<=length(babblers$UniqueID_1)){

#grab all the data of that species and put in vector

temp<-c(babblers$Wing.Length[j], babblers$Tail.Length[j], babblers$Bill.Length[j],

babblers$Bill.Width[j], babblers$Bill.Depth[j], babblers$Tarsus.Length[j],

babblers$Hallux[j])

temp<-na.omit(temp)

#calculate the geometric mean of meausrements for that species

geo.mean<-exp(mean(log(temp)))

if(is.nan(geo.mean)==FALSE){

#divide all values for that species by the geometric mean

geo.wing<-(babblers$Wing.Length[j])/geo.mean

babblers$gm.wing[j]<-geo.wing

geo.tail<-babblers$Tail.Length[j]/geo.mean

babblers$gm.tail[j]<-geo.tail

geo.BL<-babblers$Bill.Length[j]/geo.mean

babblers$gm.BL[j]<-geo.BL

geo.BW<-babblers$Bill.Width[j]/geo.mean

babblers$gm.BW[j]<-geo.BW

geo.BD<-babblers$Bill.Depth[j]/geo.mean

babblers$gm.BD[j]<-geo.BD

geo.tarsus<-babblers$Tarsus.Length[j]/geo.mean

babblers$gm.tarsus[j]<-geo.tarsus

geo.hallux<-babblers$Hallux[j]/geo.mean

babblers$gm.hallux[j]<-geo.hallux

}

else{

append(could.not.scale, babblers$UniqueID_1[j])

}

#reset the temp

temp<-c()

#increment to next species

j<-j+1

}

cat("The following species could not be scaled:", could.not.scale, "\n")

Script to do PCA with geometric mean values and put rescaled scores into new data frame:

#PCA for all measurements

pca.result <- prcomp(~babblers$gm.wing + babblers$gm.tail + babblers$gm.BL + babblers$gm.BW + babblers$gm.BD +

babblers$gm.tarsus + babblers$gm.hallux, data = babblers, retx = T)

#standard deviation of PCA

sd <- pca.result$sdev

#make vector of weights

loadings <- pca.result$rotation

#value of the rotated data (scores)

scores <- pca.result$x

#make a new data frame

new.babblers<-as.data.frame(babblers)

#make empty columns in the data frame

new.babblers$pca1<-0

new.babblers$pca2<-0

new.babblers$pca3<-0

new.babblers$pca4<-0

new.babblers$pca5<-0

new.babblers$pca6<-0

new.babblers$pca7<-0

#put PCA Scores into the data frame

new.babblers$pca1<-scores[,1]

new.babblers$pca2<-scores[,2]

new.babblers$pca3<-scores[,3]

new.babblers$pca4<-scores[,4]

new.babblers$pca5<-scores[,5]

new.babblers$pca6<-scores[,6]

new.babblers$pca7<-scores[,7]

new.babblers$pca# are the traits that will be used in the Bayesian analysis.
This site was really useful for understanding PCA:
http://strata.uga.edu/software/pdf/pcaTutorial.pdf

I am dropping Actinodura ramsayi.

LY_0134	Actinodura	ramsayi			8.3	NA	10.44	5.34	5.86	25.57	NA
LY_0135	Actinodura	ramsayi			7.9	NA	10.86	6.24	5.64	NA	NA

I only have two specimens for Actinodura ramsayi but am missing the tail and hallux measurements for both so cannot impute the missing data. It really messes up the PCA and I will not be able to get trait data for this node if I do not fill these in. Should I take the mean of those samples from the entire Actinodura genus? Or drop the species?

Tuesday, June 4, 2013

Updated morphological data to include measurements from samples from data from Vietnam. Strangely, variance does not seem to be held just in one trait.

When running PRGmatic v1.7, if you get an error:

"Can't exec "./cap3": No such file or directory at ./PRGmatic.pl line 116, <STDIN> line 8.
contigFile is Species_01_trimmed.fasta.cap.contigs
No such file or directory at ./PRGmatic.pl line 127, <STDIN> line 8."

Go to PRGmatic folder in home directory (or wherever it is), click >Dependent Software and you probably need to unzip "cap3.macosx.intel64.zip". Unzip this this folder and move all of the contents to the PRGmatic main directory.

Thursday, May 30, 2013

Implemented code from Revell, et al. (2012) in supplemental material

http://onlinelibrary.wiley.com/store/10.1111/j.1558-5646.2011.01435.x/asset/supinfo/EVO_1435_sm_suppmat.pdf?v=1&s=cb7aa4c4e828d51fd4fe1519178c9c31fd956dc6

No problems compiling the code but had to spend awhile tweaking my data set to get the names of the character data species to match the names of the nodes.

Started playing around with data to see if I could get something to orient myself. Picked a random tree from the posterior distribution of trees from Reddy/Moyle (2012) paper. Sampled randomly from my data for Wing Length and after getting all of my names to match up, I could run:

result<-evol.rate.mcmc(test.tree,x)
min_split<-minSplit(tree,result$mcmc[21:101,c("node","bp")])
mcmc<-posterior.evolrate(tree,min_split,result$mcmc[21:101,],result$tips[201:1001])

Error in if (where == 0 || where == "root") where <- ROOTx :
missing value where TRUE/FALSE needed

I ran into an error at this point and I think it is a problem that could be patched with tweaking of of the split.tree() function obtained from Revell's blog (see comment by Jessica):
http://blog.phytools.org/2011/04/updates-to-evolratemcmc-new-analysis-of.html

Seemed to fix the problem. Now onto formatting my real data set...