kopperud / slouch Goto Github PK
View Code? Open in Web Editor NEWStochastic Linear Ornstein Uhlenbeck Comparative Hypotheses
License: GNU General Public License v2.0
Stochastic Linear Ornstein Uhlenbeck Comparative Hypotheses
License: GNU General Public License v2.0
In slouch version 2.1.4
there is a bug in calculating the weight matrix for a model with discrete predictors mapped as regimes on the tree. The reason we discovered this, is that we were working on an alternative (faster) implementation of the likelihood. I'm copy-pasting a minimum-working example below
## minimum working example
library(ape)
library(phytools)
library(slouch)
############################################################
# #
# State-dependent pruning #
# !! σ2, α are scalars, θ is a vector of length # states #
# #
############################################################
sd_postorder <- function(node_index, edge, tree, continuousChar,
μ, V, log_norm_factor, subedges_lengths, σ2, α, θ){
ntip = length(tree$tip.label)
# if is internal node
if (node_index > ntip){
left_edge = which(edge[,1] == node_index)[1] # index of left child edge
right_edge = which(edge[,1] == node_index)[2] # index of right child edge
left = edge[left_edge,2] # index of left child node
right = edge[right_edge,2] # index of right child node
output_left <- sd_postorder(left, edge, tree, continuousChar,
μ, V, log_norm_factor, subedges_lengths, σ2, α, θ)
μ <- output_left[[1]]
V <- output_left[[2]]
log_norm_factor <- output_left[[3]]
output_right <- sd_postorder(right, edge, tree, continuousChar,
μ, V, log_norm_factor, subedges_lengths, σ2, α, θ)
μ <- output_right[[1]]
V <- output_right[[2]]
log_norm_factor <- output_right[[3]]
sub_bl_left = subedges_lengths[left_edge][[1]] # all subedges of left child edge
sub_bl_right = subedges_lengths[right_edge][[1]] # all subedges of right child edge
# for the sake of readability, computation of variance, mean, and log_nf are done in separate loops
# 1) variance of the normal variable: this branch (v_left) and the subtree (V[left])
## Is 'delta_left* exp(2.0 * α * bl_left)' added in each sub-edge?
delta_left = V[left]
v_left = 0 # initialise v_left
for (i in rev(1:length(sub_bl_left))){
state <- names(sub_bl_left)[i]
delta_t <- sub_bl_left[[i]]
v_left = σ2/(2*α) * expm1(2.0*α * delta_t)
delta_left = v_left + delta_left * exp(2.0 * α * delta_t)
}
delta_right = V[right]
v_right = 0 # initialise v_right
for (i in rev(1:length(sub_bl_right))){
state <- names(sub_bl_right)[i]
v_right = σ2/(2*α) *expm1(2.0*α*sub_bl_right[[i]])
delta_right = v_right + delta_right * exp(2.0 * α * sub_bl_right[[i]])
}
var_left = delta_left
var_right = delta_right
# 2) mean of the normal variable
mean_left = μ[left]
for (i in rev(1:length(sub_bl_left))){
state <- names(sub_bl_left)[i]
mean_left = exp(α*sub_bl_left[[i]])*(mean_left - θ[[state]]) + θ[[state]]
}
mean_right = μ[right]
for (i in rev(1:length(sub_bl_right))){
state <- names(sub_bl_right)[i]
mean_right = exp(α*sub_bl_right[[i]])*(mean_right - θ[[state]]) + θ[[state]]
}
## compute the mean and variance of the node
mean_ancestor = (mean_left * var_right + mean_right * var_left) / (var_left + var_right)
μ[node_index] = mean_ancestor
var_node = (var_left * var_right) / (var_left + var_right)
V[node_index] = var_node
## compute the normalizing factor, the left-hand side of the pdf of the normal variable
log_nf_left = 0
for (i in rev(1:length(sub_bl_left))){
state <- names(sub_bl_left)[i]
delta_t <- sub_bl_left[[i]]
log_nf_left = log_nf_left + delta_t * α
}
log_nf_right = 0
for (i in rev(1:length(sub_bl_right))){
state <- names(sub_bl_right)[i]
log_nf_right = log_nf_right + sub_bl_right[[i]] * α
}
contrast = mean_left - mean_right
a = -(contrast*contrast / (2*(var_left+var_right)))
b = log(2*pi*(var_left+var_right))/2.0
log_nf = log_nf_left + log_nf_right + a - b
log_norm_factor[node_index] = log_nf
return(list(μ, V, log_norm_factor))
}
# if is tip
else{
species = tree$tip.label[node_index]
μ[node_index] = continuousChar[[which(names(continuousChar) == species)]]
V[node_index] = 0.0 ## if there is no observation error
return(list(μ, V, log_norm_factor))
}
}
sd_logL_pruning <- function(tree, continuousChar, σ2, α, θ){
ntip = length(tree$tip.label) # number of tips
edge = tree$edge # equals tree[:edge] in Julia
n_edges = length(edge[,1]) # number of edges
max_node_index = max(tree$edge) # total number of nodes
V = numeric(max_node_index)
μ = numeric(max_node_index)
log_norm_factor = numeric(max_node_index)
subedges_lengths = tree$maps
root_index = ntip + 1
output <- sd_postorder(root_index, edge, tree, continuousChar,
μ, V, log_norm_factor, subedges_lengths, σ2, α, θ)
μ <- output[[1]]
V <- output[[2]]
log_norm_factor <- output[[3]]
## assume root value equal to theta
μ_root = μ[root_index]
v_root = V[root_index]
left_edge_from_root <- which(edge[,1] == ntip+1)[1] # obtain left child edge index of root node
left_subedges_from_root <- subedges_lengths[[left_edge_from_root]] # obtain sub-edge lengths
root_state = names(tail(left_subedges_from_root))[[1]] # obtain root state, assuming it equals last state at left child edge
lnl = dnorm(θ[[root_state]], mean = μ_root, sd = sqrt(v_root), log = TRUE)
## add norm factor
for (log_nf in log_norm_factor){
lnl = lnl + log_nf
}
return(lnl)
}
And some code to test the two versions of computing the likelihood
###################################################
# #
# Testing... #
# #
###################################################
# test with slouch data set
data("artiodactyla")
data("neocortex")
# convert continuous data to read.nexus.data() format
brain <- list()
for (i in 1:length(neocortex$brain_mass_g_log_mean)){
sp <- neocortex$species[i]
brain[sp] <- list(neocortex$brain_mass_g_log_mean[i])
}
neocortex <- neocortex[match(artiodactyla$tip.label, neocortex$species), ]
diet <- as.character(neocortex$diet)
names(diet) <- neocortex$species
set.seed(123)
smaptree <- phytools::make.simmap(artiodactyla, diet)
This is what the SIMMAP tree looks like (it's just one random SIMMAP, but it does not matter for illustrating that the likelihoods are equivalent)
m0 <- slouch.fit(
smaptree,
species = neocortex$species,
response = neocortex$brain_mass_g_log_mean,
fixed.fact = neocortex$diet,
a_values = 0.01,
sigma2_y_values = 1.0,
anc_maps = "simmap",
hillclimb = FALSE
)
theta <- dput(m0$beta_primary$coefficients[,1])
lnl_brain_pruning <- sd_logL_pruning(smaptree, brain,
σ2 = 1.0,
α = 0.01, θ = theta)
As of version 2.1.4
, these two methods gave different results (i.e. different probability densities). There is a fix in 88c6181. When printing the likelihoods after the bug fix, in version 2.1.5
(I will upload a new version to CRAN), the likelihoods are identical up to floating point errors
print(m0$modfit$Support, digits = 20)
print(lnl_brain_pruning, digits = 20)
[1] -89.861374999422423571
[1] -89.86137499942240936
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.