Coding for Performance
4: More Examples
For the assignment, most students focused on optimizing either the share
function from our earlier BLP example, or the statparts
function from GMMInference.
Let’s see how we can improve these functions’ performance.
statparts
A few people worked on optimizing the klm
or clr
function from GMMInference.jl. Here is the original code for klm
.
using ForwardDiff
function statparts(gi::Function)
function P(A::AbstractMatrix) # projection matrix
A*pinv(A'*A)*A'
end
function(θ)
= gi(θ)
giθ = length(θ)
p = size(giθ)
(n, k) = cov(giθ)
Ω =mean(gi(θ), dims=1)'
gn= ForwardDiff.jacobian(gi,θ)
Gi= reshape(Gi, n , k, p)
Gi = mean(Gi, dims=1)
G = zeros(eltype(Gi),p,k,k)
Γ = zeros(eltype(Gi),k, p)
D for j in 1:p
for i in 1:n
:,:] += (Gi[i,:,j] .- G[1,:,j]) * giθ[i,:]'
Γ[j,end
:,:] ./= n
Γ[j,:,j] = G[1,:,j] - Γ[j,:,:]*inv(Ω)*gn
D[end
return(n,k,p,gn, Ω, D, P)
end
end
function klm(gi::Function)
= statparts(gi)
SP function(θ)
= SP(θ)
(n,k,p,gn, Ω, D, P) return n*(gn'*Ω^(-1/2)*P(Ω^(-1/2)*D)*Ω^(-1/2)*gn)[1]
end
end
klm (generic function with 1 method)
To run the code, we need an example gi
function. We’ll just copy the example from the docs.
import Random
function simulate_ivshare(n,β,γ,ρ)
= randn(n, size(γ)[1])
z = randn(n, length(β))
endo = z*γ .+ endo
x = rand(Normal(0,sqrt((1.0-ρ^2))),n).+endo[:,1]*ρ
ξ = cdf.(Logistic(), x*β .+ ξ)
y return((y=y,x=x,z=z))
end
= 100
n = 2
k = 3
iv 0 = ones(k)
β0 = vcat(5*I,ones(iv-k,k))
π= 0.5
ρ Random.seed!(622)
= simulate_ivshare(n,β0,π0,ρ)
(y,x,z)
function gi_ivshare(β,y,x,z)
= quantile.(Logistic(),y) .- x*β
ξ .*z
ξend
= let y=y, x=x, z=z
gi β->gi_ivshare(β,y,x,z)
end
#26 (generic function with 1 method)
Initial Benchmark
@benchmark klm(gi)(β0)
BenchmarkTools.Trial: 9747 samples with 1 evaluation.
Range (min … max): 458.320 μs … 14.717 ms ┊ GC (min … max): 0.00% … 95.
33%
Time (median): 473.890 μs ┊ GC (median): 0.00%
Time (mean ± σ): 506.388 μs ± 513.723 μs ┊ GC (mean ± σ): 5.24% ± 4.
99%
▁▄▇█▇▆▄▂
▂▂▂▃▄▆█████████▇▆▆▅▅▅▄▄▄▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂ ▃
458 μs Histogram: frequency by time 542 μs <
Memory estimate: 211.50 KiB, allocs estimate: 2759.
Fixing Type Instabilities
From @code_warntype
, we see that the compiles is unable to infer the type of some variables. The problem seems to start with D
. This is quite puzzling because D
is explicitly initialized as zeros(eltype(Gi),...).
> @code_warntype klm(gi)(β0)
juliafor (::var"#24#25"{var"#21#23"{var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}, var"#P#22"}})(::Vector{Float64})
MethodInstance ::var"#24#25")(θ) @ Main ~/.julia/dev/ARGridBootstrap/docs/jmd/assignment.jmd:31
from (
Arguments#self#::var"#24#25"{var"#21#23"{var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}, var"#P#22"}}
::Vector{Float64}
θ
Locals@_3::Int64
::var"#P#22"
P::ANY
D::Matrix{Float64}
Ω::Adjoint{Float64, Matrix{Float64}}
gn::Int64
p::Int64
k::Int64
n::ANY
Body1 ─ %1 = Core.getfield(#self#, :SP)::var"#21#23"{var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}, var"#P#22"}
%2 = (%1)(θ)::TUPLE{INT64, INT64, INT64, ADJOINT{FLOAT64, MATRIX{FLOAT64}}, MATRIX{FLOAT64}, ANY, VAR"#P#22"}
│ %3 = Base.indexed_iterate(%2, 1)::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(2)])
│ = Core.getfield(%3, 1))
│ (n @_3 = Core.getfield(%3, 2))
│ (%6 = Base.indexed_iterate(%2, 2, @_3::Core.Const(2))::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(3)])
│ = Core.getfield(%6, 1))
│ (k @_3 = Core.getfield(%6, 2))
│ (%9 = Base.indexed_iterate(%2, 3, @_3::Core.Const(3))::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(4)])
│ = Core.getfield(%9, 1))
│ (p @_3 = Core.getfield(%9, 2))
│ (%12 = Base.indexed_iterate(%2, 4, @_3::Core.Const(4))::Core.PartialStruct(Tuple{Adjoint{Float64, Matrix{Float64}}, Int64}, Any[Adjoint{Float64, Matrix{Float64}}, Core.Const(5)])
│ = Core.getfield(%12, 1))
│ (gn @_3 = Core.getfield(%12, 2))
│ (%15 = Base.indexed_iterate(%2, 5, @_3::Core.Const(5))::Core.PartialStruct(Tuple{Matrix{Float64}, Int64}, Any[Matrix{Float64}, Core.Const(6)])
│ = Core.getfield(%15, 1))
│ (Ω @_3 = Core.getfield(%15, 2))
│ (%18 = Base.indexed_iterate(%2, 6, @_3::Core.Const(6))::Core.PartialStruct(Tuple{Any, Int64}, Any[Any, Core.Const(7)])
│ = Core.getfield(%18, 1))
│ (D @_3 = Core.getfield(%18, 2))
│ (%21 = Base.indexed_iterate(%2, 7, @_3::Core.Const(7))::Core.PartialStruct(Tuple{var"#P#22", Int64}, Any[var"#P#22", Core.Const(8)])
│ = Core.getfield(%21, 1))
│ (P %23 = n::Int64
│ %24 = Main.:var"'"(gn)::Matrix{Float64}
│ %25 = Ω::Matrix{Float64}
│ %26 = (-1 / 2)::Core.Const(-0.5)
│ %27 = (%25 ^ %26)::ANY
│ %28 = Ω::Matrix{Float64}
│ %29 = (-1 / 2)::Core.Const(-0.5)
│ %30 = (%28 ^ %29)::ANY
│ %31 = (%30 * D)::ANY
│ %32 = (P)(%31)::ANY
│ %33 = Ω::Matrix{Float64}
│ %34 = (-1 / 2)::Core.Const(-0.5)
│ %35 = (%33 ^ %34)::ANY
│ %36 = (%24 * %27 * %32 * %35 * gn)::ANY
│ %37 = Base.getindex(%36, 1)::ANY
│ %38 = (%23 * %37)::ANY
│ return %38 └──
To investigate further, let us focus on statparts
.
> @code_warntype statparts(gi)(β0)
juliafor (::var"#21#23"{var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}, var"#P#22"})(::Vector{Float64})
MethodInstance ::var"#21#23")(θ) @ Main ~/.julia/dev/ARGridBootstrap/docs/jmd/assignment.jmd:7
from (
Arguments#self#::var"#21#23"{var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}, var"#P#22"}
::Vector{Float64}
θ
Locals@_3::UNION{NOTHING, TUPLE{INT64, INT64}}
@_4::Int64
::ANY
D::ANY
Γ::ANY
G::ANY
Gi::Adjoint{Float64, Matrix{Float64}}
gn::Matrix{Float64}
Ω::Int64
k::Int64
n::Int64
p::Matrix{Float64}
giθ@_15::UNION{NOTHING, TUPLE{INT64, INT64}}
::Int64
j::Int64
i::TUPLE{INT64, INT64, INT64, ADJOINT{FLOAT64, MATRIX{FLOAT64}}, MATRIX{FLOAT64}, ANY, VAR"#P#22"}
Body1 ─ %1 = Core.getfield(#self#, :gi)::var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}
= (%1)(θ))
│ (giθ = Main.length(θ))
│ (p %4 = Main.size(giθ)::Tuple{Int64, Int64}
│ %5 = Base.indexed_iterate(%4, 1)::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(2)])
│ = Core.getfield(%5, 1))
│ (n @_4 = Core.getfield(%5, 2))
│ (%8 = Base.indexed_iterate(%4, 2, @_4::Core.Const(2))::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(3)])
│ = Core.getfield(%8, 1))
│ (k = Main.cov(giθ))
│ (Ω %11 = Core.getfield(#self#, :gi)::var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}
│ %12 = (%11)(θ)::Matrix{Float64}
│ %13 = (:dims,)::Core.Const((:dims,))
│ %14 = Core.apply_type(Core.NamedTuple, %13)::Core.Const(NamedTuple{(:dims,)})
│ %15 = Core.tuple(1)::Core.Const((1,))
│ %16 = (%14)(%15)::Core.Const((dims = 1,))
│ %17 = Core.kwcall(%16, Main.mean, %12)::Matrix{Float64}
│ = Main.:var"'"(%17))
│ (gn %19 = ForwardDiff.jacobian::Core.Const(ForwardDiff.jacobian)
│ %20 = Core.getfield(#self#, :gi)::var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}
│ = (%19)(%20, θ))
│ (Gi = Main.reshape(Gi, n, k, p))
│ (Gi %23 = (:dims,)::Core.Const((:dims,))
│ %24 = Core.apply_type(Core.NamedTuple, %23)::Core.Const(NamedTuple{(:dims,)})
│ %25 = Core.tuple(1)::Core.Const((1,))
│ %26 = (%24)(%25)::Core.Const((dims = 1,))
│ = Core.kwcall(%26, Main.mean, Gi))
│ (G %28 = Main.eltype(Gi)::ANY
│ %29 = p::Int64
│ %30 = k::Int64
│ = Main.zeros(%28, %29, %30, k))
│ (Γ %32 = Main.eltype(Gi)::ANY
│ %33 = k::Int64
│ = Main.zeros(%32, %33, p))
│ (D %35 = (1:p)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
│ @_3 = Base.iterate(%35))
│ (%37 = (@_3 === nothing)::Bool
│ %38 = Base.not_int(%37)::Bool
│ #7 if not %38
└── goto 2 ┄ %40 = @_3::Tuple{Int64, Int64}
= Core.getfield(%40, 1))
│ (j %42 = Core.getfield(%40, 2)::Int64
│ %43 = (1:n)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
│ @_15 = Base.iterate(%43))
│ (%45 = (@_15 === nothing)::Bool
│ %46 = Base.not_int(%45)::Bool
│ #5 if not %46
└── goto 3 ┄ %48 = @_15::Tuple{Int64, Int64}
= Core.getfield(%48, 1))
│ (i %50 = Core.getfield(%48, 2)::Int64
│ %51 = Base.getindex(Γ, j, Main.:(:), Main.:(:))::ANY
│ %52 = Main.:-::Core.Const(-)
│ %53 = Base.getindex(Gi, i, Main.:(:), j)::ANY
│ %54 = Base.getindex(G, 1, Main.:(:), j)::ANY
│ %55 = Base.broadcasted(%52, %53, %54)::ANY
│ %56 = Base.materialize(%55)::ANY
│ %57 = Base.getindex(giθ, i, Main.:(:))::Vector{Float64}
│ %58 = Main.:var"'"(%57)::Adjoint{Float64, Vector{Float64}}
│ %59 = (%56 * %58)::ANY
│ %60 = (%51 + %59)::ANY
│ Base.setindex!(Γ, %60, j, Main.:(:), Main.:(:))
│ @_15 = Base.iterate(%43, %50))
│ (%63 = (@_15 === nothing)::Bool
│ %64 = Base.not_int(%63)::Bool
│ #5 if not %64
└── goto 4 ─ goto #3
5 ┄ %67 = Base.dotview(Γ, j, Main.:(:), Main.:(:))::ANY
%68 = Main.:/::Core.Const(/)
│ %69 = Base.getindex(Γ, j, Main.:(:), Main.:(:))::ANY
│ %70 = Base.broadcasted(%68, %69, n)::ANY
│ Base.materialize!(%67, %70)
│ %72 = Base.getindex(G, 1, Main.:(:), j)::ANY
│ %73 = Base.getindex(Γ, j, Main.:(:), Main.:(:))::ANY
│ %74 = Main.inv(Ω)::Matrix{Float64}
│ %75 = (%73 * %74 * gn)::ANY
│ %76 = (%72 - %75)::ANY
│ Base.setindex!(D, %76, Main.:(:), j)
│ @_3 = Base.iterate(%35, %42))
│ (%79 = (@_3 === nothing)::Bool
│ %80 = Base.not_int(%79)::Bool
│ #7 if not %80
└── goto 6 ─ goto #2
7 ┄ %83 = n::Int64
%84 = k::Int64
│ %85 = p::Int64
│ %86 = gn::Adjoint{Float64, Matrix{Float64}}
│ %87 = Ω::Matrix{Float64}
│ %88 = D::ANY
│ %89 = Core.getfield(#self#, :P)::Core.Const(var"#P#22"())
│ %90 = Core.tuple(%83, %84, %85, %86, %87, %88, %89)::TUPLE{INT64, INT64, INT64, ADJOINT{FLOAT64, MATRIX{FLOAT64}}, MATRIX{FLOAT64}, ANY, VAR"#P#22"}
│ return %90 └──
We see that G
, Gi
, Γ
, and D
are all type Any
. For some reason, the return value of ForwardDiff.jacobian
is not being inferred. We can workaround this by using an jacobian!
instead.
function statparts(gi::F) where {F <: Function}
function P(A::AbstractMatrix) # projection matrix
A*pinv(A'*A)*A'
end
let gi=gi
function(θ)
= gi(θ)
giθ = length(θ)
p = size(giθ)
(n, k) = Hermitian(cov(giθ))
Ω =mean(gi(θ), dims=1)'
gn= zeros(n,k,p)
Gi jacobian!(Gi,gi,θ)
ForwardDiff.= reshape(Gi, n , k, p)
Gi = mean(Gi, dims=1)
G = zeros(eltype(Gi),p,k,k)
Γ = zeros(eltype(Gi),k, p)
D for j in 1:p
for i in 1:n
:,:] += (Gi[i,:,j] .- G[1,:,j]) * giθ[i,:]'
Γ[j,end
:,:] ./= n
Γ[j,:,j] = G[1,:,j] - Γ[j,:,:]*inv(Ω)*gn
D[end
return(n,k,p,gn, Ω, D, P)
end
end
end
statparts (generic function with 1 method)
> @code_warntype statparts(gi)(β0)
juliafor (::var"#28#30"{var"#P#29", var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}})(::Vector{Float64})
MethodInstance ::var"#28#30")(θ) @ Main ~/.julia/dev/ARGridBootstrap/docs/jmd/assignment.jmd:7
from (
Arguments#self#::var"#28#30"{var"#P#29", var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}}
::Vector{Float64}
θ
Locals@_3::UNION{NOTHING, TUPLE{INT64, INT64}}
@_4::Int64
::Matrix{Float64}
D::Array{Float64, 3}
Γ::Array{Float64, 3}
G::Array{Float64, 3}
Gi::Adjoint{Float64, Matrix{Float64}}
gn::Hermitian{Float64, Matrix{Float64}}
Ω::Int64
k::Int64
n::Int64
p::Matrix{Float64}
giθ@_15::UNION{NOTHING, TUPLE{INT64, INT64}}
::Int64
j::Int64
i::Tuple{Int64, Int64, Int64, Adjoint{Float64, Matrix{Float64}}, Hermitian{Float64, Matrix{Float64}}, Matrix{Float64}, var"#P#29"}
Body1 ─ %1 = Core.getfield(#self#, Symbol("#457#gi"))::var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}
= (%1)(θ))
│ (giθ = Main.length(θ))
│ (p %4 = Main.size(giθ)::Tuple{Int64, Int64}
│ %5 = Base.indexed_iterate(%4, 1)::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(2)])
│ = Core.getfield(%5, 1))
│ (n @_4 = Core.getfield(%5, 2))
│ (%8 = Base.indexed_iterate(%4, 2, @_4::Core.Const(2))::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(3)])
│ = Core.getfield(%8, 1))
│ (k %10 = Main.cov(giθ)::Matrix{Float64}
│ = Main.Hermitian(%10))
│ (Ω %12 = Core.getfield(#self#, Symbol("#457#gi"))::var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}
│ %13 = (%12)(θ)::Matrix{Float64}
│ %14 = (:dims,)::Core.Const((:dims,))
│ %15 = Core.apply_type(Core.NamedTuple, %14)::Core.Const(NamedTuple{(:dims,)})
│ %16 = Core.tuple(1)::Core.Const((1,))
│ %17 = (%15)(%16)::Core.Const((dims = 1,))
│ %18 = Core.kwcall(%17, Main.mean, %13)::Matrix{Float64}
│ = Main.:var"'"(%18))
│ (gn = Main.zeros(n, k, p))
│ (Gi %21 = ForwardDiff.jacobian!::Core.Const(ForwardDiff.jacobian!)
│ %22 = Gi::Array{Float64, 3}
│ %23 = Core.getfield(#self#, Symbol("#457#gi"))::var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}
│ %21)(%22, %23, θ)
│ (= Main.reshape(Gi, n, k, p))
│ (Gi %26 = (:dims,)::Core.Const((:dims,))
│ %27 = Core.apply_type(Core.NamedTuple, %26)::Core.Const(NamedTuple{(:dims,)})
│ %28 = Core.tuple(1)::Core.Const((1,))
│ %29 = (%27)(%28)::Core.Const((dims = 1,))
│ = Core.kwcall(%29, Main.mean, Gi))
│ (G %31 = Main.eltype(Gi)::Core.Const(Float64)
│ %32 = p::Int64
│ %33 = k::Int64
│ = Main.zeros(%31, %32, %33, k))
│ (Γ %35 = Main.eltype(Gi)::Core.Const(Float64)
│ %36 = k::Int64
│ = Main.zeros(%35, %36, p))
│ (D %38 = (1:p)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
│ @_3 = Base.iterate(%38))
│ (%40 = (@_3 === nothing)::Bool
│ %41 = Base.not_int(%40)::Bool
│ #7 if not %41
└── goto 2 ┄ %43 = @_3::Tuple{Int64, Int64}
= Core.getfield(%43, 1))
│ (j %45 = Core.getfield(%43, 2)::Int64
│ %46 = (1:n)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
│ @_15 = Base.iterate(%46))
│ (%48 = (@_15 === nothing)::Bool
│ %49 = Base.not_int(%48)::Bool
│ #5 if not %49
└── goto 3 ┄ %51 = @_15::Tuple{Int64, Int64}
= Core.getfield(%51, 1))
│ (i %53 = Core.getfield(%51, 2)::Int64
│ %54 = Base.getindex(Γ, j, Main.:(:), Main.:(:))::Matrix{Float64}
│ %55 = Main.:-::Core.Const(-)
│ %56 = Base.getindex(Gi, i, Main.:(:), j)::Vector{Float64}
│ %57 = Base.getindex(G, 1, Main.:(:), j)::Vector{Float64}
│ %58 = Base.broadcasted(%55, %56, %57)::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}, Nothing, typeof(-), Tuple{Vector{Float64}, Vector{Float64}}}
│ %59 = Base.materialize(%58)::Vector{Float64}
│ %60 = Base.getindex(giθ, i, Main.:(:))::Vector{Float64}
│ %61 = Main.:var"'"(%60)::Adjoint{Float64, Vector{Float64}}
│ %62 = (%59 * %61)::Matrix{Float64}
│ %63 = (%54 + %62)::Matrix{Float64}
│ Base.setindex!(Γ, %63, j, Main.:(:), Main.:(:))
│ @_15 = Base.iterate(%46, %53))
│ (%66 = (@_15 === nothing)::Bool
│ %67 = Base.not_int(%66)::Bool
│ #5 if not %67
└── goto 4 ─ goto #3
5 ┄ %70 = Base.dotview(Γ, j, Main.:(:), Main.:(:))::SubArray{Float64, 2, Array{Float64, 3}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}}, true}
%71 = Main.:/::Core.Const(/)
│ %72 = Base.getindex(Γ, j, Main.:(:), Main.:(:))::Matrix{Float64}
│ %73 = Base.broadcasted(%71, %72, n)::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{2}, Nothing, typeof(/), Tuple{Matrix{Float64}, Int64}}
│ Base.materialize!(%70, %73)
│ %75 = Base.getindex(G, 1, Main.:(:), j)::Vector{Float64}
│ %76 = Base.getindex(Γ, j, Main.:(:), Main.:(:))::Matrix{Float64}
│ %77 = Main.inv(Ω::Core.PartialStruct(Hermitian{Float64, Matrix{Float64}}, Any[Matrix{Float64}, Core.Const('U')]))::Core.PartialStruct(Hermitian{Float64, Matrix{Float64}}, Any[Matrix{Float64}, Core.Const('U')])
│ %78 = (%76 * %77 * gn)::Matrix{Float64}
│ %79 = (%75 - %78)::Matrix{Float64}
│ Base.setindex!(D, %79, Main.:(:), j)
│ @_3 = Base.iterate(%38, %45))
│ (%82 = (@_3 === nothing)::Bool
│ %83 = Base.not_int(%82)::Bool
│ #7 if not %83
└── goto 6 ─ goto #2
7 ┄ %86 = n::Int64
%87 = k::Int64
│ %88 = p::Int64
│ %89 = gn::Adjoint{Float64, Matrix{Float64}}
│ %90 = Ω::Core.PartialStruct(Hermitian{Float64, Matrix{Float64}}, Any[Matrix{Float64}, Core.Const('U')])
│ %91 = D::Matrix{Float64}
│ %92 = Core.getfield(#self#, :P)::Core.Const(var"#P#29"())
│ %93 = Core.tuple(%86, %87, %88, %89, %90, %91, %92)::Core.PartialStruct(Tuple{Int64, Int64, Int64, Adjoint{Float64, Matrix{Float64}}, Hermitian{Float64, Matrix{Float64}}, Matrix{Float64}, var"#P#29"}, Any[Int64, Int64, Int64, Adjoint{Float64, Matrix{Float64}}, Core.PartialStruct(Hermitian{Float64, Matrix{Float64}}, Any[Matrix{Float64}, Core.Const('U')]), Matrix{Float64}, var"#P#29"])
│ return %93 └──
I also added the where {F
statement to ensure compiler specialization, and added the let gi=gi
line to help with the performance of captured variables.
> @code_warntype klm(gi)(β0)
juliafor (::var"#24#25"{var"#28#30"{var"#P#29", var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}}})(::Vector{Float64})
MethodInstance ::var"#24#25")(θ) @ Main ~/.julia/dev/ARGridBootstrap/docs/jmd/assignment.jmd:31
from (
Arguments#self#::var"#24#25"{var"#28#30"{var"#P#29", var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}}}
::Vector{Float64}
θ
Locals@_3::Int64
::var"#P#29"
P::Matrix{Float64}
D::Hermitian{Float64, Matrix{Float64}}
Ω::Adjoint{Float64, Matrix{Float64}}
gn::Int64
p::Int64
k::Int64
n::UNION{FLOAT64, COMPLEXF64}
Body1 ─ %1 = Core.getfield(#self#, :SP)::var"#28#30"{var"#P#29", var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}}
%2 = (%1)(θ)::Core.PartialStruct(Tuple{Int64, Int64, Int64, Adjoint{Float64, Matrix{Float64}}, Hermitian{Float64, Matrix{Float64}}, Matrix{Float64}, var"#P#29"}, Any[Int64, Int64, Int64, Adjoint{Float64, Matrix{Float64}}, Core.PartialStruct(Hermitian{Float64, Matrix{Float64}}, Any[Matrix{Float64}, Core.Const('U')]), Matrix{Float64}, var"#P#29"])
│ %3 = Base.indexed_iterate(%2, 1)::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(2)])
│ = Core.getfield(%3, 1))
│ (n @_3 = Core.getfield(%3, 2))
│ (%6 = Base.indexed_iterate(%2, 2, @_3::Core.Const(2))::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(3)])
│ = Core.getfield(%6, 1))
│ (k @_3 = Core.getfield(%6, 2))
│ (%9 = Base.indexed_iterate(%2, 3, @_3::Core.Const(3))::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(4)])
│ = Core.getfield(%9, 1))
│ (p @_3 = Core.getfield(%9, 2))
│ (%12 = Base.indexed_iterate(%2, 4, @_3::Core.Const(4))::Core.PartialStruct(Tuple{Adjoint{Float64, Matrix{Float64}}, Int64}, Any[Adjoint{Float64, Matrix{Float64}}, Core.Const(5)])
│ = Core.getfield(%12, 1))
│ (gn @_3 = Core.getfield(%12, 2))
│ (%15 = Base.indexed_iterate(%2, 5, @_3::Core.Const(5))::Core.PartialStruct(Tuple{Hermitian{Float64, Matrix{Float64}}, Int64}, Any[Core.PartialStruct(Hermitian{Float64, Matrix{Float64}}, Any[Matrix{Float64}, Core.Const('U')]), Core.Const(6)])
│ = Core.getfield(%15, 1))
│ (Ω @_3 = Core.getfield(%15, 2))
│ (%18 = Base.indexed_iterate(%2, 6, @_3::Core.Const(6))::Core.PartialStruct(Tuple{Matrix{Float64}, Int64}, Any[Matrix{Float64}, Core.Const(7)])
│ = Core.getfield(%18, 1))
│ (D @_3 = Core.getfield(%18, 2))
│ (%21 = Base.indexed_iterate(%2, 7, @_3::Core.Const(7))::Core.PartialStruct(Tuple{var"#P#29", Int64}, Any[var"#P#29", Core.Const(8)])
│ = Core.getfield(%21, 1))
│ (P %23 = n::Int64
│ %24 = Main.:var"'"(gn)::Matrix{Float64}
│ %25 = Ω::Core.PartialStruct(Hermitian{Float64, Matrix{Float64}}, Any[Matrix{Float64}, Core.Const('U')])
│ %26 = (-1 / 2)::Core.Const(-0.5)
│ %27 = (%25 ^ %26)::UNION{HERMITIAN{FLOAT64, MATRIX{FLOAT64}}, MATRIX{COMPLEXF64}}
│ %28 = Ω::Core.PartialStruct(Hermitian{Float64, Matrix{Float64}}, Any[Matrix{Float64}, Core.Const('U')])
│ %29 = (-1 / 2)::Core.Const(-0.5)
│ %30 = (%28 ^ %29)::UNION{HERMITIAN{FLOAT64, MATRIX{FLOAT64}}, MATRIX{COMPLEXF64}}
│ %31 = (%30 * D)::UNION{MATRIX{COMPLEXF64}, MATRIX{FLOAT64}}
│ %32 = (P)(%31)::UNION{MATRIX{COMPLEXF64}, MATRIX{FLOAT64}}
│ %33 = Ω::Core.PartialStruct(Hermitian{Float64, Matrix{Float64}}, Any[Matrix{Float64}, Core.Const('U')])
│ %34 = (-1 / 2)::Core.Const(-0.5)
│ %35 = (%33 ^ %34)::UNION{HERMITIAN{FLOAT64, MATRIX{FLOAT64}}, MATRIX{COMPLEXF64}}
│ %36 = (%24 * %27 * %32 * %35 * gn)::UNION{MATRIX{COMPLEXF64}, MATRIX{FLOAT64}}
│ %37 = Base.getindex(%36, 1)::UNION{FLOAT64, COMPLEXF64}
│ %38 = (%23 * %37)::UNION{FLOAT64, COMPLEXF64}
│ return %38 └──
There’s still a type-instability in klm
. This one is harder to understand. It is due to the fact that the appropriate meaning of a matrix square root depends on the nature of the matrix. In particular, the value could be a complex valued matrix instead of real valued. We know that Ω should be positive definite with a real matrix square root. We can compute its square root from its Eigen decomposition and avoid the type instability.
function klm(gi::F ) where {F <: Function}
let gi=gi
function(θ)
= statparts(gi)(θ)
(n,k,p,gn, Ω, D, P) = eigen(Ω)
λ, v = v*diagm(λ.^(-1/2))*v'
irΩ return n*(gn'*irΩ*P(irΩ*D)*irΩ*gn)[1]
end
end
end
klm (generic function with 1 method)
> @code_warntype klm(gi)(β0)
juliafor (::var"#31#32"{var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}})(::Vector{Float64})
MethodInstance ::var"#31#32")(θ) @ Main ~/.julia/dev/ARGridBootstrap/docs/jmd/assignment.jmd:4
from (
Arguments#self#::var"#31#32"{var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}}
::Vector{Float64}
θ
Locals@_3::Val{:vectors}
@_4::Int64
::Matrix{Float64}
irΩ::Matrix{Float64}
v::Vector{Float64}
λ::var"#P#29"
P::Matrix{Float64}
D::Hermitian{Float64, Matrix{Float64}}
Ω::Adjoint{Float64, Matrix{Float64}}
gn::Int64
p::Int64
k::Int64
n::Float64
Body1 ─ %1 = Core.getfield(#self#, Symbol("#458#gi"))::var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}
%2 = Main.statparts(%1)::var"#28#30"{var"#P#29", var"#26#27"{Vector{Float64}, Matrix{Float64}, Matrix{Float64}}}
│ %3 = (%2)(θ)::Core.PartialStruct(Tuple{Int64, Int64, Int64, Adjoint{Float64, Matrix{Float64}}, Hermitian{Float64, Matrix{Float64}}, Matrix{Float64}, var"#P#29"}, Any[Int64, Int64, Int64, Adjoint{Float64, Matrix{Float64}}, Core.PartialStruct(Hermitian{Float64, Matrix{Float64}}, Any[Matrix{Float64}, Core.Const('U')]), Matrix{Float64}, var"#P#29"])
│ %4 = Base.indexed_iterate(%3, 1)::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(2)])
│ = Core.getfield(%4, 1))
│ (n @_4 = Core.getfield(%4, 2))
│ (%7 = Base.indexed_iterate(%3, 2, @_4::Core.Const(2))::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(3)])
│ = Core.getfield(%7, 1))
│ (k @_4 = Core.getfield(%7, 2))
│ (%10 = Base.indexed_iterate(%3, 3, @_4::Core.Const(3))::Core.PartialStruct(Tuple{Int64, Int64}, Any[Int64, Core.Const(4)])
│ = Core.getfield(%10, 1))
│ (p @_4 = Core.getfield(%10, 2))
│ (%13 = Base.indexed_iterate(%3, 4, @_4::Core.Const(4))::Core.PartialStruct(Tuple{Adjoint{Float64, Matrix{Float64}}, Int64}, Any[Adjoint{Float64, Matrix{Float64}}, Core.Const(5)])
│ = Core.getfield(%13, 1))
│ (gn @_4 = Core.getfield(%13, 2))
│ (%16 = Base.indexed_iterate(%3, 5, @_4::Core.Const(5))::Core.PartialStruct(Tuple{Hermitian{Float64, Matrix{Float64}}, Int64}, Any[Core.PartialStruct(Hermitian{Float64, Matrix{Float64}}, Any[Matrix{Float64}, Core.Const('U')]), Core.Const(6)])
│ = Core.getfield(%16, 1))
│ (Ω @_4 = Core.getfield(%16, 2))
│ (%19 = Base.indexed_iterate(%3, 6, @_4::Core.Const(6))::Core.PartialStruct(Tuple{Matrix{Float64}, Int64}, Any[Matrix{Float64}, Core.Const(7)])
│ = Core.getfield(%19, 1))
│ (D @_4 = Core.getfield(%19, 2))
│ (%22 = Base.indexed_iterate(%3, 7, @_4::Core.Const(7))::Core.PartialStruct(Tuple{var"#P#29", Int64}, Any[var"#P#29", Core.Const(8)])
│ = Core.getfield(%22, 1))
│ (P %24 = Main.eigen(Ω::Core.PartialStruct(Hermitian{Float64, Matrix{Float64}}, Any[Matrix{Float64}, Core.Const('U')]))::Eigen{Float64, Float64, Matrix{Float64}, Vector{Float64}}
│ %25 = Base.indexed_iterate(%24, 1)::Tuple{Vector{Float64}, Val{:vectors}}
│ = Core.getfield(%25, 1))
│ (λ @_3 = Core.getfield(%25, 2))
│ (%28 = Base.indexed_iterate(%24, 2, @_3)::Tuple{Matrix{Float64}, Val{:done}}
│ = Core.getfield(%28, 1))
│ (v %30 = v::Matrix{Float64}
│ %31 = Main.:^::Core.Const(^)
│ %32 = λ::Vector{Float64}
│ %33 = (-1 / 2)::Core.Const(-0.5)
│ %34 = Base.broadcasted(%31, %32, %33)::Core.PartialStruct(Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}, Nothing, typeof(^), Tuple{Vector{Float64}, Float64}}, Any[Core.Const(Base.Broadcast.DefaultArrayStyle{1}()), Core.Const(^), Core.PartialStruct(Tuple{Vector{Float64}, Float64}, Any[Vector{Float64}, Core.Const(-0.5)]), Nothing])
│ %35 = Base.materialize(%34)::Vector{Float64}
│ %36 = Main.diagm(%35)::Matrix{Float64}
│ %37 = Main.:var"'"(v)::Adjoint{Float64, Matrix{Float64}}
│ = %30 * %36 * %37)
│ (irΩ %39 = n::Int64
│ %40 = Main.:var"'"(gn)::Matrix{Float64}
│ %41 = irΩ::Matrix{Float64}
│ %42 = (irΩ * D)::Matrix{Float64}
│ %43 = (P)(%42)::Matrix{Float64}
│ %44 = irΩ::Matrix{Float64}
│ %45 = (%40 * %41 * %43 * %44 * gn)::Matrix{Float64}
│ %46 = Base.getindex(%45, 1)::Float64
│ %47 = (%39 * %46)::Float64
│ return %47 └──
Fixing these type instabilities speeds up the code by a factor of about 5.
@benchmark klm(gi)(β0)
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 132.812 μs … 9.251 ms ┊ GC (min … max): 0.00% … 90
.89%
Time (median): 148.332 μs ┊ GC (median): 0.00%
Time (mean ± σ): 174.327 μs ± 435.225 μs ┊ GC (mean ± σ): 12.87% ± 5
.07%
▃▆██▅▂
▂▂▂▂▂▂▃▄▄▄▅▅▆████████▇▆▆▅▅▄▄▄▃▃▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂ ▃
133 μs Histogram: frequency by time 186 μs <
Memory estimate: 219.61 KiB, allocs estimate: 1516.
Reducing allocations and Other Optimizations
Profiling reveals the majority of time is spent in the innermost loop of the statparts
function. This loop allocates quite a bit because the arrays are using slices. We can avoid allocations by using @views
and more broadcasting. See “Consider using views for slices” and “More dots”,
function statparts(gi::F) where {F <: Function}
function P(A::AbstractMatrix) # projection matrix
A*pinv(A'*A)*A'
end
let gi=gi
function(θ)
= gi(θ)
giθ = length(θ)
p = size(giθ)
(n, k) = Hermitian(cov(giθ))
Ω =mean(gi(θ), dims=1)'
gn= Ω \ gn
iΩgn = zeros(n,k,p)
Gi jacobian!(Gi,gi,θ)
ForwardDiff.= reshape(Gi, n , k, p)
Gi = mean(Gi, dims=1)
G = zeros(eltype(Gi),p,k,k)
Γ = zeros(eltype(Gi),k, p)
D @inbounds for j in 1:p
@inbounds for i in 1:n
@views Γ[j,:,:] .+= (Gi[i,:,j] .- G[1,:,j]) * giθ[i,:]'
end
:,:] ./= n
Γ[j,@views D[:,j] .= G[1,:,j] .- Γ[j,:,:]*iΩgn
end
return(n,k,p,gn, Ω, D, P)
end
end
end
@benchmark klm(gi)(β0)
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 69.363 μs … 6.704 ms ┊ GC (min … max): 0.00% … 92.
90%
Time (median): 76.354 μs ┊ GC (median): 0.00%
Time (mean ± σ): 86.922 μs ± 255.393 μs ┊ GC (mean ± σ): 11.10% ± 3.
73%
▁▁▃▄▆▅▇▇▆█▆▇▄▃▂
▁▁▁▁▁▂▂▃▃▄▅▆█████████████████▇▆▅▄▃▃▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▃
69.4 μs Histogram: frequency by time 89 μs <
Memory estimate: 118.53 KiB, allocs estimate: 501.
The code is now about ten times faster than the original.
Footnotes
Like scalar variables, StaticArrays exist on the stack instead of the heap, so creating them is much less costly and they do not count toward the reported allocations.↩︎