function [nll, dnll] = pic_lik_grad(w, x, subdomains, param)

% PIC_LIK_GRAD  Negative log likelihood for a Local and Global Gaussian
% Process and its gradients with respect to the pseudo-inputs and the
% hyperparameters. Gaussian covariance with one lengthscale per dimension.
%
% Inputs
%   w               [reshape(xb,M*dim,1); logtheta] where xb (M-by-dim) is
%                   the pseudo-inputs and logtheta is the last dim+2
%                   entries of w. The gradient helpers in this file read
%                   logtheta(1:end-2) as the log per-dimension scale
%                   factors and logtheta(end-1) as the log amplitude;
%                   exp(logtheta(end)) is used as the noise term sig2.
%   x               training inputs (one row per point, dim columns)
%   subdomains      N-by-1 cell array, containing the information about
%                   a partition of training inputs by k-means clustering.
%                   Each cell is a struct from which this function reads
%                   .hx, .hy and .segI (a mask over the rows of x).
%                   NOTE(review): .hx/.hy are assumed to be exactly the
%                   inputs/targets selected by (segI & param.idx) -- confirm.
%   param.M         number of pseudo-inputs
%   param.covfunc   prior covariance function of Gaussian process, given
%                   as a cell array that is expanded into feval
%   param.idx       logical mask to indicate the subset of training data
%                   used for learning the hyperparameters.
%
% Outputs
%   nll   -- negative log likelihood. As accumulated below it is the
%            quadratic term plus the log-determinant term; no factor 1/2
%            and no additive constant is applied here.
%   dnll  -- gradient of nll, packed as
%            [reshape(dindv,M*dim,1); dbv; dcv; dsigv]
%            (pseudo-inputs, per-dimension scales, amplitude, noise).
%            Only computed when a second output is requested.
%
% Chiwoo Park (2009)

covfunc = param.covfunc;
M       = param.M; 

% N is not used below; only dim is needed to split w.
[N,dim] = size(x); 
xb       = reshape(w(1:end-dim-2),M,dim);
logtheta = w((end-dim-1):end);

% Covariance among pseudo-inputs and cross-covariance between training
% inputs and pseudo-inputs (K_xx is returned by covfunc but not used here).
K_M  = feval(covfunc{:}, logtheta, xb);
[K_xx, K_NM] = feval(covfunc{:}, logtheta, x, xb);              
L_M  = chol(K_M, 'lower');
iLK  = L_M\K_NM';           % L_M^{-1} * K_MN, so iLK'*iLK = K_NM*inv(K_M)*K_MN

nGrp    = size(subdomains, 1);
Q_M     = K_M;              % accumulates K_M + sum_B K_BM'*inv(D_B)*K_BM
X       = zeros(M, 1);      % accumulates sum_B K_BM'*inv(D_B)*y_B
nll      = 0;
logdet   = 0;
for i = 1:nGrp
    sd    = subdomains{i};
    K_B   = feval(covfunc{:}, logtheta, sd.hx); % covariance within this subdomain (C_B)
    lidx  = sd.segI & param.idx;                % rows of x used for this subdomain
    K_BM  = K_NM(lidx, :);  % cross-covariance rows of this subdomain (UB)
    D_B   = K_B - iLK(:,lidx)'*iLK(:,lidx); % K_B minus its low-rank approximation (AB)
    
    L_B   = chol(D_B, 'lower');
    LK_BM = L_B \ K_BM; 
    Ly    = L_B \ sd.hy;
    
    Q_M   = Q_M + LK_BM' * LK_BM;
    X     = X   + LK_BM' * Ly;
    nll   = nll  + Ly' * Ly;    % y_B' * inv(D_B) * y_B
    
    % log det(D_B) = 2*sum(log(diag(L_B))) because D_B = L_B*L_B'
    logdet = logdet + 2*sum(log(diag(L_B)));
    % Cache per-subdomain quantities; sd.iD and sd.lidx are read again in
    % the gradient section below. The updated cell array is local to this
    % call (subdomains is not an output).
    sd.L_B = L_B;
    sd.iD  = inv(D_B);
    sd.lidx = lidx;
    subdomains{i} = sd;
    %fprintf('Iteration %6i Finished.\n', i);
end

% Negative Log Likelihood   
% Symmetrize Q_M before factorizing to remove round-off asymmetry.
Q_M = (Q_M+Q_M')./2;  %Q_M = K_M + K_MN * inv(A) * K_NM
C_L = chol(Q_M)';     % lower-triangular factor of Q_M
b = C_L \ X;
nll = nll - b'*b;
% log-determinant: sum_B log|D_B| - log|K_M| + log|Q_M|
logdet = logdet - 2*sum(log(diag(L_M))) + 2*sum(log(diag(C_L)));
nll = nll + logdet;

%OPTIONAL derivatives
if nargout > 1
    % all gradients computed with respect to log(parameters)
    %
    % 1) compute gradients with respect to hyperparameters of a squared-exponential covariance kernel function
    %    dc accumulates the terms for the amplitude, db{i} those for the
    %    scale factor of input dimension i.
    [dcK_NM, dbK_NM] = sqexp_g(x, xb, logtheta);
    [dcK_M, dbK_M]   = sqexp_g(xb, xb, logtheta);
    [dc] = createGrad(M, L_M, dcK_M, dcK_NM);
    db   = cell(1, dim);
    for i = 1:dim
        db{i} = createGrad(M, L_M, dbK_M{i}, dbK_NM{i}); 
    end
    % Add every subdomain's contribution to each accumulator.
    for i = 1:nGrp
        sd  = subdomains{i};
        [dcK_B, dbK_B] = sqexp_g(sd.hx, sd.hx, logtheta);
        K_BM   = K_NM(sd.lidx, :);
        dcK_BM = dcK_NM(sd.lidx, :);
        iLK_B  = iLK(:, sd.lidx);
        dc = precompute_grad(sd, dc, K_BM, dcK_B, dcK_BM, iLK_B);
        
        for j = 1:dim
            dbK_BM = dbK_NM{j}(sd.lidx, :);
            db{j}  = precompute_grad(sd, db{j}, K_BM, dbK_B{j}, dbK_BM, iLK_B);
        end
    end
    % Reduce the accumulators to scalar gradient entries.
    dcv = compute_grad(C_L, dc);
    dbv = zeros(dim, 1);
    for i = 1:dim
        dbv(i) = compute_grad(C_L, db{i});
    end
    
    % 2) compute gradient with respect to inducing points
    %    One pass per pseudo-input i and per coordinate k.
    dindv = zeros(M, dim);
    for i = 1:M
        [dK_NM, dK_M] = sqexp_gind(x, xb, K_NM, K_M, i, logtheta);      
        dind   = cell(1, dim);
        for k = 1:dim
            dind{k} = createGrad(M, L_M, dK_M{k}, dK_NM{k}); 
        end
        
        for j = 1:nGrp
            sd    = subdomains{j}; sd.n = sum(sd.lidx);
            dK_B  = zeros(sd.n, sd.n);  % K_B does not depend on the pseudo-inputs, so its derivative is zero
            K_BM  = K_NM(sd.lidx, :);
            iLK_B = iLK(:, sd.lidx);
            for k = 1:dim
                dK_BM = dK_NM{k}(sd.lidx, :);
                dind{k} = precompute_grad(sd, dind{k}, K_BM, dK_B, dK_BM, iLK_B);
            end
        end
        
        for k = 1:dim
            dindv(i,k) = compute_grad(C_L, dind{k});
        end
    end
    
    % 3) compute gradient with respect to noise
    %    Accumulate the inv(D_B) and inv(D_B)^2 terms over the subdomains,
    %    then combine them through the Cholesky factor C_L of Q_M.
    sig2  = exp(logtheta(end));
    KD2y  = zeros(M, 1);
    KDy   = zeros(M, 1);
    KD2K  = zeros(M, M);
    dsigv = 0;
    for i = 1:nGrp
        sd    = subdomains{i};
        D2    = sd.iD * sd.iD;
        K_BM  = K_NM(sd.lidx, :);
        dsigv = dsigv - sd.hy' * D2 * sd.hy + trace(sd.iD);
        KD2y  = KD2y + K_BM' * D2 * sd.hy;
        KDy   = KDy  + K_BM' * sd.iD * sd.hy;
        KD2K  = KD2K + K_BM' * D2 * K_BM;
    end
    KD2y  = C_L \ KD2y;
    KDy   = C_L \ KDy; 
    KD2K  = C_L \ KD2K;  KD2K = KD2K / C_L';
    dsigv = dsigv + 2*KD2y' * KDy - KDy' * KD2K * KDy - trace(KD2K);
    dsigv = sig2 * dsigv;   % chain rule: derivative w.r.t. logtheta(end)
    
    % 4) packing the gradients computed
    dnll = [reshape(dindv,M*dim,1);dbv;dcv;dsigv];
end
end

function K = sqexp(x1,x2,hyp)
% SQEXP  Squared-exponential covariance between the rows of x1 and x2.
%
%   hyp(1:end-2)  log of the per-dimension scale factors b
%   hyp(end-1)    log of the amplitude c
%   hyp(end)      not read by this function
%
%   K(i,j) = c * exp(-0.5 * sum_d b(d) * (x1(i,d) - x2(j,d))^2)
rows1 = size(x1, 1);
rows2 = size(x2, 1);
scales    = exp(hyp(1:end-2));
amplitude = exp(hyp(end-1));

% Rescale each input dimension by sqrt(b); the outer product with a
% column of ones replicates the row of scale factors for every point.
rootScale = sqrt(scales)';
u = x1.*(ones(rows1,1)*rootScale);
v = x2.*(ones(rows2,1)*rootScale);

% Pairwise squared distances from the expansion -2*u*v' + |v|^2 + |u|^2.
normV = ones(rows1,1)*sum(v.*v,2)';
normU = sum(u.*u,2)*ones(1,rows2);
sqDist = -2*u*v' + normV + normU;

K = amplitude*exp(-0.5*sqDist);
end 

% Gradient of the kernel matrix with respect to the log hyperparameters.
%
%   dK_c     derivative w.r.t. the log amplitude hyp(end-1); this is the
%            kernel matrix sqexp(x1, x2, hyp) itself.
%   dK_b{k}  derivative w.r.t. the log scale factor hyp(k) of input
%            dimension k: -0.5 * b(k) * K .* (pairwise difference)^2.
%
% Relies on the external helper dist; its result is squared element-wise.
% NOTE(review): dist(a, b) is assumed to return the n1-by-n2 matrix of
% pairwise differences between column vectors a and b -- confirm.
function [dK_c, dK_b] = sqexp_g(x1, x2, hyp)
scaleFactors = exp(hyp(1:end-2));
nDims = size(x1, 2);
dK_c  = sqexp(x1, x2, hyp);
dK_b  = cell(1, nDims);
for k = 1:nDims
    sqDiff  = dist(x1(:,k), x2(:,k)).^2;
    dK_b{k} = -0.5*scaleFactors(k).*dK_c.*sqDiff;
end
end

% Gradient of the kernel matrices with respect to the coordinates of the
% j-th inducing point (pseudo-input).
%
%   x, xb    training inputs and pseudo-inputs (one row per point)
%   K_NM     cross-covariance between x and xb
%   K_M      covariance among the pseudo-inputs
%   j        index of the pseudo-input being differentiated
%   hyp      log hyperparameters; exp(hyp(1:end-2)) are the per-dimension
%            scale factors
%
%   dK_NM{d} derivative of K_NM w.r.t. xb(j,d); only column j is non-zero.
%   dK_M{d}  derivative of K_M  w.r.t. xb(j,d); only row j and column j
%            are non-zero.
%
% Relies on the external helper dist.
% NOTE(review): dist(a, b) is assumed to return the signed differences
% a - b' -- confirm against its definition.
function [dK_NM, dK_M] = sqexp_gind(x, xb, K_NM, K_M, j, hyp)
dim   = size(x, 2);
s     = exp(hyp(1:end-2));
dK_NM = cell(1, dim);
dK_M  = cell(1, dim);
for d=1:dim
    dK_NM{d} = zeros(size(K_NM));
    dK_M{d}  = zeros(size(K_M));
    dK_NM{d}(:,j) = s(d).*dist(x(:,d), xb(j, d)).*K_NM(:,j);
    dK_M{d}(:,j)  = s(d).*dist(xb(:,d), xb(j, d)).*K_M(:,j);
    % K_M is symmetric, so its derivative must be symmetric as well:
    % entry (j,k) changes with xb(j,d) exactly as entry (k,j) does.
    % Mirror column j into row j (previously the row was set to minus the
    % column, which made the derivative antisymmetric and cancelled its
    % contribution to traces and quadratic forms downstream).
    dK_M{d}(j,:)  = dK_M{d}(:,j)';
end
end

function [dc] = createGrad(M, L_M, dcK_M, dcK_NM)
% CREATEGRAD  Initialise the accumulator struct for one gradient entry.
%
%   M        number of pseudo-inputs (sizes the zero accumulators)
%   L_M      lower Cholesky factor of K_M
%   dcK_M    derivative of K_M w.r.t. the parameter of interest
%   dcK_NM   derivative of K_NM w.r.t. the same parameter
%
% The running sums (KiDy, dKiDy, dyD3K, value, triDD) start at zero and
% dQ_M starts at dcK_M; the two whitened derivatives are computed once
% here from L_M.
    zeroCol = zeros(M, 1);
    zeroRow = zeros(1, M);
    whitenedCross = L_M\dcK_NM';        %  K_M^{-1/2} dK_MN
    whitenedInner = (L_M\dcK_M) / L_M'; %  K_M^{-1/2} dK_M K_M^{-1/2}
    dc = struct('KiDy',  zeroCol, ...
                'dKiDy', zeroCol, ...
                'dQ_M',  dcK_M, ...
                'dyD3K', zeroRow, ...
                'value', 0, ...
                'diLK',  whitenedCross, ...
                'diKM',  whitenedInner, ...
                'triDD', 0);
end

function [dc] = precompute_grad(sd, dc, K_BM, dcK_B, dcK_BM, iLK)
% PRECOMPUTE_GRAD  Add one subdomain's contribution to a gradient accumulator.
%
%   sd       subdomain struct; reads sd.iD (inverse of the block matrix
%            D_B), sd.hy (targets) and sd.lidx (row mask of the block)
%   dc       accumulator struct as created by createGrad
%   K_BM     rows of K_NM belonging to this subdomain
%   dcK_B    derivative of the within-block covariance K_B
%   dcK_BM   derivative of K_BM
%   iLK      L_M \ K_BM' for this subdomain
%
% Returns dc with KiDy, dKiDy, dQ_M, value, dyD3K and triDD updated.
    iD = sd.iD;
    y  = sd.hy;

    % Derivative of D_B = K_B - K_BM*inv(K_M)*K_MB. The cross term appears
    % once as dK_BM*inv(K_M)*K_MB and once as its transpose; both are kept
    % so that dD is symmetric. Using 2*crossTerm instead gives the same
    % traces and quadratic forms but a wrong bilinear term y'*D3*K_BM and a
    % non-symmetric contribution to dQ_M.
    crossTerm = dc.diLK(:, sd.lidx)' * iLK;
    dD = dcK_B - crossTerm - crossTerm' + iLK' * dc.diKM * iLK;

    D2 = iD * dD;   % inv(D_B) * dD
    D3 = D2 * iD;   % inv(D_B) * dD * inv(D_B)

    dc.KiDy  = dc.KiDy + K_BM' * iD * y;
    dc.dKiDy = dc.dKiDy + dcK_BM' * iD * y;
    dc.dQ_M  = dc.dQ_M + 2*dcK_BM' * iD * K_BM - K_BM' * D3 * K_BM;
    dc.value = dc.value - y' * D3 * y;
    dc.dyD3K = dc.dyD3K + y' * D3 * K_BM;
    dc.triDD = dc.triDD + trace(D2);
end

function [g] = compute_grad(C_L, dc)
% COMPUTE_GRAD  Reduce a filled accumulator struct to one gradient value.
%
%   C_L   lower-triangular factor applied to every accumulated term
%         (solved from the left, and from the right via its transpose)
%   dc    accumulator struct with fields KiDy, dKiDy, dQ_M, dyD3K, value,
%         diKM and triDD
%
%   g     scalar: the data-fit terms followed by the trace terms
    p       = C_L \ dc.KiDy;
    dp      = C_L \ dc.dKiDy;
    rowTerm = dc.dyD3K / C_L';
    W       = C_L \ dc.dQ_M;
    W       = W / C_L';

    % Data-fit contribution.
    fitPart = dc.value - 2*dp' * p + p' * W * p + 2*rowTerm * p;

    % Add the trace contributions.
    g = fitPart + dc.triDD - trace(dc.diKM) + trace(W);
end
