function gradF = gradVol_RBF( X, alpha, Sigma, rbf, id, Y )
	% GRADVOL_RBF  Gradient of the volume penalty for an RBF-based representation.
	%
	% F (nSmp x K) is the RBF representation of the prediction function and
	% Y (nSmp x K) is Softmax(F).
	%
	% Inputs:
	%   X     - nSmp x d data matrix (one sample per row).
	%   alpha - RBF coefficients with K columns; multiplied elementwise with an
	%           (nSmp x K) matrix and left-multiplied by a row of rbf below, so
	%           it is expected to be nSmp x K.
	%   Sigma - kernel width(s); each row of rbf is divided elementwise by it.
	%           NOTE(review): presumably a scalar or a 1 x nSmp row -- confirm
	%           against the caller.
	%   rbf   - kernel matrix; row iSmp is used for sample iSmp.
	%   id    - index of the output dimension that is dropped from the softmax
	%           Jacobian (the remaining K-1 outputs parameterize the embedding).
	%   Y     - nSmp x K matrix, Softmax(F).
	%
	% Output:
	%   gradF - nSmp x K gradient with respect to F.
	%
	% Requires the MEX routines mex_df1_RBF and mex_df2_RBF_fast.
	%
	% Qinxun Bai, Feb 2016

	% dimension of the data space and of the embedded space
	[nSmp, d] = size(X);
	K = size(alpha, 2);

	% compute first derivatives of F
	df1 = mex_df1_RBF(X', alpha', rbf'); % K x d*nSmp
	df1 = reshape(df1, K, d, nSmp);

	% indices of the K-1 outputs that are kept (everything except id);
	% this does not depend on the sample, so build it once
	idx     = 1:K;
	idx(id) = [];

	dY2_g_inv = zeros(1, K-1);
	gradF     = zeros(nSmp, K);

	for iSmp = 1 : nSmp
		% compute first derivatives of Y: dY1
		Yi         = Y(iSmp, :);
		diagY      = diag(Yi);        % K x K
		outerY     = Yi' * Yi;        % K x K
		dYf1_full  = diagY - outerY;  % K x K, softmax Jacobian dY/dF
		dYf1       = dYf1_full;
		dYf1(id,:) = [];              % (K-1) x K, row of the dropped output removed
		df1i       = df1(:,:,iSmp);
		dY1        = dYf1 * df1i;     % (K-1) x d

		% compute Riemannian metric g and its inverse
		g             = eye(d) + dY1'*dY1;
		g_inv         = g\eye(d);
		g_inv_reshape = reshape(g_inv, d^2, 1);

		% per-sample quantities for the second derivatives of F. They are
		% computed here, one sample at a time, instead of being cached for all
		% samples up front, which would need O(nSmp^2) storage.
		dX_iSmp         = X - repmat(X(iSmp,:),nSmp,1);               % nSmp x d
		rbf_Sigma_iSmp  = 4*repmat( (rbf(iSmp,:)./Sigma)',1,K);       % nSmp x K
		alpha_rbf_Sigma = rbf_Sigma_iSmp.*alpha;                      % nSmp x K
		alpha_rbf       = 2*( rbf(iSmp,:)*alpha )';                   % K x 1
		df2             = mex_df2_RBF_fast(dX_iSmp, alpha_rbf_Sigma, alpha_rbf); % K x d^2

		% compute second derivatives of Y: dY2
		% part of the softmax Hessian that is shared by all kept outputs
		dYf2_base = 2*outerY - diagY; % K x K
		for k = 1:K-1
			c         = idx(k);       % original index of the k-th kept output
			dYf2      = dYf2_base;
			dYf2(c,:) = dYf2(c,:) - Yi;
			dYf2(:,c) = dYf2(:,c) - Yi';
			dYf2(c,c) = dYf2(c,c) + 1; % K x K
			dYf2      = Yi(c) * dYf2;  % Hessian of Y_c with respect to F
			% chain rule: df1' * (d2Y/dF2) * df1 + (dY/dF) * d2F/dX2
			dY2          = reshape(df1i'*dYf2*df1i, 1,d^2) + dYf1(k,:)*df2; % 1 x d^2
			dY2_g_inv(k) = dY2 * g_inv_reshape;
		end
		negTrII = dY2_g_inv*dY1*g_inv*dY1' - dY2_g_inv; % 1 x (K-1)
		% re-insert the dropped output: its entry is minus the sum of the others
		gradY   = [negTrII(1:id-1), -sum(negTrII), negTrII(id:end)]; % 1 x K

		% if outside of the simplex, scale it back: every component with
		% Yi(k)*gradY(k) > 0 proposes the step scale 10*Yi(k)/gradY(k); the
		% smallest proposal, capped at 1, is applied to the whole gradient
		lambda = ones(1,K);
		for k = 1 : K
			if Yi(k)*gradY(k) > 0
				lambda(k) = 10*Yi(k)/gradY(k);
			end
		end
		lambda = min(min(lambda),1);
		gradY  = lambda * gradY;

		% compute gradF by pulling gradY back through the softmax Jacobian
		gradF(iSmp,:) = gradY * dYf1_full;

	end

end
