% This is an implementation of sparsemap published in 'cluster-preserving
% embedding of proteins' (1999) by Hristescu and Farach-Colton.
%
% O:            list of cells which contains the items of the source space
% dissFunc:     dissimilarity function (O x O -> R)
% pSigma:       portion (0, 1] of used items for calculating the true distance
% pRes:         portion (0, 1] of the used items in greedy resampling
% k:            embedding dimension

function [E] = sparsemap(set, dist_func, pSigma, pRes, k)
    % get the number of items
    NItems = numel(set);

    % Number of columns for the reference matrix. Every column contains the
    % same number of items
    NSetsCols = floor(log2(NItems));

    % Number of rows in the reference matrix. Every set in a row contains
    % 2^i items. The parameter pRows can reduce it.
    NSetsRows = floor(log2(NItems));

    % Number of reference sets
    NSets = NSetsRows * NSetsCols;
    
    % The reference sets are aligned as row vector.
    R = cell(1, NSets);

    % Creates the matrix of reference sets R. Every cell contains a set.
    XiIndex = 1;
    for i=1:NSetsRows
        for j=1:NSetsCols
            % Generates a permutation of 2^i items
            Xi = {randperm(NItems, 2^(i))};
            % Stores the set
            R(XiIndex) = Xi;
            % if the stored set contains all available items
            if numel(R{XiIndex}) == NItems
                % remove unnecessary cells 
                R((XiIndex+1):NSets) = [];
                % adjust the number of available reference cets
                NSets = numel(R);
                % break the initialisation
                break;
            end
            % init the next reference set
            XiIndex = XiIndex + 1;
        end
    end

    % Matrix for lipschitz embedding
    E = zeros(NItems, NSets);

    % For every reference set with Index i
    for i=1:NSets
		% Gets the current reference set Xi. Xi contains the global ids.
		Xi = R{i};
	
		% Gets the number of items in the current reference set
		NItemsXi = length(Xi);
	
        % For every item with index p of O
        for p=1:NItems
            % For every item q of the current reference set, compute
            % the approximated distance to p
            approxDistXi = zeros(1, NItemsXi);
            for q=1:NItemsXi
                % if there are embedded features available,
                % the approximated distances will be calculated.
                if i > 1
                    approxDistXi(q) = sum((E(p, 1:i-1) ...
                        - E(Xi(q), 1:i-1)).^2);
                else
                    approxDistXi(q) = dist_func(set{p}, set{Xi(q)});
                end
            end

            % if there are embedded features available, the current distances
            % are approximated and a subset of true distances will be calculated.
            if i > 1
                % Calculates an order of the approximated distances.
                % The approxDistXiOrder stores the increasing ranking of item ids
                % of the current reference set.
                [~, approxDistXiOrder] = sort(approxDistXi);

                % Choose the number of items in the current subset for which
                % the true distance will be calculated. Minimum 1.
                NTrueDistXiSub = ceil(length(approxDistXi) * pSigma);

                % Calculates the true distances between p and q
                trueDistXiSub = zeros(1, NTrueDistXiSub);
                for q=1:NTrueDistXiSub
                    trueDistXiSub(q) = dist_func(set{p}, ...
                        set{Xi(approxDistXiOrder(q))});
                end
            else
                trueDistXiSub = approxDistXi;
            end

            % the minimum true distance is the k' feature for p
            E(p, i) = min(trueDistXiSub);
        end
    end

    % now we can perform greey resampling to reduce the dimensions
    E = greedyResampling(set, dist_func, E, pRes, k);
end

% This function implements the greedy resampling technic of sparsemap. It
% reduces the number of dimensions to the best k dimensions.
%
% O:            list of cells which contains the items of the source space
% dissFunc:     dissimilarity function (O x O -> R)
% E:            Embedding which will be reduced to k dimensions
% pRes:         portion as value (0:1] of considered distances
% k:            new number of dimensions
function Ek = greedyResampling(set, dist_func, E, pRes, k)
    % returns if the embedding has already equal or less dimensions then k
    if size(E, 2) <= k
        Ek = E;
        return;
    end

    % get the number of items
    NItems = numel(set);
    % chooses a random set of ids
    subset = randperm(NItems, round(NItems * pRes));
    % get the subset items
    OSubset = set(subset);
    
    % Calculates the euclidean distances of the items to subset ids
    distSubsetO = squareform(pidist(OSubset', dist_func));

    % The destination embedding space with k dimensions
    Ek = zeros(NItems, k);

    % for the number of destination dimensions
    for ki=1:k
        % Current number of available features in E
        NFeatures = size(E, 2);
        % line with all stresses for the current features which are
        % combined with the so far created embedding space
        kiFStress = zeros(1, NFeatures);
        % for every considered feature of E
        for fi=1:NFeatures
            % combines the current Feature fi with the |ki| features of E
            f = [Ek(:, 1:ki-1), E(:, fi)];
            % calculates the euclidean distances of the current features
            fDist = pdist(f(subset, :));
            % calculates the distance between the features and the
            % distances
            kiFStress(fi) = sum((fDist - distSubsetO).^2);
        end
        % get the index of the best feature combination
        [~, bestF] = min(kiFStress);
        % copy the feature which was combinated to the embedding space Ek
        Ek(:, ki) = E(:, bestF);
        % remove the feature from the full embedding
        E(:, bestF) = [];
    end
end

% This function calculates the pairwise distance between the items of the
% set
%
% set:          row list of cells
% func:         distance function
function dists = pidist(set, dist_func)
    % gets the number of items
    N = numel(set);
    % creates a distance matrix
    dists = zeros(N,N);
    % for every row
    for i = 1:N
        % for every column
        for j = 1:N
            % if the items are different
            if (i ~= j)
                % calculate the distance
                dists(i,j) = dist_func(set{i}, set{j});
                % clone the distance
                dists(j,i) = dists(i,j);
            end
        end
    end
end
