function embedding = prototype(set, dist_func, k, embedding_type)
%PROTOTYPE calculates the prototype embedding of a given set
%   k elements of the set are selected as prototypes; the embedding of the
%   set is whatever dpe_pdist(set, prototypes, dist_func) returns.
%PARAMETERS:
%	set - Cell array of n objects
%	dist_func(a,b) - Distance function between objects a and b
%	k - number of selected Prototypes (embedding dimension). A value larger
%	    than the set size is reduced to the set size and a warning is issued.
%	embedding_type - prototype selector (optional, default 'k-medians').
%	    Options are:
%			 'center', 'border', 'spanning', 'k-medians' and 'random'
%RETURNS:
% 	embedding - n x k Matrix of n embedding vectors in k dimensions
%ERRORS:
%	Prototype:unknownEmbeddingType - embedding_type is not one of the
%	    options listed above.

    if (nargin < 4)
        embedding_type = 'k-medians';
    end

    %There cannot be more prototypes than objects to choose from.
    if (k > length(set))
        k = length(set);
        warning('Prototype: k cannot be greater than the set size');
    end

    %Only the selected objects are needed here; the index output of the
    %selectors is not requested.
    switch embedding_type
        case 'center'
            subset = get_subset_center(set, k, dist_func);

        case 'border'
            subset = get_subset_border(set, k, dist_func);

        case 'spanning'
            subset = get_subset_spanning(set, k, dist_func);

        case 'k-medians'
            subset = get_subset_k_medians(set, k, dist_func);

        case 'random'
            subset = get_subset_random(set, k);

        otherwise
            %Without this branch an unknown selector silently produced an
            %embedding against an empty prototype set.
            if ischar(embedding_type)
                shown = embedding_type;
            else
                shown = ['<', class(embedding_type), '>'];
            end
            error('Prototype:unknownEmbeddingType', ...
                ['Unknown embedding_type ''%s''. Valid options are: ', ...
                 '''center'', ''border'', ''spanning'', ''k-medians'', ''random''.'], ...
                shown);
    end

    embedding = dpe_pdist(set, subset, dist_func);

end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [ subset, subset_indices ] = get_subset_center( set, subset_size, dist_func )
%GET_SUBSET_CENTER selects subset_size elements by repeated median removal.
%   In every round dpe_set_median is asked for the set median of the
%   elements that have not been chosen yet; that element is moved into the
%   subset and excluded from the following rounds.
%PARAMETERS:
%   set         - cell array of objects
%   subset_size - number of elements to select (a count, used as loop bound)
%   dist_func   - distance function handed through to dpe_set_median
%RETURNS:
%   subset         - the selected elements, set(subset_indices)
%   subset_indices - ascending indices of the selected elements within set

    total = length(set);
    %true marks an element that may still be chosen
    isAvailable = true(1, total);
    picks = zeros(1, subset_size);

    for step = 1:subset_size
        %Ascending indices (with respect to set) of the remaining elements.
        candidates = find(isAvailable);
        %Position of the median inside the remaining elements ...
        [~, localPos] = dpe_set_median(set(candidates), dist_func);
        %... translated back into an index of the full set.
        chosen = candidates(localPos);
        isAvailable(chosen) = false;
        picks(step) = chosen;
    end

    %The indices are reported in ascending order, not in pick order.
    subset_indices = sort(picks);
    subset = set(subset_indices);

end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [ subset, subset_indices ] = get_subset_border( set, subset_size, dist_func )
%GET_SUBSET_BORDER selects subset_size elements by repeated marginal removal.
%   In every round getSetMarginalStr is asked for the marginal element
%   (largest sum of distances) among the elements that have not been chosen
%   yet; that element is moved into the subset and excluded from the
%   following rounds.
%PARAMETERS:
%   set         - cell array of objects
%   subset_size - number of elements to select (a count, used as loop bound)
%   dist_func   - distance function handed through to getSetMarginalStr
%RETURNS:
%   subset         - the selected elements, set(subset_indices)
%   subset_indices - ascending indices of the selected elements within set

    total = length(set);
    %true marks an element that may still be chosen
    isAvailable = true(1, total);
    picks = zeros(1, subset_size);

    for step = 1:subset_size
        %Ascending indices (with respect to set) of the remaining elements.
        candidates = find(isAvailable);
        %Position of the marginal element inside the remaining elements ...
        [~, localPos] = getSetMarginalStr(set(candidates), dist_func);
        %... translated back into an index of the full set.
        chosen = candidates(localPos);
        isAvailable(chosen) = false;
        picks(step) = chosen;
    end

    %The indices are reported in ascending order, not in pick order.
    subset_indices = sort(picks);
    subset = set(subset_indices);

end

function [SOD, setMedianIndex ] = getSetMarginalStr( stringSet, dist_func )
%GETSETMARGINALSTR finds the element with the largest sum of distances.
%PARAMETERS:
%   stringSet - cell array of objects
%   dist_func - distance function, called as dist_func(a, b)
%RETURNS:
%   SOD            - the largest sum of distances from one element to all
%                    elements of stringSet
%   setMedianIndex - index (within stringSet) of the element with that sum;
%                    the first one if several elements share the maximum
nItems = length(stringSet);

%Only the part below the diagonal is evaluated: one dist_func call per pair,
%always with the later element as first argument.
lowerPart = zeros(nItems);
for row = 2:nItems
    for col = 1:(row - 1)
        lowerPart(row, col) = dist_func(stringSet{row}, stringSet{col});
    end
end

%Mirroring the lower triangle gives the full symmetric distance matrix
%(the diagonal stays zero).
pairDistances = lowerPart + lowerPart.';

rowSums = sum(pairDistances, 2);
[SOD, setMedianIndex] = max(rowSums);

end


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [ subset, subset_indices ] = get_subset_spanning( set, subset_size, dist_func )
%GET_SUBSET_SPANNING selects subset_size elements that span the set.
%   The first selected element is the set median reported by
%   dpe_set_median. Every further element is the not yet selected element
%   whose distance to its nearest already selected element is largest
%   (the first such element if there are ties).
%PARAMETERS:
%   set         - cell array of objects
%   subset_size - number of elements to select (a count)
%   dist_func   - distance function, called as dist_func(a, b)
%RETURNS:
%   subset         - 1 x subset_size cell array with the selected elements,
%                    in selection order
%   subset_indices - 1 x subset_size vector with their indices within set,
%                    in selection order
%ERRORS:
%   Prototype:subsetTooLarge - subset_size exceeds the number of elements

    set_size = length(set);

    %Nothing requested: return empty outputs instead of the set median.
    if (subset_size < 1)
        subset = cell(1, 0);
        subset_indices = zeros(1, 0);
        return;
    end

    if (subset_size > set_size)
        error('Prototype:subsetTooLarge', ...
            'Cannot select %d prototypes from a set of %d elements.', ...
            subset_size, set_size);
    end

    subset = cell(1, subset_size);
    subset_indices = zeros(1, subset_size);

    %The first element of the subset is the set median.
    [~, setMedianIndex] = dpe_set_median(set, dist_func);
    subset{1} = set{setMedianIndex};
    subset_indices(1) = setMedianIndex;

    %isFree(i) is true while set{i} has not been selected.
    isFree = true(1, set_size);
    isFree(setMedianIndex) = false;
    %nearestDist(i) holds, for a free element, the distance to its nearest
    %selected element. It is maintained incrementally, so each round only
    %needs the distances to the element selected in the previous round.
    nearestDist = zeros(1, set_size);
    newest = setMedianIndex;

    for p = 2:subset_size
        freeIndices = find(isFree);
        for idx = freeIndices
            d = dist_func(set{idx}, set{newest});
            if (p == 2)
                %First round: the only selected element is the set median.
                nearestDist(idx) = d;
            else
                nearestDist(idx) = min(nearestDist(idx), d);
            end
        end
        %Take the free element that is farthest from the current subset.
        [~, farthestPos] = max(nearestDist(freeIndices));
        newest = freeIndices(farthestPos);
        subset_indices(p) = newest;
        subset{p} = set{newest};
        isFree(newest) = false;
    end

end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [subset, subset_indices] = get_subset_k_medians(set, subset_size, dist_func)
%GET_SUBSET_K_MEDIANS selects subset_size elements with a k-medians scheme.
%   Starting from a random selection, the following two steps are repeated:
%     1. every element of the set is assigned to its nearest median,
%     2. every median is replaced by the set median (dpe_set_median) of the
%        elements assigned to it.
%   The iteration stops as soon as the sum of distances between the
%   elements and their nearest median no longer decreases, or after 15
%   rounds. The medians with the lowest sum that was evaluated are returned.
%PARAMETERS:
%   set         - cell array of objects
%   subset_size - number of medians to select (a count)
%   dist_func   - distance function handed through to dpe_pdist and
%                 dpe_set_median
%RETURNS:
%   subset         - the selected medians, set(subset_indices)
%   subset_indices - ascending indices of the selected medians within set

    %Upper bound for the number of assignment/update rounds.
    MAX_ITERATIONS = 15;

    set_size = length(set);

    %Nothing requested: there is nothing to iterate on.
    if (subset_size < 1)
        subset_indices = zeros(1, 0);
        subset = set(subset_indices);
        return;
    end

    %----------------------------------------------------------------------
    %Initial medians: a random selection (ascending indices).
    %----------------------------------------------------------------------
    [~, current_indices] = get_subset_random(set, subset_size);

    best_indices = current_indices;
    best_cost = Inf;

    for iteration = 1:MAX_ITERATIONS
        %------------------------------------------------------------------
        %Assignment step
        %------------------------------------------------------------------
        DistanceMatrix = dpe_pdist(set, set(current_indices), dist_func);
        %NOTE(review): dpe_pdist is not visible here. The matrix is brought
        %into the layout "one row per set element, one column per median"
        %whichever orientation is returned. In the square case every
        %element is a median and the correction below fixes the assignment.
        if (size(DistanceMatrix, 1) ~= set_size)
            DistanceMatrix = DistanceMatrix.';
        end
        %The dimension is given explicitly: for a single median the matrix
        %is a vector, and min without dimension would reduce it to a scalar.
        [shortestDistances, kMedianIndices] = min(DistanceMatrix, [], 2);
        cost = sum(shortestDistances);

        %Stop when the current medians are not better than the best ones.
        if ~(cost < best_cost)
            break;
        end
        best_cost = cost;
        best_indices = current_indices;

        %A median always belongs to its own cluster. With duplicate
        %elements min may have attached it to another median.
        kMedianIndices = kMedianIndices(:).';
        kMedianIndices(current_indices) = 1:subset_size;

        %------------------------------------------------------------------
        %Update step
        %------------------------------------------------------------------
        for i = 1:subset_size
            %Indices (within set) of the elements of the i-th cluster.
            members = find(kMedianIndices == i);
            %Position of the new median within the cluster ...
            [~, localPos] = dpe_set_median(set(members), dist_func);
            %... translated back into an index of the full set.
            current_indices(i) = members(localPos);
        end
        current_indices = sort(current_indices);
    end

    %Both outputs describe the same medians.
    subset_indices = best_indices;
    subset = set(subset_indices);

end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [ subset, subsetIndices ] = get_subset_random( set, subset_size )
%GET_SUBSET_RANDOM selects subset_size elements of the set at random.
%   Random elements are struck out one by one until only subset_size
%   elements are left; the elements that are left form the subset.
%PARAMETERS:
%   set         - cell array of objects
%   subset_size - number of elements to select (a count)
%RETURNS:
%   subset        - the selected elements, set(subsetIndices)
%   subsetIndices - ascending indices of the selected elements within set

total = length(set);
%Number of elements that have to be struck out.
toDrop = total - subset_size;

%keep(i) stays true as long as set{i} has not been struck out.
keep = true(1, total);
dropped = 0;
while dropped < toDrop
    candidate = randi(total);
    %A draw only counts if it hits an element that is still kept.
    if keep(candidate)
        keep(candidate) = false;
        dropped = dropped + 1;
    end
end

%The elements that were never hit are the random selection.
subsetIndices = find(keep);
subset = set(subsetIndices);

end


