%% ARK (Aggregation of Reads by K-means) framework algorithm

% ARK framework will be used for SEK, Quikr and Taxy methods (these three
% methods were earlier used for bacterial community composition estimation
% and published in Bioinformatics journal)

% SEK paper title:- "Sek: Sparsity exploiting k-mer-based estimation of bacterial community composition"
% Quikr paper title:- "Quikr: a method for rapid reconstruction of bacterial communities via compressive sensing"
% Taxy paper title:- "Mixture models for analysis of the taxonomic composition of metagenomes"


%% ARK method 

% For experimental evaluation, Saikat Chatterjee used the mock communities database of 16S rRNA,
% as used in the BeBAC and SEK papers (both papers are in the Bioinformatics journal)

% Start from an empty workspace and a clean command window.
clear all;
clc


%% Parameters

% k-mer length and database size
k           = 4;     % length of the k-mers
NoOfSpecies = 21;    % number of species in the reference/read database

% SEK: settings of OMP^{+,1} (Algorithm 1 in the SEK paper)
I  = 100;            % upper bound on the number of OMP^{+,1} iterations
nu = 1e-5;           % tolerance used in the l1-norm computation of OMP^{+,1}

% Quikr: weighting parameter
lambda = 1e4;

% ARK clustering (LBG-based K-means), shared by SEK, Quikr and Taxy
eta             = 5e-4;  % user choice: convergence tolerance of the clustering loop
MaxNoOfClusters = 32;    % user choice: Q_{max}, the largest number of clusters allowed


%% Ground truth
% GroundTruth.mat supplies sol_species (normalised below) and seq2species
% (consumed later when the Taxy/Quikr system matrix is generated).
load('GroundTruth.mat');
true_solution = sol_species';
true_solution = true_solution / sum(true_solution);   % scale so the entries sum to one


%% System matrices (off-line step: reference sequences -> k-mer vectors)

% Taxy and Quikr work on the same system matrix, so one call serves both.
A = generate_ref_kmer_vector_taxy('Reference.fasta', NoOfSpecies, seq2species, k);

% The SEK system matrix takes longer to compute. It is therefore cached on
% disk and only regenerated when the cache file is not found.
if ~exist('kmer_vector_allref_trans_SEK.mat', 'file')
    REF_seq = fastaread('Reference.fasta');
    generate_ref_kmer_vector_SEK(REF_seq, k);                  % saved as kmer_vector_allref_SEK.mat
    allref_vector_compile_SEK('kmer_vector_allref_SEK.mat');   % saved as kmer_vector_allref_trans_SEK.mat
end

% Bring the cached SEK data into the workspace.
load('kmer_vector_allref_trans_SEK.mat');
X = all_species_kmer_trans;   % system matrix handed to SEK


%% k-mer vectors of the reads (on-line step)
% SEK, Quikr and Taxy all consume the same read k-mer vectors, so they are
% computed by a single routine. The result is cached on disk; the
% computation is skipped whenever the cache file already exists.
if ~exist('kmer_vector_reads_SEK.mat', 'file')
    READ_seq = fastaread('Measurement.fasta');
    generate_read_kmer_vector_SEK(READ_seq, k);   % saved as kmer_vector_reads_SEK.mat
end

% Loads the read k-mer data (kmer_vector_reads is used by the ARK loops).
load('kmer_vector_reads_SEK.mat');



%% ARK Simulation
%% We use SEK, Taxy and Quikr


%% ARK-SEK simulation (ARK_SEK)
% Repeatedly clusters the read k-mer vectors with LBG, runs SEK:OMP^{+,1} on
% every cluster mean, and combines the per-cluster estimates weighted by the
% cluster probabilities. The loop stops once the l1 change between two
% successive composition estimates is no larger than eta, or once
% MaxNoOfClusters clusters have been reached.

% Initialization
Composition_ARK_SEK = [];          % one row per iteration: estimated composition
ChangeInComposition_ARK_SEK = 1;   % start above eta so the loop is entered
NoOfClusters_SEK = 0;              % 0 marks "no clustering done yet"
data_ARK_SEK = [];                 % rows: [no. of clusters, change in composition, VD]

tstart_ARK_SEK = tic;

while (ChangeInComposition_ARK_SEK > eta) && (NoOfClusters_SEK < MaxNoOfClusters)  % (stopping criteria for LBG based clustering)

    % ---- This is LBG based K-means clustering ----
    % C_ARK_SEK holds the cluster mean vectors, one per row.
    if NoOfClusters_SEK == 0
        % First pass: a single cluster whose mean is the mean over all reads.
        C_ARK_SEK = mean(kmer_vector_reads);
        ClusterProbability = [1];
    else
        % Per the original author's note, LBG returns one more cluster than it was given.
        [C_ARK_SEK, ClusterProbability] = LBG(kmer_vector_reads, C_ARK_SEK, ClusterProbability);
    end

    NoOfClusters_SEK = length(ClusterProbability);

    % After clustering, SEK:OMP^{+,1} is used for each cluster
    Mu_ARK_SEK = C_ARK_SEK';   % cluster mean vectors as columns
    result_ARK_SEK = zeros(1, NoOfSpecies);

    for i = 1:NoOfClusters_SEK
        [tmp, ~] = OMP_plus_1(X, Mu_ARK_SEK(:,i), nu, I);

        % Fold the per-fragment solution into per-species totals using the
        % fragment -> species lookup.
        tmp_ARK_SEK = zeros(1, NoOfSpecies);
        for j = 1:length(tmp)
            if tmp(j) ~= 0
                tmp_ARK_SEK(fragment2species(j)) = tmp_ARK_SEK(fragment2species(j)) + tmp(j);
            end
        end

        % Linear combination of the per-cluster estimates.
        result_ARK_SEK = result_ARK_SEK + ClusterProbability(i)*tmp_ARK_SEK;
    end

    % Half the l1 distance to the ground truth (left unsuppressed on purpose
    % so the value is echoed to the command window, as in the original).
    VD_ARK_SEK = 0.5 * norm((true_solution - result_ARK_SEK), 1)

    % A previous estimate exists only from the second pass onwards.
    if NoOfClusters_SEK > 1
        ChangeInComposition_ARK_SEK = norm((Composition_ARK_SEK(end,:) - result_ARK_SEK), 1);
    end

    % %g is used because the change is a non-integer value.
    fprintf('VD with in current and previous iterations at %u clusters: %g\n', NoOfClusters_SEK, ChangeInComposition_ARK_SEK)

    data_ARK_SEK = [data_ARK_SEK; NoOfClusters_SEK ChangeInComposition_ARK_SEK VD_ARK_SEK];

    Composition_ARK_SEK = [Composition_ARK_SEK; result_ARK_SEK];

end

elapsedtime_ARK_SEK = toc(tstart_ARK_SEK);

disp('Elapsed time till convergence:'); elapsedtime_ARK_SEK
disp('Number of clusters at convergence:'); NoOfClusters_SEK



%% ARK-Taxy simulation
% Same outer loop as for ARK-SEK, with Taxy (L2_min_L1_constraint on the
% shared system matrix A) applied to every cluster mean. The loop stops
% once the l1 change between two successive composition estimates is no
% larger than eta, or once MaxNoOfClusters clusters have been reached.

% Initialization
Composition_ARK_Taxy = [];          % one row per iteration: estimated composition
ChangeInComposition_ARK_Taxy = 1;   % start above eta so the loop is entered
NoOfClusters_Taxy = 0;              % 0 marks "no clustering done yet"
data_ARK_Taxy = [];                 % rows: [no. of clusters, change in composition, VD]

tstart_ARK_Taxy = tic;

while (ChangeInComposition_ARK_Taxy > eta) && (NoOfClusters_Taxy < MaxNoOfClusters)  % (stopping criteria for LBG based clustering)

    % ---- This is LBG based K-means clustering ----
    % C_ARK_Taxy holds the cluster mean vectors, one per row.
    if NoOfClusters_Taxy == 0
        % First pass: a single cluster whose mean is the mean over all reads.
        C_ARK_Taxy = mean(kmer_vector_reads);
        ClusterProbability = [1];
    else
        % Per the original author's note, LBG returns one more cluster than it was given.
        [C_ARK_Taxy, ClusterProbability] = LBG(kmer_vector_reads, C_ARK_Taxy, ClusterProbability);
    end

    NoOfClusters_Taxy = length(ClusterProbability);

    % After clustering, Taxy is used for each cluster
    Mu_ARK_Taxy = C_ARK_Taxy';   % cluster mean vectors as columns
    result_ARK_Taxy = zeros(1, NoOfSpecies);

    for i = 1:NoOfClusters_Taxy
        tmp_ARK_Taxy = L2_min_L1_constraint(A, Mu_ARK_Taxy(:,i))';
        % Linear additive composition estimate.
        result_ARK_Taxy = result_ARK_Taxy + ClusterProbability(i)*tmp_ARK_Taxy;
    end

    % Half the l1 distance to the ground truth (left unsuppressed on purpose
    % so the value is echoed to the command window, as in the original).
    VD_ARK_Taxy = 0.5 * norm((true_solution - result_ARK_Taxy), 1)

    % A previous estimate exists only from the second pass onwards.
    if NoOfClusters_Taxy > 1
        ChangeInComposition_ARK_Taxy = norm((Composition_ARK_Taxy(end,:) - result_ARK_Taxy), 1);
    end

    % %g is used because the change is a non-integer value.
    fprintf('VD with in current and previous iterations at %u clusters: %g\n', NoOfClusters_Taxy, ChangeInComposition_ARK_Taxy)

    data_ARK_Taxy = [data_ARK_Taxy; NoOfClusters_Taxy ChangeInComposition_ARK_Taxy VD_ARK_Taxy];

    Composition_ARK_Taxy = [Composition_ARK_Taxy; result_ARK_Taxy];

end


elapsedtime_ARK_Taxy = toc(tstart_ARK_Taxy);

disp('Elapsed time till convergence:'); elapsedtime_ARK_Taxy
disp('Number of clusters at convergence:'); NoOfClusters_Taxy



%% ARK-Quikr simulation
% Same outer loop as for ARK-SEK and ARK-Taxy, with Quikr applied to every
% cluster mean: a non-negative least-squares solve on the system matrix A
% scaled by lambda and augmented with a row of ones. The loop stops once
% the l1 change between two successive composition estimates is no larger
% than eta, or once MaxNoOfClusters clusters have been reached.

% Initialization
Composition_ARK_Quikr = [];          % one row per iteration: estimated composition
ChangeInComposition_ARK_Quikr = 1;   % start above eta so the loop is entered
NoOfClusters_Quikr = 0;              % 0 marks "no clustering done yet"
data_ARK_Quikr = [];                 % rows: [no. of clusters, change in composition, VD]

tstart_ARK_Quikr = tic;

% The augmented matrix depends only on A and lambda, so it is built once
% here (inside the timed region) rather than once per cluster per iteration.
A_Quikr_m = [ones(1, size(A,2)); lambda*A];

while (ChangeInComposition_ARK_Quikr > eta) && (NoOfClusters_Quikr < MaxNoOfClusters)  % (stopping criteria for LBG based clustering)

    % ---- This is LBG based K-means clustering ----
    % C_ARK_Quikr holds the cluster mean vectors, one per row.
    if NoOfClusters_Quikr == 0
        % First pass: a single cluster whose mean is the mean over all reads.
        C_ARK_Quikr = mean(kmer_vector_reads);
        ClusterProbability = [1];
    else
        % Per the original author's note, LBG returns one more cluster than it was given.
        [C_ARK_Quikr, ClusterProbability] = LBG(kmer_vector_reads, C_ARK_Quikr, ClusterProbability);
    end

    NoOfClusters_Quikr = length(ClusterProbability);

    % After clustering, Quikr is used for each cluster
    Mu_ARK_Quikr = C_ARK_Quikr';   % cluster mean vectors as columns
    result_ARK_Quikr = zeros(1, NoOfSpecies);

    for i = 1:NoOfClusters_Quikr
        % Right-hand side matching the augmented matrix: a leading zero for
        % the row of ones, then the lambda-scaled cluster mean.
        s = [0; lambda*Mu_ARK_Quikr(:,i)];
        tmp_ARK_Quikr = lsqnonneg(A_Quikr_m, s)';
        % Linear additive composition estimate.
        result_ARK_Quikr = result_ARK_Quikr + ClusterProbability(i)*tmp_ARK_Quikr;
    end

    % Half the l1 distance to the ground truth (left unsuppressed on purpose
    % so the value is echoed to the command window, as in the original).
    VD_ARK_Quikr = 0.5 * norm((true_solution - result_ARK_Quikr), 1)

    % A previous estimate exists only from the second pass onwards.
    if NoOfClusters_Quikr > 1
        ChangeInComposition_ARK_Quikr = norm((Composition_ARK_Quikr(end,:) - result_ARK_Quikr), 1);
    end

    % %g is used because the change is a non-integer value.
    fprintf('VD with in current and previous iterations at %u clusters: %g\n', NoOfClusters_Quikr, ChangeInComposition_ARK_Quikr)

    data_ARK_Quikr = [data_ARK_Quikr; NoOfClusters_Quikr ChangeInComposition_ARK_Quikr VD_ARK_Quikr];

    Composition_ARK_Quikr = [Composition_ARK_Quikr; result_ARK_Quikr];

end

elapsedtime_ARK_Quikr = toc(tstart_ARK_Quikr);

disp('Elapsed time till convergence:'); elapsedtime_ARK_Quikr
disp('Number of clusters at convergence:'); NoOfClusters_Quikr



%% Saving outputs
% Stores timings, per-iteration statistics and per-iteration composition
% estimates of all three ARK variants, then visualises them.

save('Convergencedata.mat', ...
     'elapsedtime_ARK_SEK', 'elapsedtime_ARK_Taxy', 'elapsedtime_ARK_Quikr', ...
     'data_ARK_SEK', 'Composition_ARK_SEK', ...
     'data_ARK_Taxy', 'Composition_ARK_Taxy', ...
     'data_ARK_Quikr', 'Composition_ARK_Quikr');

disp ('---------------------------------');
disp('Elapsed time till convergence: ARK_SEK, ARK_Taxy, ARK_Quikr');
elapsedtime_ARK_SEK
elapsedtime_ARK_Taxy
elapsedtime_ARK_Quikr


% Plot of VD for ARK-SEK, ARK-Taxy, ARK-Quikr
% (column 1 of data_* is the number of clusters, column 3 is the VD).
% A dedicated figure is opened so the subplots do not overwrite whatever
% figure happens to be current, e.g. a bar chart left from a previous run.
figure;
subplot(1,3,1); plot(data_ARK_SEK(:,1), data_ARK_SEK(:,3)); xlabel('Number of clusters'); ylabel('VD'); legend('ARK-SEK');
subplot(1,3,2); plot(data_ARK_Taxy(:,1), data_ARK_Taxy(:,3)); xlabel('Number of clusters'); ylabel('VD'); legend('ARK-Taxy');
subplot(1,3,3); plot(data_ARK_Quikr(:,1), data_ARK_Quikr(:,3)); xlabel('Number of clusters'); ylabel('VD'); legend('ARK-Quikr');


% Bar charts, one figure per method: ground truth, the estimate from the
% first iteration (single cluster, i.e. the plain method) and the estimate
% from the last iteration (ARK).
BarSpecs = {Composition_ARK_SEK,   'SEK'; ...
            Composition_ARK_Taxy,  'Taxy'; ...
            Composition_ARK_Quikr, 'Quikr'};
BarX = 1:NoOfSpecies;   % x positions follow the species count rather than a literal 21

for m = 1:size(BarSpecs, 1)
    CompositionHistory = BarSpecs{m,1};
    MethodName         = BarSpecs{m,2};

    BarYY = [true_solution; CompositionHistory(1,:); CompositionHistory(end,:)];
    figure;
    bar(BarX, BarYY', 1);
    legend('Ground Truth', MethodName, ['ARK-' MethodName]);
    axis([0 NoOfSpecies+1 0 0.35]);
    xlabel('species'); ylabel('proportion');
end


