%% Hepatocytes_reconstruction%%
% This script creates the data used in Figure 4.

% Put the project's helper functions (including sub-folders) on the path.
addpath(genpath('./helperFunctions'));

% Load only the variables this script uses from the input data file.
load ./inputData.mat hint param tbl hlz

% Create the output folder only when it is missing: calling mkdir on an
% existing folder emits a "Directory already exists" warning on every re-run.
outputDir='hepatocytes_reconstruction_output';
if exist(outputDir,'dir')~=7
    mkdir(outputDir);
end

%% Create coarse-grained expression for the 'non-malignant' cells
% Group by every (seurat cluster, treatment) pair, then average the rows of
% hint.mat_norm' that fall in each group; E ends up with one column per group.
[G,iident,treat] = findgroups(hint.metadata.seurat_clusters, hint.metadata.treatment);
group_means = splitapply(@(x) mean(x,1), hint.mat_norm', G);
E = group_means';

% Keep only the groups whose treatment label is 'far' (case-insensitive).
ind_far = find(strcmpi(treat,'far'));
E = E(:,ind_far);

% Translate the cluster identifiers of the retained groups into cluster names.
[~,far_rows] = ismember(iident(ind_far), tbl.cellNumber);
cnms = tbl.cellName(far_rows);

% Attach the matching cluster name to every entry of hint.metadata.seurat_clusters.
[~,locb] = ismember(hint.metadata.seurat_clusters, tbl.cellNumber);
hint.clusterNames = tbl.cellName(locb);

%% Select Hepatocytes marker genes
cell_type = 'Hepatocytes';

% Step 1: cell-type-specific markers (thresholds are passed to find_markers)
FOLD_FACTOR = 2;
EXP_THRESH  = 1*10^-5;
[marker_genes, marker_indices_lcm, marker_indices_sc] = ...
    find_markers(E, FOLD_FACTOR, EXP_THRESH, hint, hlz);

% Step 2: LM genes from the LCM data, restricted to the marker genes found above.
% The position of the cell type among the coarse-grained cluster names is
% handed to extract_lm_genes together with the thresholds.
COM_PC_THRESH = 2.5;
COM_PP_THRESH = 4.5;
EXP_THRESH    = 10^-5;
cell_type_col = find(strcmpi(cnms, cell_type));
[pp_lm, pc_lm, pp_lm_ind, pc_lm_ind] = extract_lm_genes( ...
    hlz, marker_indices_lcm, cell_type_col, COM_PC_THRESH, COM_PP_THRESH, EXP_THRESH);

% Step 3: reconstruct zonation by averaging cells stratified by zone
MAX_THRESH = 10^-5;
NUM_ZONES  = 6;
zone_struct = produce_zonation_matrix( ...
    hint, pc_lm, pp_lm, cell_type, MAX_THRESH, NUM_ZONES);

%% Reconstruct zonation by averaging cells stratified by zone from Aizarani et al.
% Same reconstruction as for hint, applied to the grun data set.
load('inputData.mat','grun');

MAX_THRESH = 10^-5;
NUM_ZONES  = 6;

% Expose the gene and cell-type fields under the names all_genes and
% clusterNames (presumably the field names produce_zonation_matrix reads -
% confirm against the helper).
grun.all_genes    = grun.gene_name;
grun.clusterNames = grun.cell_type;

zone_struct_grun = produce_zonation_matrix( ...
    grun, pc_lm, pp_lm, 'Hepatocytes', MAX_THRESH, NUM_ZONES);

%% Make heatmap (Figure 4c)
% Rows shown: genes with qval<0.05 whose maximum over zones exceeds 10^-5,
% ordered by increasing zone_struct_grun.com.
indin = find(zone_struct_grun.qval<0.05 & max(zone_struct_grun.Z,[],2)>10^-5);
[y,ord] = sort(zone_struct_grun.com(indin));
n_zones  = size(zone_struct_grun.Z,2);
Z_sorted = zone_struct_grun.Z(indin(ord),:);

figure('Units','centimeters','Position',[54.5306   14.2610   5.1   8.1]);
imagesc(Z_sorted./max(Z_sorted,[],2));      % every row divided by its own maximum

% black vertical separators at each zone boundary
for k = 1:n_zones+1
    line([k-0.5 k-0.5],ylim,'color','k')
end

% Small colorbar; the axes position is read after the colorbar is created and
% written back once the colorbar has been resized.
hcb = colorbar;
axes_pos = get(gca,'Position');

hcb.Position = [hcb.Position(1:2) 0.02 0.15];
ylabel(hcb,'relative expression');
hcb.FontSize = 6;
set(gca,'position',axes_pos);

% zone tick labels: 'L' followed by the zone number
zone_labels = [repmat('L',n_zones,1),num2str((1:n_zones)')];
set(gca,'XTick',1:n_zones,'XTickLabel',zone_labels,'FontSize',8);

% Plot representative pericentral genes: one panel per gene on a 3x2 grid,
% showing the zone profile Z with ZSE passed to plot_patch.
gg = {'CYP1A1','CYP2E1','FASN','DPP4','OAT','NOTUM'};
% Genes of gg absent from the heatmap selection; the result is suppressed and
% not stored, so this only acts as a check when run without the semicolon.
setdiff(gg,zone_struct_grun.gene_name(indin));

n_zones     = size(zone_struct_grun.Z,2);
zone_labels = [repmat('L',n_zones,1),num2str((1:n_zones)')];

figure('Units','centimeters','Position',[54.5306 11.3506 4.8154 6]);
for i = 1:length(gg)
    subplot(3,2,i);
    ind = find(strcmpi(zone_struct_grun.gene_name,gg{i}));
    plot_patch(1:n_zones, zone_struct_grun.Z(ind,:), zone_struct_grun.ZSE(ind,:), 'b');
    hold on;
    title(gg{i},'FontSize',7);
    box on;
    axis tight;
    ylim([0 max(ylim)]);                      % y axis starts at zero
    set(gca,'FontSize',6,'XTick',1:n_zones,'XTickLabel',zone_labels);
end

% Set the width of every line object that findobj returns (the search is not
% restricted to this figure).
ll = findobj('Type','line');
set(ll,'LineWidth',1);

% Plot a second set of representative genes, one panel per gene on a 3x2 grid.
% NOTE(review): the original header repeated "pericentral" from the previous
% panel; this gene list looks like the periportal counterpart - confirm.
gg = {'HAL','SDS','PCK1','HAMP','ASS1','CRP'};
% Genes of gg absent from the heatmap selection; the result is suppressed and
% not stored, so this only acts as a check when run without the semicolon.
setdiff(gg,zone_struct_grun.gene_name(indin));

n_zones     = size(zone_struct_grun.Z,2);
zone_labels = [repmat('L',n_zones,1),num2str((1:n_zones)')];

figure('Units','centimeters','Position',[59.9546 11.3771 4.8154 6]);
for i = 1:length(gg)
    subplot(3,2,i);
    ind = find(strcmpi(zone_struct_grun.gene_name,gg{i}));
    plot_patch(1:n_zones, zone_struct_grun.Z(ind,:), zone_struct_grun.ZSE(ind,:), 'b');
    title(gg{i});
    box on;
    axis tight;
    ylim([0 max(ylim)]);                      % y axis starts at zero
    set(gca,'FontSize',6,'XTick',1:n_zones,'XTickLabel',zone_labels);
end

% Set the width of every line object that findobj returns (the search is not
% restricted to this figure).
ll = findobj('Type','line');
set(ll,'LineWidth',1);

%% Compute the fraction of zonated genes
% a: genes passing both the q-value and the expression threshold
% b: genes passing the expression threshold alone
EXP_THRESH  = 10^-5;
QVAL_THRESH = 0.2;
is_significant = zone_struct_grun.qval<QVAL_THRESH;
is_expressed   = max(zone_struct_grun.Z,[],2)>EXP_THRESH;
a = nnz(is_significant & is_expressed);
b = nnz(is_expressed);
display([num2str(a) ' genes with qval<' num2str(QVAL_THRESH) ' out of ' num2str(b) ' genes with expression above ' num2str(EXP_THRESH)]);

%% Export spatial reconstructed zonation (Table S7)
% Columns: gene name, per-zone mean (Z), per-zone SEM (ZSE), pval, qval.
tmp1=table(zone_struct_grun.gene_name,'VariableNames',{'genes'});
A=[zone_struct_grun.Z, zone_struct_grun.ZSE, zone_struct_grun.pval, zone_struct_grun.qval];
B = num2cell(A);
B(isnan(A)) ={'NaN'};       % write missing values as the text 'NaN'

% Derive the mean_k / SEM_k column names from the actual matrix widths rather
% than hard-coding six zones; for six zones the names are unchanged.
meanNames=arrayfun(@(k) sprintf('mean_%d',k),1:size(zone_struct_grun.Z,2),'UniformOutput',false);
semNames=arrayfun(@(k) sprintf('SEM_%d',k),1:size(zone_struct_grun.ZSE,2),'UniformOutput',false);
tmp2=cell2table(B,'VariableNames',[meanNames, semNames, {'pval','qval'}]);
t=[tmp1, tmp2];

% export table
fileName='Table_S7.xlsx';
outFile=fullfile(outputDir,fileName);
headerLine={'Table S7 - mean expression of the reconstructed hepatocytes cells per zone'};

% Spreadsheet writes only touch the cells covered by the new data, so a
% workbook left over from an earlier run with more rows would keep its old
% trailing rows. Start from a clean file.
if exist(outFile,'file')==2
    delete(outFile);
end
writecell(headerLine, outFile, 'Range', 'A1');
writetable(t, outFile, 'WriteVariableNames',true, 'Range', 'A3');

%% Global validation - take all genes that are zonated in scRNAseq,
% expressed highly in LCM and calculate R. Then compare to random re-assignments
MAX_EXP_SC=1*10^-5;
MAX_EXP_LCM=5*10^-4;
QVAL_THRESH=0.1;

% genes that are significantly zonated and expressed in the reconstruction
indin=find(zone_struct_grun.qval<QVAL_THRESH & max(zone_struct_grun.Z,[],2)>MAX_EXP_SC);
g=zone_struct_grun.gene_name(indin);

% Locate those genes in the LCM data (case-insensitive) and drop the ones
% that are absent. ismember returns the first match, so a duplicated gene
% name no longer breaks the lookup.
[~,ind2_all]=ismember(upper(g),upper(hlz.all_genes));
ind2_all=ind2_all(ind2_all>0);

% keep genes that are highly expressed in LCM and exclude the genes in
% pc_lm / pp_lm, which were used to build the reconstruction
indin2=find(max(hlz.mean(ind2_all,:),[],2)>MAX_EXP_LCM);
g=hlz.all_genes(ind2_all(indin2));
g=setdiff(g,union(pc_lm,pp_lm));

% row of every remaining gene in the reconstruction (ind1_all) and in LCM (ind2_all)
[~,ind1_all]=ismember(upper(g),upper(zone_struct_grun.gene_name));
[~,ind2_all]=ismember(upper(g),upper(hlz.all_genes));
ind2_all=ind2_all(:);
ind1_all=ind1_all(:);

% Center of mass of every LCM profile: zone-index-weighted mean of each row.
% Iterate over the rows explicitly; length() would return the larger of the
% two dimensions of hlz.mean.
hlz.com=NaN(size(hlz.mean,1),1);
for i=1:size(hlz.mean,1)
    hlz.com(i)=sum((1:size(hlz.mean,2)).*hlz.mean(i,:))/sum(hlz.mean(i,:));
end

% column 1: reconstruction center of mass, column 2: LCM center of mass
vec=[zone_struct_grun.com(ind1_all) hlz.com(ind2_all)];
figure('units','centimeters','Position',[60.7219    9.7896  7 8]);
subplot 211
hold on
scatter(vec(:,2),vec(:,1),'.');
l=lsline;
l.Color='k';

[rr,pp]=corr(vec(:,1),vec(:,2),'type','spearman');
xlabel('LCM center of mass');
ylabel('SC reconstruction center of mass');
title(['R_{Spearman}=' num2str(rr) ', p=' num2str(pp)]);
box on

% Keep the random generator state in s so the permutations below can be
% reproduced later with rng(s). (The original line lacked the comment marker
% and executed "save random generator" as a command.)
s = rng; % save random generator
rng(s);

% null distribution: Spearman correlation after shuffling the LCM values
ITER=10000;
Rrand=zeros(ITER,1);
for i=1:ITER
    vecr=vec(:,2);
    vecr=vecr(randperm(length(vecr)));
    Rrand(i)=corr(vec(:,1),vecr,'type','spearman');
end
subplot 212
hist(Rrand,50);
xlim([(-max(xlim)) max(xlim)]);     % make the x axis symmetric around zero

% z-score of the observed correlation against the permutations; the upper
% tail is computed directly because 1-normcdf(zz) rounds to 0 for large zz
zz=(rr-mean(Rrand))/std(Rrand);
p=normcdf(zz,'upper');
line([rr rr],ylim,'color','k','linewidth',2);
xlabel('randomized correlation');
ylabel('Number of permutations');
title(['R_{Spearman}=' num2str(rr,'%.2f') ', p=' num2str(p)]);
set(gca,'FontSize',8);

%% Validation - compare expression of reconstruction with LCMseq
g={'SLCO1B3','CYP2C8','ADH1B','APOA5','UGT2B10',...
    'GC','SCD','CYP2E1','ADH4','THOC5','ALDH1B1','ALDOB','PCK1','FBP1','ASS1','ENO3','GLS2','SDS'};

% Look the candidate genes up (case-insensitive) in the reconstruction
% (ind1_all) and in the LCM data (ind2_all). No axes are created here: the
% original lookup loop called subplot on the current figure, which replaces
% the panels of whichever figure was drawn last.
[in_sc,ind1_all]=ismember(upper(g),upper(zone_struct.gene_name));
[in_lcm,ind2_all]=ismember(upper(g),upper(hlz.all_genes));
found=in_sc & in_lcm;
if ~all(found)
    % skip genes that cannot be compared instead of aborting the script
    warning('hepatocytesReconstruction:missingGenes',...
        'Skipping genes missing from the reconstruction or the LCM data: %s',...
        strjoin(g(~found),', '));
end
ind1_all=ind1_all(found);
ind2_all=ind2_all(found);

% keep only genes that are highly expressed in LCM
indin=find(max(hlz.mean(ind2_all,:),[],2)>5*10^-4);
g=hlz.all_genes(ind2_all(indin));
rows_sc=ind1_all(indin);
rows_lcm=ind2_all(indin);

% 3x4 grid as before; add rows only if more than 12 genes pass the filter
nCols=4;
nRows=max(3,ceil(numel(g)/nCols));

figure('units','centimeters','Position',[60.7219    9.7896   11 8.5]);
for i=1:length(g)
    subplot(nRows,nCols,i)
    hold on;
    ind=rows_sc(i);
    ind2=rows_lcm(i);

    % reconstruction (blue patch) and LCM (dashed black), each scaled to its maximum
    peak_sc=max(zone_struct.Z(ind,:),[],2);
    plot_patch(linspace(0,1,size(zone_struct.Z,2)),zone_struct.Z(ind,:)./peak_sc,...
        zone_struct.ZSE(ind,:)./peak_sc,'b');
    plot(linspace(0,1,size(hlz.mean,2)),hlz.mean(ind2,:)./max(hlz.mean(ind2,:)),'--k');

    title(g{i},'fontsize',8.5,'FontWeight','normal');
    ylim([0 max(ylim)]);
    xlim([0 1]);
    % NUM_ZONES is the zone count set in the reconstruction sections above
    set(gca,'XTick',linspace(0,max(xlim),NUM_ZONES));
    set(gca,'XTickLabel',1:NUM_ZONES,'fontsize',7);
    set(gca,'Color','none');

    % thicken the first two line objects findall returns for these axes,
    % without failing when fewer than two exist
    llist=findall(gca,'Type','Line');
    set(llist(1:min(2,numel(llist))),'LineWidth',1.5);
    box on;
    grid on;
end

%% output GSEA for grun zonation
% Use the output file to run the GSEA analysis. Each line holds an upper-cased
% gene name and its center of mass, tab-separated, ordered by increasing
% center of mass; only genes with qval<0.25 are written.
indin=find(zone_struct_grun.qval<0.25);
[~,ord]=sort(zone_struct_grun.com(indin));

rnkFile=fullfile(outputDir,'human_hepatocyte_zonation_grun.rnk');
[fid,errmsg]=fopen(rnkFile,'w');
if fid==-1
    error('hepatocytesReconstruction:cannotOpenFile',...
        'Could not open %s for writing: %s',rnkFile,errmsg);
end
try
    for i=1:length(ord)
        fprintf(fid,'%s\t%f\n',upper(zone_struct_grun.gene_name{indin(ord(i))}),zone_struct_grun.com(indin(ord(i))));
    end
catch writeErr
    fclose(fid);
    rethrow(writeErr);
end
% close only this file; "fclose all" would also close unrelated open files
fclose(fid);

