cd /mnt/d/Brain_region_project/Data_for_cNMF
conda activate cnmf_env
python

import os
import pandas as pd
import numpy as np
from scipy.io import mmread
import scipy.sparse as sp
import matplotlib.pyplot as plt
from IPython.display import Image
import scanpy as sc

np.random.seed(14)

adata = sc.read_h5ad("Brain_region_200.h5ad")
sc.pp.filter_genes(adata, min_cells=3)

#the following line may only be necessary when creating h5ad file with SeuratDisk
adata.__dict__['_raw'].__dict__['_var'] = adata.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
count_adat_fn = 'Brain_region_200_counts.h5ad'
sc.write(count_adat_fn, adata)

#######################################################################################

conda activate cnmf_env
cd /mnt/d/Brain_region_project/Data_for_cNMF

#With just 10 iterations the code below finishes within minutes
python ./cnmf.py prepare --output-dir ./cNMF --name Brain_region_200_2 -c ./Brain_region_200_counts.h5ad -k 3 4 5 6 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 --n-iter 200 --total-workers 1 --seed 14 --numgenes 2000

python ./cnmf.py factorize --output-dir ./cNMF --name Brain_region_200_2 --worker-index 0

python ./cnmf.py combine --output-dir ./cNMF --name Brain_region_200_2

python ./cnmf.py k_selection_plot --output-dir ./cNMF --name Brain_region_200_2

python ./cnmf.py consensus --output-dir ./cNMF --name Brain_region_200_2 --components 26 --local-density-threshold 2 --show-clustering

python ./cnmf.py consensus --output-dir ./cNMF --name Brain_region_200_2 --components 26 --local-density-threshold 0.2 --show-clustering

#######################################################################################

cd /mnt/d/Brain_region_project/Data_for_cNMF
conda activate cnmf_env
python
import os
import pandas as pd
import numpy as np
from scipy.io import mmread
import scipy.sparse as sp
import matplotlib.pyplot as plt
from IPython.display import Image
import scanpy as sc

usage = pd.read_csv('./cNMF/Brain_region_200_2/Brain_region_200_2.usages.k_26.dt_0_2.consensus.txt',sep='\t', index_col=0)
usage.columns = ['Usage_%s' % i for i in usage.columns]
usage.head()

usage_norm = usage.div(usage.sum(axis=1), axis=0)
usage_norm.head()

usage_norm.to_csv('Brain_region_200_2_usage_norm.csv')

## Load the Z-scored GEPs which reflect how enriched a gene is in each GEP relative to all of the others
gene_scores = pd.read_csv('./cNMF/Brain_region_200_2/Brain_region_200_2.gene_spectra_score.k_26.dt_0_2.txt',sep='\t', index_col=0).T
gene_scores.head()


top_genes = []
ngenes = 200
for gep in gene_scores.columns:
    top_genes.append(list(gene_scores.sort_values(by=gep, ascending=False).index[:ngenes]))

top_genes = pd.DataFrame(top_genes, index=gene_scores.columns).T
top_genes

gene_scores.to_csv('Brain_region_200_2_gene_scores.csv')
top_genes.to_csv('Brain_region_200_2_top_genes.csv')
