In [1]:
import numpy as np
import pandas as pd
import os
import itertools
from bib import *

In [2]:
# Sample column names, grouped by cohort prefix ("ill_all" / "health") and by
# the sample-name suffix ("_c" vs "_1_25D3"; presumably control vs
# 1,25D3-treated -- TODO confirm).
ill_all_c = [
    "L_1_1_c",
    "B_2_2_c",
    "M_2_1_c",
    "U_7_1_c",
    "M_8_1_c",
]
ill_all_25D3 = [
    "L_1_1_1_25D3",
    "B_2_2_1_25D3",
    "M_2_1_1_25D3",
    "U_7_1_1_25D3",
    "M_8_1_1_25D3",
]
# NOTE(review): the last entry of each health list carries the *other* suffix
# ("A_3_1_1_25D3" is listed under _c, "A_3_1_c" under _25D3). This looks crossed
# relative to the naming scheme -- confirm whether it is intentional.
healthSamples_c = [
    "A_3_2_c",
    "A_4_2_c",
    "O_5_2_c",
    "U_6_2_c",
    "A_3_1_1_25D3",
]
healthSamples_25D3 = [
    "A_3_2_1_25D3",
    "A_4_2_1_25D3",
    "O_5_2_1_25D3",
    "U_6_2_1_25D3",
    "A_3_1_c",
]
# Group name -> list of sample columns; insertion order is relied on when the
# per-set comparisons are enumerated.
dictWithSplit = dict(
    health_c=healthSamples_c,
    health_25D3=healthSamples_25D3,
    ill_all_c=ill_all_c,
    ill_all_25D3=ill_all_25D3,
)
# Load the working table through the project helper pulled in by
# `from bib import *` (presumably one FPKM column per sample name listed
# above -- TODO confirm against bib.readFileWithFPKM).
df=readFileWithFPKM()

In [3]:
def normalizeZeros(df : pd.DataFrame, columnNames : list[str], eps : float = 0.0001):
    """Replace exact zeros in the given columns with a small positive value.

    The frame is modified in place; nothing is returned. Values other than
    exact zero are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose columns are patched in place.
    columnNames : list[str]
        Names of the columns to process. A name listed more than once is
        harmless: after the first pass the column no longer contains zeros.
    eps : float, default 0.0001
        Value written in place of every zero.
    """
    for n in columnNames:
        mask = df[n] == 0
        if not mask.any():
            continue
        # An integer column cannot hold `eps`; cast explicitly instead of
        # relying on implicit upcasting during assignment, which pandas
        # deprecated in 2.1.
        if pd.api.types.is_integer_dtype(df[n]):
            df[n] = df[n].astype(float)
        df.loc[mask, n] = eps

In [4]:
# Patch zeros in every sample column (in place) before any ratio is taken:
# the comparisons below divide one sample column by another.
normalizeZeros(
    df,
    [*ill_all_25D3, *ill_all_c, *healthSamples_25D3, *healthSamples_c],
)

In [5]:
# Root directory under which this notebook writes all of its result files
# ("sciezkaDoWynikow" is Polish for "path to results").
sciezkaDoWynikow="../Wyniki/T4-0001"

In [6]:
# Create the output tree: for both comparison modes, one folder for the full
# table ("whole") and one each for the "le" / "gt" filtered tables.
for scope, part in itertools.product(("perSample", "perSet"), ("whole", "le", "gt")):
    os.makedirs(os.path.join(sciezkaDoWynikow, f"{scope}/{part}"), exist_ok=True)

In [7]:
def compareCollections(df : pd.DataFrame, L1, L2=None):
    """Encode the per-row trend between pairs of sample columns.

    For every compared pair (n1, n2) the log2 ratio df[n1] / df[n2] is
    computed and encoded as 1 where it is >= 1, -1 where it is <= -1 and
    0 everywhere else (NaN included).

    Parameters
    ----------
    df : pd.DataFrame
        Table holding the sample columns.
    L1
        When L2 is None: an iterable of (n1, n2) column-name pairs.
        Otherwise: the column names used as numerators.
    L2 : optional
        Column names used as denominators; every combination from
        L1 x L2 is compared.

    Returns
    -------
    pd.DataFrame
        The "<n1>/<n2>" trend columns, then the "<n1>/<n2>_log" raw log2
        ratios, then "diffSum" (row-wise sum of the trend columns), then
        whatever getGeneDataColumns(df) returns, joined on the index.
    """
    pairs = L1 if L2 is None else itertools.product(L1, L2)
    trendByPair = {}
    logByPair = {}
    for first, second in pairs:
        label = first + "/" + second
        logRatio = np.log2(df[first] / df[second])
        # Stored as a plain array (no index), exactly like the trend-free copy
        # the table is later built from.
        logByPair[label + "_log"] = logRatio.to_numpy(copy=True)
        isUp = logRatio >= 1
        isDown = logRatio <= -1
        # True/False cast to the ratio's float dtype: up -> 1, down -> -1,
        # neither (or NaN) -> 0.
        trendByPair[label] = isUp.astype(logRatio.dtype) - isDown.astype(logRatio.dtype)
    trends = pd.DataFrame(trendByPair)
    logsAndSum = pd.DataFrame({**logByPair, "diffSum": trends.sum(axis=1)})
    geneColumns = getGeneDataColumns(df)
    return trends.join(logsAndSum).join(geneColumns)

In [8]:
def makeCompAndSaveRes(df : pd.DataFrame, L1, name, pref=os.path.join(sciezkaDoWynikow,"perSample"), L2=None):
    """Run compareCollections and write the full and filtered tables as TSV.

    Three tab-separated files named "genComparison-<name>.csv" are written
    below `pref`:
      * whole/ - the complete comparison table,
      * gt/    - rows whose "diffSum" equals the number of comparisons
                 (every comparison encoded as 1),
      * le/    - rows whose "diffSum" equals minus that number
                 (every comparison encoded as -1).

    Parameters
    ----------
    df : pd.DataFrame
        Table passed through to compareCollections.
    L1, L2
        Passed through to compareCollections. The number of comparisons is
        len(L1) when L2 is None, otherwise len(L1) * len(L2).
    name
        Label embedded in the output file names.
    pref
        Directory containing the whole/, gt/ and le/ sub-folders. The default
        is computed once, when this function is defined.
    """
    table = compareCollections(df, L1, L2=L2)
    nComparisons = len(L1) if L2 is None else len(L1) * len(L2)

    table.to_csv(os.path.join(pref, f"whole/genComparison-{name}.csv"), sep="\t")

    allUp = table[table["diffSum"] == nComparisons]
    allUp.to_csv(os.path.join(pref, f"gt/genComparison-{name}.csv"), sep="\t")

    allDown = table[table["diffSum"] == -nComparisons]
    allDown.to_csv(os.path.join(pref, f"le/genComparison-{name}.csv"), sep="\t")

In [9]:
# Per-sample comparisons as (label, first sample list, second sample list).
# The consuming loop pairs the two lists positionally with zip, so the order
# of the entries inside each list matters.
listOfCompToMakePerSample = [
    (f"{first}-{second}", dictWithSplit[first], dictWithSplit[second])
    for first, second in (
        ("health_c", "health_25D3"),
        ("ill_all_c", "ill_all_25D3"),
    )
]

In [10]:
# Per-sample mode: the i-th sample of L1 is compared only with the i-th
# sample of L2.
for n, L1, L2 in listOfCompToMakePerSample:
    # zip() stops at the shorter list without complaint, which would silently
    # drop samples from the comparison; fail loudly instead.
    if len(L1) != len(L2):
        raise ValueError(
            f"comparison {n!r}: sample lists differ in length "
            f"({len(L1)} vs {len(L2)}), cannot pair them"
        )
    to_compare=list(zip(L1,L2))
    makeCompAndSaveRes(df, to_compare, n)

In [11]:
# Per-set comparisons: every ordered pair of groups from dictWithSplit,
# as (label, first sample list, second sample list).
# NOTE(review): this includes each group against itself and both orderings
# of every pair -- confirm that is wanted.
listOfCompToMake = [
    (first + "-" + second, dictWithSplit[first], dictWithSplit[second])
    for first, second in itertools.product(dictWithSplit, repeat=2)
]

In [12]:
# Per-set mode: every sample of the first group against every sample of the
# second one; results go under <results>/perSet.
for label, firstSet, secondSet in listOfCompToMake:
    makeCompAndSaveRes(
        df,
        firstSet,
        label,
        pref=os.path.join(sciezkaDoWynikow, "perSet"),
        L2=secondSet,
    )

=====================================

# Common parts

In [13]:
# Output folders for the pairwise intersections. exist_ok=True keeps this cell
# re-runnable (without it a second run raises FileExistsError) and matches
# how the other output folders are created above.
os.makedirs(os.path.join(sciezkaDoWynikow,"perSample/intersectSameTrend"), exist_ok=True)
os.makedirs(os.path.join(sciezkaDoWynikow,"perSample/intersectDiffTrend"), exist_ok=True)

In [14]:
def intersectSameTrend(dirPath, pref):
    """Write the row intersection of every unordered pair of tables in a folder.

    Each regular file in `dirPath` whose name ends with ".csv" is read as a
    tab-separated table indexed by its first column. For every pair of
    distinct files (taken once, in file-name order) the two tables are
    inner-joined on the index; columns of the second table whose names clash
    with the first get the suffix "_r". The result is written to
    <sciezkaDoWynikow>/perSample/intersectSameTrend/ as
    "<pref>_<file1>_vs_<file2>.csv" (tab-separated).

    Parameters
    ----------
    dirPath
        Folder holding the input tables.
    pref
        Prefix put in front of every output file name.
    """
    # Only real CSV files: other directory entries (for example a
    # ".ipynb_checkpoints" folder) would make read_csv fail.
    fileNames = sorted(
        entry for entry in os.listdir(dirPath)
        if entry.endswith(".csv") and os.path.isfile(os.path.join(dirPath, entry))
    )
    # Load every table once instead of re-reading it for each pair.
    frames = {
        fileName: pd.read_csv(os.path.join(dirPath, fileName), index_col=0, sep="\t")
        for fileName in fileNames
    }
    # combinations() over the sorted names yields each pair once with
    # fileName < secondFileName.
    for fileName, secondFileName in itertools.combinations(fileNames, 2):
        inter = frames[fileName].join(frames[secondFileName], how="inner", rsuffix="_r")
        name = pref+"_"+fileName+"_vs_"+secondFileName
        inter.to_csv(os.path.join(sciezkaDoWynikow,f"perSample/intersectSameTrend/{name}.csv"), sep="\t")

In [15]:
# Intersect the per-sample result tables within each trend folder.
for trend in ("gt", "le"):
    intersectSameTrend(os.path.join(sciezkaDoWynikow, f"perSample/{trend}/"), pref=trend)

In [16]:
def intersectDifferentTrend(dirPath1, dirPath2, pref1, pref2):
    """Write the row intersection of every table in one folder with every table in another.

    Regular files ending with ".csv" are read from both folders as
    tab-separated tables indexed by their first column. Each table from
    `dirPath1` is inner-joined on the index with each table from `dirPath2`;
    columns of the second table whose names clash with the first get the
    suffix "_r". Results are written to
    <sciezkaDoWynikow>/perSample/intersectDiffTrend/ as
    "<pref1>_<file1>_vs_<pref2>_<file2>.csv" (tab-separated).

    Parameters
    ----------
    dirPath1, dirPath2
        Folders holding the first and second set of input tables.
    pref1, pref2
        Labels placed before the respective file names in the output name.
    """
    def csvNames(dirPath):
        # Only real CSV files: other directory entries (for example a
        # ".ipynb_checkpoints" folder) would make read_csv fail.
        return sorted(
            entry for entry in os.listdir(dirPath)
            if entry.endswith(".csv") and os.path.isfile(os.path.join(dirPath, entry))
        )

    # Load the second folder once instead of re-reading it for every first file.
    secondFrames = {
        secondFileName: pd.read_csv(os.path.join(dirPath2, secondFileName), index_col=0, sep="\t")
        for secondFileName in csvNames(dirPath2)
    }
    for fileName in csvNames(dirPath1):
        df1 = pd.read_csv(os.path.join(dirPath1, fileName), index_col=0, sep="\t")
        for secondFileName, df2 in secondFrames.items():
            inter = df1.join(df2, how="inner", rsuffix="_r")
            name = pref1+"_"+fileName+"_vs_"+pref2+"_"+secondFileName
            inter.to_csv(os.path.join(sciezkaDoWynikow,f"perSample/intersectDiffTrend/{name}.csv"), sep="\t")

In [17]:
# Cross the "le" tables with the "gt" tables of the per-sample results.
intersectDifferentTrend(
    dirPath1=os.path.join(sciezkaDoWynikow, "perSample/le/"),
    dirPath2=os.path.join(sciezkaDoWynikow, "perSample/gt/"),
    pref1="le",
    pref2="gt",
)