In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import NMF
from numpy import asarray
from numpy import savetxt
import umap
from tqdm import tqdm
from scipy import stats
from matplotlib.colors import ListedColormap
import matplotlib as mpl
import os

In [2]:
# Make the Data directory the working directory. The path is relative, so it is
# resolved against whatever directory the kernel is currently in.
os.chdir("../Data")

### Read in original data

In [3]:
# Location of the original input files, relative to the current working directory.
# Paths are joined explicitly instead of chdir-ing into the folder and back out:
# if any read below fails, the kernel's working directory is left untouched and
# the cell can simply be re-run.
original_dir = "Original data"

# ---------- TCGA ----------
# Read in meta data
meta_data = pd.read_csv(os.path.join(original_dir, "tcga.sc.rna.count.meta.tsv"), sep="\t", index_col=0, header=0)
print("TCGA meta_data shape: ", meta_data.shape)

# Read in clinical data from TCGA datasets (second column of each file becomes the index)
lusc_meta_fh = pd.read_csv(os.path.join(original_dir, "lusc_tcga_clinical_data.tsv"), sep="\t", index_col=1, header=0)
hnsc_meta_fh = pd.read_csv(os.path.join(original_dir, "hnsc_tcga_clinical_data.tsv"), sep="\t", index_col=1, header=0)
cesc_meta_fh = pd.read_csv(os.path.join(original_dir, "cesc_tcga_clinical_data.tsv"), sep="\t", index_col=1, header=0)
lusc_meta_pc = pd.read_csv(os.path.join(original_dir, "lusc_tcga_pan_can_atlas_2018_clinical_data.tsv"), sep="\t", index_col=1, header=0)
hnsc_meta_pc = pd.read_csv(os.path.join(original_dir, "hnsc_tcga_pan_can_atlas_2018_clinical_data.tsv"), sep="\t", index_col=1, header=0)
cesc_meta_pc = pd.read_csv(os.path.join(original_dir, "cesc_tcga_pan_can_atlas_2018_clinical_data.tsv"), sep="\t", index_col=1, header=0)

# Read in raw counts matrix
counts_pd = pd.read_csv(os.path.join(original_dir, "tcga.sc.counts.matrix.csv"), sep=",", index_col=0, header=0)
print("TCGA counts_pd shape: ", counts_pd.shape)

# Read in gene conversion from ENSG to NCBI gene symbol
gene_conversion = pd.read_csv(os.path.join(original_dir, "Gene_conversion.csv"), sep=",", index_col=0, header=0)
print("TCGA gene_conversion shape: ", gene_conversion.shape)

# Exchange file id for pt id.
# The relabelling is positional: the i-th count column receives the i-th
# 'case_submitter_id' of meta_data.
# NOTE(review): assumes meta_data rows are in the same order as the counts_pd columns - confirm.
counts_pd = pd.DataFrame(counts_pd.values, columns = meta_data['case_submitter_id'].values, index = counts_pd.index)

TCGA meta_data shape:  (1306, 14)
TCGA counts_pd shape:  (60483, 1306)
TCGA gene_conversion shape:  (60676, 6)


In [4]:
# Combine clinical data from LUSC, HNSC, and CESC in one concat.
# Order matters: for each cohort the *_fh table comes before the *_pc table, so
# the keep-first de-duplication below prefers the *_fh row when a patient is in both.
primary_meta = pd.concat(
    [lusc_meta_fh, lusc_meta_pc, hnsc_meta_fh, hnsc_meta_pc, cesc_meta_fh, cesc_meta_pc],
    axis=0,
)
print('Initial shape: ', primary_meta.shape)

# Remove duplicates: keep the first row seen for every patient id (the index).
# Filtering on the index directly avoids inserting a temporary 'Patient' column
# into this very wide frame, which is what triggered the warning shown previously.
primary_meta = primary_meta.loc[~primary_meta.index.duplicated(keep='first')]
print('Duplicates removed shape: ', primary_meta.shape)

# Remove adenosquamous samples (the filter is applied to every cohort in the table;
# rows with a missing histologic type are kept)
primary_meta = primary_meta.loc[primary_meta['Neoplasm Histologic Type Name'] != 'Adenosquamous']
print('Adeno removed shape: ', primary_meta.shape)

# Write compiled clinical data
primary_meta.to_csv('TCGA_clinical_data.csv', sep=",")

Initial shape:  (2658, 213)
Duplicates removed shape:  (1347, 213)
Adeno removed shape:  (1341, 213)


  primary_meta['Patient'] = primary_meta.index


In [5]:
# Columns (patients) of the counts matrix that also have clinical meta data
has_meta = counts_pd.columns.isin(primary_meta.index)

# List of patients with counts and meta data (one entry per count column, in column order)
patient_list = list(counts_pd.columns[has_meta])

# Remove duplicates: when a patient id labels several columns keep only the first one.
# Done with a boolean column mask rather than transposing the gene x patient matrix
# twice and adding a helper column; the selected columns and their order are the same.
first_occurrence = ~counts_pd.columns.duplicated(keep='first')

# Include only patients with meta data, one column per patient
counts_pd = counts_pd.loc[:, has_meta & first_occurrence]
print('New counts_pd shape: ', counts_pd.shape)

# Write counts matrix
counts_pd.to_csv('raw_TCGA_counts.csv', sep=',')

New counts_pd shape:  (60483, 1300)


In [6]:
# Patients whose compiled clinical row is tagged with the 'hnsc_tcga' study.
# NOTE(review): only the exact id 'hnsc_tcga' is matched; rows carrying any other
# HNSC study id would not be selected - confirm this is intended.
is_hnsc = primary_meta['Study ID'] == 'hnsc_tcga'
hnsc_pt = primary_meta.index[is_hnsc.to_numpy()]

# Keep the HNSC patients that are present in the counts matrix, in clinical-table order
hnsc_counts_pt = [pt for pt in hnsc_pt if pt in counts_pd.columns]

# Subset the counts to those patients and write them out
hnsc_counts = counts_pd[hnsc_counts_pt]
hnsc_counts.to_csv('raw_TCGA_HNSC_counts.csv', sep=',')

In [7]:
# ---------- CPTAC ----------
# All three inputs are tab-separated, have a header row and use their first column as index.
# NOTE: from here on `counts_pd` holds the CPTAC counts, replacing the TCGA matrix.
_tsv_options = dict(sep="\t", index_col=0, header=0)

# Counts matrix
counts_pd = pd.read_csv('gdc_rnacount.tsv', **_tsv_options)
print("CPTAC counts_pd: ", counts_pd.shape)

# Meta data
meta_pd = pd.read_csv('gdc_metadata.tsv', **_tsv_options)
print("CPTAC meta_pd: ", meta_pd.shape)

# Clinical data
clinical_pd = pd.read_csv('gdc_clinical.tsv', **_tsv_options)
print("CPTAC clinical_pd: ", clinical_pd.shape)

CPTAC counts_pd:  (60660, 3023)
CPTAC meta_pd:  (5644, 7)
CPTAC clinical_pd:  (3770, 157)


In [8]:
# Reassign sample labels based on patient id rather than data id.
# The five lists below are parallel: position i describes one count column (file).
file_list = []
project_list = []
pt_list = []
primary_list = []
type_list = []

# Finding the samples and their relevant info.
# `count` is the number of count columns skipped because their case has no clinical row.
count = 0
for file_id in counts_pd.columns:
    file_meta = meta_pd.loc[file_id]
    proj = file_meta['Project ID']
    pt = file_meta['Case ID']

    # All clinical rows recorded for this case
    case_rows = clinical_pd.loc[clinical_pd['case_submitter_id'] == pt]

    # Explicit check instead of a bare `except:` - only the "no clinical record"
    # situation is skipped; any other error now surfaces instead of being swallowed.
    if case_rows.empty:
        count += 1
        continue

    # Take the first distinct value in row order. Going through set() picked an
    # arbitrary element when a case had several distinct values, which could
    # differ from one kernel session to the next.
    primary = case_rows['tissue_or_organ_of_origin'].unique()[0]
    type_l = case_rows['primary_diagnosis'].unique()[0]

    primary_list.append(primary)
    type_list.append(type_l)
    project_list.append(proj)
    pt_list.append(pt)
    file_list.append(file_id)

print('Files skipped (no clinical record): ', count)

In [9]:
# Ids (meta_pd index) of the samples annotated as primary tumor
primary_tumor = meta_pd.index[(meta_pd['Sample Type'] == 'Primary Tumor').to_numpy()]

# Walk the parallel per-file lists together and keep the records whose file is a
# primary tumor sample; order is preserved.
tumor_records = [
    record
    for record in zip(file_list, pt_list, project_list, primary_list, type_list)
    if record[0] in primary_tumor
]

# Unpack the kept records back into parallel lists
file_list_tumor = [record[0] for record in tumor_records]
pt_list_tumor = [record[1] for record in tumor_records]
project_list_tumor = [record[2] for record in tumor_records]
primary_list_tumor = [record[3] for record in tumor_records]
type_list_tumor = [record[4] for record in tumor_records]

In [10]:
# Separate out SCC samples
# One row per primary-tumor file, indexed by patient id (a patient can appear more than once)
df = pd.DataFrame(
    {
        "Project": project_list_tumor,
        "Primary": primary_list_tumor,
        "Type": type_list_tumor,
    },
    index=pt_list_tumor,
)

# Rows diagnosed as squamous cell carcinoma, shared by both project selections
is_scc = df['Type'] == 'Squamous cell carcinoma, NOS'

# SCC rows of each CPTAC project, with the primary sites / diagnoses they contain
CPTAC_2 = df.loc[(df['Project'] == 'CPTAC-2') & is_scc]
print('Primary: ', set(CPTAC_2['Primary']))
print('Type: ', set(CPTAC_2['Type']))

CPTAC_3 = df.loc[(df['Project'] == 'CPTAC-3') & is_scc]
print('Primary: ', set(CPTAC_3['Primary']))
print('Type: ', set(CPTAC_3['Type']))

Primary:  {'Breast, NOS'}
Type:  {'Squamous cell carcinoma, NOS'}
Primary:  {'Base of tongue, NOS', 'Larynx, NOS', 'Tongue, NOS', 'Oropharynx, NOS', 'Lung, NOS', 'Gum, NOS', 'Head, face or neck, NOS', 'Lower lobe, lung', 'Upper lobe, lung', 'Tonsil, NOS', 'Lip, NOS', 'Floor of mouth, NOS', 'Cheek mucosa', 'Overlapping lesion of lip, oral cavity and pharynx', 'Middle lobe, lung'}
Type:  {'Squamous cell carcinoma, NOS'}


In [11]:
# Subset counts to only include tumor samples and use patient ids to identify the column
subcounts_pd = pd.DataFrame(counts_pd[file_list_tumor].values, columns = pt_list_tumor, index = counts_pd.index)

# Use SCC patient list for subsetting counts.
# Patient ids can repeat both here and in the column labels, so this selection can
# contain repeated columns; they are collapsed below.
val_patient_list = CPTAC_3.index
val_counts_pd = subcounts_pd[val_patient_list]
print(val_counts_pd.shape)

# Remove . in gene names (keep only the part of the id before the first '.')
gene_list = [gene.split(".")[0] for gene in val_counts_pd.index]
val_counts_pd = pd.DataFrame(val_counts_pd.values, index = gene_list, columns = val_counts_pd.columns)

# Remove duplicated genes: ids that collide after stripping keep their first row
val_counts_pd = val_counts_pd.loc[~val_counts_pd.index.duplicated(keep='first')]

# Remove duplicates: keep the first column of every patient.
# Boolean masks on the index / columns replace the helper-column + double-transpose
# approach; the surviving rows and columns, and their order, are the same.
val_counts_pd = val_counts_pd.loc[:, ~val_counts_pd.columns.duplicated(keep='first')]
print('New counts_pd shape: ', val_counts_pd.shape)

# Write counts matrix
val_counts_pd.to_csv('raw_CPTAC_counts.csv', sep=',')

(60660, 177)
New counts_pd shape:  (60616, 174)


In [12]:
# CPTAC-3 SCC rows whose primary site is not one of the lung sites
lung_sites = ['Lower lobe, lung', 'Upper lobe, lung', 'Middle lobe, lung', 'Lung, NOS']
sum_3 = CPTAC_3.loc[~CPTAC_3['Primary'].isin(lung_sites)]

# CPTAC_3 has one row per tumor file, so a patient with several files appears
# several times in sum_3.index. Selecting columns with repeated labels would write
# that patient's column more than once, so each patient id is used only once here.
hnsc_patients = sum_3.index[~sum_3.index.duplicated(keep='first')]

val_hnsc_counts = val_counts_pd[hnsc_patients]
val_hnsc_counts.to_csv('raw_CPTAC_HNSC_counts.csv', sep=',')

### -------------------------- STOP ---------------------------------
### Run R code for normalization and CoGAPS
### ---------------------------------------------------------------------