{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
 "from glob import glob\n",
 "import pandas as pd\n",
 "import re\n",
 "import matplotlib as mpl\n",
 "mpl.rcParams['pdf.fonttype'] = 42\n",
 "mpl.rcParams['ps.fonttype'] = 42"
 ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
 "# Names given to the six leading annotation columns of every count table.\n",
 "_ANNOTATION_COLS = ['Geneid', 'Chr', 'Start', 'End', 'Strand', 'Length']\n",
 "\n",
 "# Text between 'star_Gencode.v39/' and the last following '/'.\n",
 "# '[.]' pins the literal dot; the bare '.' used before matched any character.\n",
 "_SAMPLE_DIR = re.compile('(?<=star_Gencode[.]v39/).+(?=/)')\n",
 "\n",
 "\n",
 "def re_col_names(RNA_set):\n",
 "    '''Shorten the count-column names of a count table to sample names.\n",
 "\n",
 "    The first six columns are named Geneid, Chr, Start, End, Strand, Length.\n",
 "    Every later column name is replaced by the text found between\n",
 "    'star_Gencode.v39/' and the last '/' after it. A name that does not\n",
 "    contain that pattern is kept as it is, so calling this again on an\n",
 "    already renamed table changes nothing (it used to fail with a column\n",
 "    length mismatch because unmatched names were silently dropped).\n",
 "\n",
 "    RNA_set is modified in place and is also returned.\n",
 "    '''\n",
 "    sample_names = []\n",
 "    for col in RNA_set.columns.tolist()[len(_ANNOTATION_COLS):]:\n",
 "        hit = _SAMPLE_DIR.search(str(col))\n",
 "        sample_names.append(hit.group(0) if hit else col)\n",
 "    RNA_set.columns = _ANNOTATION_COLS + sample_names\n",
 "    return RNA_set\n",
 "\n",
 "\n",
 "def get_patient_code(RNA_set):\n",
 "    '''Return the sample column names: every column after the first six.'''\n",
 "    return RNA_set.columns.tolist()[len(_ANNOTATION_COLS):]"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Helsinki" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Import data \n", "- All expression data were annotated with gencode.v39.primary_assembly.annotation.gtf as the reference" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GeneidChrStartEndStrandLengthFM1338FM2086FM2440FM2532...FM3690_2FM3764FM3796FM3799FM3800FM3814FM3835HL_VL_AMP08JE_VL_AMP05KO_AMP09
0ENSG00000223972.5chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr111869;12010;12179;12613;12613;12975;13221;1322...12227;12057;12227;12721;12697;13052;14409;1337...+;+;+;+;+;+;+;+;+1735.02.02.01.01.0...0.01.08.01.00.01.00.00.00.00.0
1ENSG00000227232.5chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;c...14404;15005;15796;16607;16858;17233;17606;1791...14501;15038;15947;16765;17055;17368;17742;1806...-;-;-;-;-;-;-;-;-;-;-1351.0412.0370.0417.0711.0...207.0281.0454.0263.0449.0207.0695.0572.0881.0708.0
2ENSG00000278267.1chr11736917436-68.036.015.041.071.0...19.015.055.06.052.025.053.040.056.034.0
3ENSG00000243485.5chr1;chr1;chr1;chr1;chr129554;30267;30564;30976;3097630039;30667;30667;31109;31097+;+;+;+;+1021.03.00.00.01.0...0.00.01.03.01.00.01.00.00.00.0
4ENSG00000284332.1chr13036630503+138.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
..................................................................
61584ENSG00000276017.1KI270734.17241174814+2404.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
61585ENSG00000278817.1KI270734.1;KI270734.1;KI270734.1;KI270734.1;KI...131494;131836;135443;136159;136845131755;131996;135543;136299;137392+;+;+;+;+1213.01043.01151.0282.01116.0...747.0876.0592.0479.0250.0405.0892.0617.0927.0687.0
61586ENSG00000277196.4KI270734.1;KI270734.1;KI270734.1;KI270734.1;KI...138082;138082;138743;138743;142194;142194;1436...138667;138667;138831;138831;142292;142292;1437...-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;...2405.0311.0719.049.0876.0...147.0240.0415.048.0405.0150.0305.01023.0429.02899.0
61587ENSG00000278625.1KI270744.15100951114-106.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
61588ENSG00000277374.1KI270750.1148668148843+176.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", "

61589 rows × 26 columns

\n", "
" ], "text/plain": [ " Geneid Chr \\\n", "0 ENSG00000223972.5 chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1 \n", "1 ENSG00000227232.5 chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;c... \n", "2 ENSG00000278267.1 chr1 \n", "3 ENSG00000243485.5 chr1;chr1;chr1;chr1;chr1 \n", "4 ENSG00000284332.1 chr1 \n", "... ... ... \n", "61584 ENSG00000276017.1 KI270734.1 \n", "61585 ENSG00000278817.1 KI270734.1;KI270734.1;KI270734.1;KI270734.1;KI... \n", "61586 ENSG00000277196.4 KI270734.1;KI270734.1;KI270734.1;KI270734.1;KI... \n", "61587 ENSG00000278625.1 KI270744.1 \n", "61588 ENSG00000277374.1 KI270750.1 \n", "\n", " Start \\\n", "0 11869;12010;12179;12613;12613;12975;13221;1322... \n", "1 14404;15005;15796;16607;16858;17233;17606;1791... \n", "2 17369 \n", "3 29554;30267;30564;30976;30976 \n", "4 30366 \n", "... ... \n", "61584 72411 \n", "61585 131494;131836;135443;136159;136845 \n", "61586 138082;138082;138743;138743;142194;142194;1436... \n", "61587 51009 \n", "61588 148668 \n", "\n", " End \\\n", "0 12227;12057;12227;12721;12697;13052;14409;1337... \n", "1 14501;15038;15947;16765;17055;17368;17742;1806... \n", "2 17436 \n", "3 30039;30667;30667;31109;31097 \n", "4 30503 \n", "... ... \n", "61584 74814 \n", "61585 131755;131996;135543;136299;137392 \n", "61586 138667;138667;138831;138831;142292;142292;1437... \n", "61587 51114 \n", "61588 148843 \n", "\n", " Strand Length FM1338 \\\n", "0 +;+;+;+;+;+;+;+;+ 1735.0 2.0 \n", "1 -;-;-;-;-;-;-;-;-;-;- 1351.0 412.0 \n", "2 - 68.0 36.0 \n", "3 +;+;+;+;+ 1021.0 3.0 \n", "4 + 138.0 0.0 \n", "... ... ... ... \n", "61584 + 2404.0 0.0 \n", "61585 +;+;+;+;+ 1213.0 1043.0 \n", "61586 -;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;... 2405.0 311.0 \n", "61587 - 106.0 0.0 \n", "61588 + 176.0 0.0 \n", "\n", " FM2086 FM2440 FM2532 ... FM3690_2 FM3764 FM3796 FM3799 FM3800 \\\n", "0 2.0 1.0 1.0 ... 0.0 1.0 8.0 1.0 0.0 \n", "1 370.0 417.0 711.0 ... 207.0 281.0 454.0 263.0 449.0 \n", "2 15.0 41.0 71.0 ... 
19.0 15.0 55.0 6.0 52.0 \n", "3 0.0 0.0 1.0 ... 0.0 0.0 1.0 3.0 1.0 \n", "4 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... ... ... \n", "61584 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 \n", "61585 1151.0 282.0 1116.0 ... 747.0 876.0 592.0 479.0 250.0 \n", "61586 719.0 49.0 876.0 ... 147.0 240.0 415.0 48.0 405.0 \n", "61587 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 \n", "61588 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 \n", "\n", " FM3814 FM3835 HL_VL_AMP08 JE_VL_AMP05 KO_AMP09 \n", "0 1.0 0.0 0.0 0.0 0.0 \n", "1 207.0 695.0 572.0 881.0 708.0 \n", "2 25.0 53.0 40.0 56.0 34.0 \n", "3 0.0 1.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... \n", "61584 0.0 0.0 0.0 0.0 0.0 \n", "61585 405.0 892.0 617.0 927.0 687.0 \n", "61586 150.0 305.0 1023.0 429.0 2899.0 \n", "61587 0.0 0.0 0.0 0.0 0.0 \n", "61588 0.0 0.0 0.0 0.0 0.0 \n", "\n", "[61589 rows x 26 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "def _load_counts(path):\n",
 "    '''Read one tab-separated count table and pass it through re_col_names.'''\n",
 "    return re_col_names(pd.read_csv(path, sep=\"\t\"))\n",
 "\n",
 "\n",
 "# One table per sequencing batch; the variable names are used by the cells below.\n",
 "Oxford0_poly = _load_counts('./Helsinki data/novogene_may2022_polyA_PE_samples_fragmentcounts_primary.txt')\n",
 "Oxford13_poly = _load_counts('./Helsinki data/oxford1_oxford3_polyA_PE_samples_fragmentcounts_primary.txt')\n",
 "Oxford4_ribo = _load_counts('./Helsinki data/oxford4_riboD_PE_samples_fragmentcounts_primary.txt')\n",
 "Oxford7_poly = _load_counts('./Helsinki data/oxford7_polyA_PE_samples_fragmentcounts_primary.txt')\n",
 "Oxford8_poly = _load_counts('./Helsinki data/oxford8_polyA_PE_samples_fragmentcounts_primary.txt')\n",
 "Oxford9_poly = _load_counts('./Helsinki data/oxford9_polyA_PE_samples_fragmentcounts_primary.txt')\n",
 "\n",
 "Oxford9_poly"
 ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GeneidChrStartEndStrandLength380444456172AS_TA_AMP01...HL_VL_AMP08IA_VL_AMP03JE_VL_AMP05JHJH_VL_AMP04KO_AMP09MR_VL_AMP07V-P_VL_AMP10VA_VL_AMP06VM_TA_AMP02
0ENSG00000223972.5chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr111869;12010;12179;12613;12613;12975;13221;1322...12227;12057;12227;12721;12697;13052;14409;1337...+;+;+;+;+;+;+;+;+17350000...0001000000
1ENSG00000227232.5chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;c...14404;15005;15796;16607;16858;17233;17606;1791...14501;15038;15947;16765;17055;17368;17742;1806...-;-;-;-;-;-;-;-;-;-;-135148793893...12468109100971306866213112
2ENSG00000278267.1chr11736917436-6811564...28111761910581216
3ENSG00000243485.5chr1;chr1;chr1;chr1;chr129554;30267;30564;30976;3097630039;30667;30667;31109;31097+;+;+;+;+10210000...0000000000
4ENSG00000284332.1chr13036630503+1380000...0000000000
..................................................................
61582ENSG00000276017.1KI270734.17241174814+24040000...0000000000
61583ENSG00000278817.1KI270734.1;KI270734.1;KI270734.1;KI270734.1;KI...131494;131836;135443;136159;136845131755;131996;135543;136299;137392+;+;+;+;+12133081684341202...2612764027681023382215404145315
61584ENSG00000277196.4KI270734.1;KI270734.1;KI270734.1;KI270734.1;KI...138082;138082;138743;138743;142194;142194;1436...138667;138667;138831;138831;142292;142292;1437...-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;...24059184471432...514187189329525981314520422584226
61585ENSG00000278625.1KI270744.15100951114-1060000...0000000000
61586ENSG00000277374.1KI270750.1148668148843+1760000...0000000000
\n", "

61587 rows × 60 columns

\n", "
" ], "text/plain": [ " Geneid Chr \\\n", "0 ENSG00000223972.5 chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1 \n", "1 ENSG00000227232.5 chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;c... \n", "2 ENSG00000278267.1 chr1 \n", "3 ENSG00000243485.5 chr1;chr1;chr1;chr1;chr1 \n", "4 ENSG00000284332.1 chr1 \n", "... ... ... \n", "61582 ENSG00000276017.1 KI270734.1 \n", "61583 ENSG00000278817.1 KI270734.1;KI270734.1;KI270734.1;KI270734.1;KI... \n", "61584 ENSG00000277196.4 KI270734.1;KI270734.1;KI270734.1;KI270734.1;KI... \n", "61585 ENSG00000278625.1 KI270744.1 \n", "61586 ENSG00000277374.1 KI270750.1 \n", "\n", " Start \\\n", "0 11869;12010;12179;12613;12613;12975;13221;1322... \n", "1 14404;15005;15796;16607;16858;17233;17606;1791... \n", "2 17369 \n", "3 29554;30267;30564;30976;30976 \n", "4 30366 \n", "... ... \n", "61582 72411 \n", "61583 131494;131836;135443;136159;136845 \n", "61584 138082;138082;138743;138743;142194;142194;1436... \n", "61585 51009 \n", "61586 148668 \n", "\n", " End \\\n", "0 12227;12057;12227;12721;12697;13052;14409;1337... \n", "1 14501;15038;15947;16765;17055;17368;17742;1806... \n", "2 17436 \n", "3 30039;30667;30667;31109;31097 \n", "4 30503 \n", "... ... \n", "61582 74814 \n", "61583 131755;131996;135543;136299;137392 \n", "61584 138667;138667;138831;138831;142292;142292;1437... \n", "61585 51114 \n", "61586 148843 \n", "\n", " Strand Length 3804 4445 \\\n", "0 +;+;+;+;+;+;+;+;+ 1735 0 0 \n", "1 -;-;-;-;-;-;-;-;-;-;- 1351 48 79 \n", "2 - 68 11 5 \n", "3 +;+;+;+;+ 1021 0 0 \n", "4 + 138 0 0 \n", "... ... ... ... ... \n", "61582 + 2404 0 0 \n", "61583 +;+;+;+;+ 1213 308 168 \n", "61584 -;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;... 2405 91 84 \n", "61585 - 106 0 0 \n", "61586 + 176 0 0 \n", "\n", " 6172 AS_TA_AMP01 ... HL_VL_AMP08 IA_VL_AMP03 JE_VL_AMP05 JH \\\n", "0 0 0 ... 0 0 0 1 \n", "1 38 93 ... 124 68 109 100 \n", "2 6 4 ... 28 11 17 6 \n", "3 0 0 ... 0 0 0 0 \n", "4 0 0 ... 0 0 0 0 \n", "... ... ... ... ... ... ... ... 
\n", "61582 0 0 ... 0 0 0 0 \n", "61583 434 1202 ... 261 276 402 768 \n", "61584 47 1432 ... 514 187 189 3295 \n", "61585 0 0 ... 0 0 0 0 \n", "61586 0 0 ... 0 0 0 0 \n", "\n", " JH_VL_AMP04 KO_AMP09 MR_VL_AMP07 V-P_VL_AMP10 VA_VL_AMP06 \\\n", "0 0 0 0 0 0 \n", "1 97 130 68 66 213 \n", "2 19 10 5 8 12 \n", "3 0 0 0 0 0 \n", "4 0 0 0 0 0 \n", "... ... ... ... ... ... \n", "61582 0 0 0 0 0 \n", "61583 1023 382 215 404 145 \n", "61584 2598 1314 520 422 584 \n", "61585 0 0 0 0 0 \n", "61586 0 0 0 0 0 \n", "\n", " VM_TA_AMP02 \n", "0 0 \n", "1 112 \n", "2 16 \n", "3 0 \n", "4 0 \n", "... ... \n", "61582 0 \n", "61583 315 \n", "61584 226 \n", "61585 0 \n", "61586 0 \n", "\n", "[61587 rows x 60 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Oxford13_poly" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Oxford0_poly_P: ['FM1812', 'FM22', 'FM2670', 'FM3310', 'FM3311', 'FM3591', 'FM3894', 'FM3901', 'FM4021', 'FM4024', 'FM4025', 'FM4028', 'FM4087', 'FM466', 'FM806', 'FM951', 'FM963', 'NEM_3613mbl', 'NEM_3613tubes', 'NEM_4743fbl', 'NEM_4743mbl', 'NEM_4743tubes', 'NEM_WTmbl', 'NEM_WTtubes']\n", "Oxford13_poly_P: ['3804', '4445', '6172', 'AS_TA_AMP01', 'FM1834', 'FM2259', 'FM2422', 'FM2428', 'FM2432', 'FM2433', 'FM2438', 'FM2511', 'FM2606', 'FM2616', 'FM2628', 'FM2633', 'FM2746', 'FM2864', 'FM2988', 'FM3017', 'FM3022', 'FM3060', 'FM3142', 'FM3206', 'FM3261', 'FM3262', 'FM3281', 'FM3288', 'FM3406', 'FM3407', 'FM3408', 'FM3409', 'FM3410', 'FM3411', 'FM3413', 'FM3415', 'FM3416', 'FM3417', 'FM3418', 'FM3421', 'FM3442', 'FM3459', 'FM846', 'FM957', 'HL_VL_AMP08', 'IA_VL_AMP03', 'JE_VL_AMP05', 'JH', 'JH_VL_AMP04', 'KO_AMP09', 'MR_VL_AMP07', 'V-P_VL_AMP10', 'VA_VL_AMP06', 'VM_TA_AMP02']\n", "Oxford4_ribo_P: ['3804', '4445', '6172', 'FM1834', 'FM2511', 'FM2616', 'FM3308', 'FM3206', 'FM3261', 'FM3262', 'FM3281', 'FM3288']\n", "Oxford7_poly_P: 
['4974', '5930', '8061', 'FM2612', 'FM2621', 'FM2627', 'FM3198', 'FM3443', 'FM3479', 'FM3491', 'FM356', 'FM4220']\n", "Oxford8_poly_P: ['AS_TA_AMP01', 'FM1760', 'FM2251', 'FM2425', 'FM2481', 'FM2485', 'FM2763', 'FM2764', 'FM2827', 'FM3485', 'FM3597', 'FM3688', 'FM3689', 'FM3690', 'FM3692', 'HL_VL_AMP08', 'IA_VL_AMP03', 'JE_VL_AMP05', 'JH_VL_AMP04', 'KO_AMP09', 'MR_VL_AMP07', 'V-P_VL_AMP10', 'VA_VL_AMP06', 'VM_TA_AMP02']\n", "Oxford9_poly_P: ['FM1338', 'FM2086', 'FM2440', 'FM2532', 'FM2854', 'FM2861', 'FM3238', 'FM3257', 'FM3659', 'FM3690_1', 'FM3690_2', 'FM3764', 'FM3796', 'FM3799', 'FM3800', 'FM3814', 'FM3835', 'HL_VL_AMP08', 'JE_VL_AMP05', 'KO_AMP09']\n", "['AS_TA_AMP01', 'HL_VL_AMP08', 'IA_VL_AMP03', 'JE_VL_AMP05', 'JH_VL_AMP04', 'KO_AMP09', 'MR_VL_AMP07', 'V-P_VL_AMP10', 'VA_VL_AMP06', 'VM_TA_AMP02', 'HL_VL_AMP08', 'JE_VL_AMP05', 'KO_AMP09']\n" ] } ], "source": [
 "def _repeated_names(names):\n",
 "    '''Return every occurrence of a name after its first one, in input order.'''\n",
 "    seen = set()\n",
 "    repeats = []\n",
 "    for name in names:\n",
 "        if name in seen:\n",
 "            repeats.append(name)\n",
 "        else:\n",
 "            seen.add(name)\n",
 "    return repeats\n",
 "\n",
 "\n",
 "# Sample names (the columns after the six annotation columns) of every table.\n",
 "Oxford0_poly_P = get_patient_code(Oxford0_poly)\n",
 "Oxford13_poly_P = get_patient_code(Oxford13_poly)\n",
 "Oxford4_ribo_P = get_patient_code(Oxford4_ribo)\n",
 "Oxford7_poly_P = get_patient_code(Oxford7_poly)\n",
 "Oxford8_poly_P = get_patient_code(Oxford8_poly)\n",
 "Oxford9_poly_P = get_patient_code(Oxford9_poly)\n",
 "\n",
 "for label, names in [\n",
 "    ('Oxford0_poly_P', Oxford0_poly_P),\n",
 "    ('Oxford13_poly_P', Oxford13_poly_P),\n",
 "    ('Oxford4_ribo_P', Oxford4_ribo_P),\n",
 "    ('Oxford7_poly_P', Oxford7_poly_P),\n",
 "    ('Oxford8_poly_P', Oxford8_poly_P),\n",
 "    ('Oxford9_poly_P', Oxford9_poly_P),\n",
 "]:\n",
 "    print(f'{label}: {names}')\n",
 "\n",
 "# Oxford4_ribo_P is printed above but is not part of All_names.\n",
 "# NOTE(review): presumably because it is the only non-polyA table - confirm.\n",
 "All_names = Oxford0_poly_P + Oxford13_poly_P + Oxford7_poly_P + Oxford8_poly_P + Oxford9_poly_P\n",
 "\n",
 "# Single pass with a set; the earlier list.index() lookup per element was quadratic.\n",
 "Duplicate = _repeated_names(All_names)\n",
 "print(Duplicate)"
 ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['AS_TA_AMP01',\n", " 'HL_VL_AMP08',\n", " 'IA_VL_AMP03',\n", " 'JE_VL_AMP05',\n", "
'JH_VL_AMP04',\n", " 'KO_AMP09',\n", " 'MR_VL_AMP07',\n", " 'V-P_VL_AMP10',\n", " 'VA_VL_AMP06',\n", " 'VM_TA_AMP02',\n", " 'HL_VL_AMP08',\n", " 'JE_VL_AMP05',\n", " 'KO_AMP09']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Duplicate" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Add batch number to sample name" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['9_FM1338',\n", " '9_FM2086',\n", " '9_FM2440',\n", " '9_FM2532',\n", " '9_FM2854',\n", " '9_FM2861',\n", " '9_FM3238',\n", " '9_FM3257',\n", " '9_FM3659',\n", " '9_FM3690_1',\n", " '9_FM3690_2',\n", " '9_FM3764',\n", " '9_FM3796',\n", " '9_FM3799',\n", " '9_FM3800',\n", " '9_FM3814',\n", " '9_FM3835',\n", " '9_HL_VL_AMP08',\n", " '9_JE_VL_AMP05',\n", " '9_KO_AMP09']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "def addstrto_code(RNA_set, str):\n",
 "    '''Prefix every sample column name with the given string.\n",
 "\n",
 "    Columns after the first six are renamed to str + name; the first six are\n",
 "    named Geneid, Chr, Start, End, Strand, Length. A name that already starts\n",
 "    with the prefix is left alone, so re-running this cell does not stack\n",
 "    prefixes (for example '9_9_FM1338').\n",
 "\n",
 "    RNA_set is modified in place and is also returned.\n",
 "    '''\n",
 "    # The parameter name is kept for existing callers; it hides the builtin str here.\n",
 "    prefix = str\n",
 "    sample_names = [\n",
 "        name if name.startswith(prefix) else prefix + name\n",
 "        for name in RNA_set.columns.tolist()[6:]\n",
 "    ]\n",
 "    RNA_set.columns = ['Geneid', 'Chr', 'Start', 'End', 'Strand', 'Length'] + sample_names\n",
 "    return RNA_set\n",
 "\n",
 "\n",
 "# The same sample name occurs in several tables (see Duplicate), so each name gets its table's prefix.\n",
 "Oxford0_poly = addstrto_code(Oxford0_poly, '0_')\n",
 "Oxford13_poly = addstrto_code(Oxford13_poly, '13_')\n",
 "Oxford7_poly = addstrto_code(Oxford7_poly, '7_')\n",
 "Oxford8_poly = addstrto_code(Oxford8_poly, '8_')\n",
 "Oxford9_poly = addstrto_code(Oxford9_poly, '9_')\n",
 "\n",
 "Oxford9_poly.columns.tolist()[6:]"
 ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Merge" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Geneid',\n", " 'Chr',\n", " 'Start',\n", " 'End',\n", " 'Strand',\n", " 'Length',\n", " '0_FM1812',\n", " '0_FM22',\n", "
'0_FM2670',\n", " '0_FM3310',\n", " '0_FM3311',\n", " '0_FM3591',\n", " '0_FM3894',\n", " '0_FM3901',\n", " '0_FM4021',\n", " '0_FM4024',\n", " '0_FM4025',\n", " '0_FM4028',\n", " '0_FM4087',\n", " '0_FM466',\n", " '0_FM806',\n", " '0_FM951',\n", " '0_FM963',\n", " '0_NEM_3613mbl',\n", " '0_NEM_3613tubes',\n", " '0_NEM_4743fbl',\n", " '0_NEM_4743mbl',\n", " '0_NEM_4743tubes',\n", " '0_NEM_WTmbl',\n", " '0_NEM_WTtubes',\n", " '13_3804',\n", " '13_4445',\n", " '13_6172',\n", " '13_AS_TA_AMP01',\n", " '13_FM1834',\n", " '13_FM2259',\n", " '13_FM2422',\n", " '13_FM2428',\n", " '13_FM2432',\n", " '13_FM2433',\n", " '13_FM2438',\n", " '13_FM2511',\n", " '13_FM2606',\n", " '13_FM2616',\n", " '13_FM2628',\n", " '13_FM2633',\n", " '13_FM2746',\n", " '13_FM2864',\n", " '13_FM2988',\n", " '13_FM3017',\n", " '13_FM3022',\n", " '13_FM3060',\n", " '13_FM3142',\n", " '13_FM3206',\n", " '13_FM3261',\n", " '13_FM3262',\n", " '13_FM3281',\n", " '13_FM3288',\n", " '13_FM3406',\n", " '13_FM3407',\n", " '13_FM3408',\n", " '13_FM3409',\n", " '13_FM3410',\n", " '13_FM3411',\n", " '13_FM3413',\n", " '13_FM3415',\n", " '13_FM3416',\n", " '13_FM3417',\n", " '13_FM3418',\n", " '13_FM3421',\n", " '13_FM3442',\n", " '13_FM3459',\n", " '13_FM846',\n", " '13_FM957',\n", " '13_HL_VL_AMP08',\n", " '13_IA_VL_AMP03',\n", " '13_JE_VL_AMP05',\n", " '13_JH',\n", " '13_JH_VL_AMP04',\n", " '13_KO_AMP09',\n", " '13_MR_VL_AMP07',\n", " '13_V-P_VL_AMP10',\n", " '13_VA_VL_AMP06',\n", " '13_VM_TA_AMP02',\n", " '7_4974',\n", " '7_5930',\n", " '7_8061',\n", " '7_FM2612',\n", " '7_FM2621',\n", " '7_FM2627',\n", " '7_FM3198',\n", " '7_FM3443',\n", " '7_FM3479',\n", " '7_FM3491',\n", " '7_FM356',\n", " '7_FM4220',\n", " '8_AS_TA_AMP01',\n", " '8_FM1760',\n", " '8_FM2251',\n", " '8_FM2425',\n", " '8_FM2481',\n", " '8_FM2485',\n", " '8_FM2763',\n", " '8_FM2764',\n", " '8_FM2827',\n", " '8_FM3485',\n", " '8_FM3597',\n", " '8_FM3688',\n", " '8_FM3689',\n", " '8_FM3690',\n", " '8_FM3692',\n", " '8_HL_VL_AMP08',\n", " 
'8_IA_VL_AMP03',\n", " '8_JE_VL_AMP05',\n", " '8_JH_VL_AMP04',\n", " '8_KO_AMP09',\n", " '8_MR_VL_AMP07',\n", " '8_V-P_VL_AMP10',\n", " '8_VA_VL_AMP06',\n", " '8_VM_TA_AMP02',\n", " '9_FM1338',\n", " '9_FM2086',\n", " '9_FM2440',\n", " '9_FM2532',\n", " '9_FM2854',\n", " '9_FM2861',\n", " '9_FM3238',\n", " '9_FM3257',\n", " '9_FM3659',\n", " '9_FM3690_1',\n", " '9_FM3690_2',\n", " '9_FM3764',\n", " '9_FM3796',\n", " '9_FM3799',\n", " '9_FM3800',\n", " '9_FM3814',\n", " '9_FM3835',\n", " '9_HL_VL_AMP08',\n", " '9_JE_VL_AMP05',\n", " '9_KO_AMP09']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NOTE(review): merge_L is first assigned in the cell below this one (execution count 8 here, 7 there);\n", "# no earlier cell defines it, so on a fresh kernel this cell has to run after that one. Consider moving it.\n", "merge_L.columns.tolist()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GeneidChrStartEndStrandLength0_FM18120_FM220_FM26700_FM3310...9_FM3690_29_FM37649_FM37969_FM37999_FM38009_FM38149_FM38359_HL_VL_AMP089_JE_VL_AMP059_KO_AMP09
0ENSG00000223972.5chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr111869;12010;12179;12613;12613;12975;13221;1322...12227;12057;12227;12721;12697;13052;14409;1337...+;+;+;+;+;+;+;+;+173511227...0.01.08.01.00.01.00.00.00.00.0
1ENSG00000227232.5chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;c...14404;15005;15796;16607;16858;17233;17606;1791...14501;15038;15947;16765;17055;17368;17742;1806...-;-;-;-;-;-;-;-;-;-;-135112222913974...207.0281.0454.0263.0449.0207.0695.0572.0881.0708.0
2ENSG00000278267.1chr11736917436-68222898...19.015.055.06.052.025.053.040.056.034.0
3ENSG00000243485.5chr1;chr1;chr1;chr1;chr129554;30267;30564;30976;3097630039;30667;30667;31109;31097+;+;+;+;+102161062...0.00.01.03.01.00.01.00.00.00.0
4ENSG00000284332.1chr13036630503+1380000...0.00.00.00.00.00.00.00.00.00.0
..................................................................
61582ENSG00000276017.1KI270734.17241174814+24040000...0.00.00.00.00.00.00.00.00.00.0
61583ENSG00000278817.1KI270734.1;KI270734.1;KI270734.1;KI270734.1;KI...131494;131836;135443;136159;136845131755;131996;135543;136299;137392+;+;+;+;+1213264186660472...747.0876.0592.0479.0250.0405.0892.0617.0927.0687.0
61584ENSG00000277196.4KI270734.1;KI270734.1;KI270734.1;KI270734.1;KI...138082;138082;138743;138743;142194;142194;1436...138667;138667;138831;138831;142292;142292;1437...-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;...240510659107261...147.0240.0415.048.0405.0150.0305.01023.0429.02899.0
61585ENSG00000278625.1KI270744.15100951114-1060000...0.00.00.00.00.00.00.00.00.00.0
61586ENSG00000277374.1KI270750.1148668148843+1760000...0.00.00.00.00.00.00.00.00.00.0
\n", "

61587 rows × 140 columns

\n", "
" ], "text/plain": [ " Geneid Chr \\\n", "0 ENSG00000223972.5 chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1 \n", "1 ENSG00000227232.5 chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;c... \n", "2 ENSG00000278267.1 chr1 \n", "3 ENSG00000243485.5 chr1;chr1;chr1;chr1;chr1 \n", "4 ENSG00000284332.1 chr1 \n", "... ... ... \n", "61582 ENSG00000276017.1 KI270734.1 \n", "61583 ENSG00000278817.1 KI270734.1;KI270734.1;KI270734.1;KI270734.1;KI... \n", "61584 ENSG00000277196.4 KI270734.1;KI270734.1;KI270734.1;KI270734.1;KI... \n", "61585 ENSG00000278625.1 KI270744.1 \n", "61586 ENSG00000277374.1 KI270750.1 \n", "\n", " Start \\\n", "0 11869;12010;12179;12613;12613;12975;13221;1322... \n", "1 14404;15005;15796;16607;16858;17233;17606;1791... \n", "2 17369 \n", "3 29554;30267;30564;30976;30976 \n", "4 30366 \n", "... ... \n", "61582 72411 \n", "61583 131494;131836;135443;136159;136845 \n", "61584 138082;138082;138743;138743;142194;142194;1436... \n", "61585 51009 \n", "61586 148668 \n", "\n", " End \\\n", "0 12227;12057;12227;12721;12697;13052;14409;1337... \n", "1 14501;15038;15947;16765;17055;17368;17742;1806... \n", "2 17436 \n", "3 30039;30667;30667;31109;31097 \n", "4 30503 \n", "... ... \n", "61582 74814 \n", "61583 131755;131996;135543;136299;137392 \n", "61584 138667;138667;138831;138831;142292;142292;1437... \n", "61585 51114 \n", "61586 148843 \n", "\n", " Strand Length 0_FM1812 \\\n", "0 +;+;+;+;+;+;+;+;+ 1735 1 \n", "1 -;-;-;-;-;-;-;-;-;-;- 1351 122 \n", "2 - 68 22 \n", "3 +;+;+;+;+ 1021 6 \n", "4 + 138 0 \n", "... ... ... ... \n", "61582 + 2404 0 \n", "61583 +;+;+;+;+ 1213 264 \n", "61584 -;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;... 2405 106 \n", "61585 - 106 0 \n", "61586 + 176 0 \n", "\n", " 0_FM22 0_FM2670 0_FM3310 ... 9_FM3690_2 9_FM3764 9_FM3796 \\\n", "0 12 2 7 ... 0.0 1.0 8.0 \n", "1 229 139 74 ... 207.0 281.0 454.0 \n", "2 28 9 8 ... 19.0 15.0 55.0 \n", "3 10 6 2 ... 0.0 0.0 1.0 \n", "4 0 0 0 ... 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... 
\n", "61582 0 0 0 ... 0.0 0.0 0.0 \n", "61583 186 660 472 ... 747.0 876.0 592.0 \n", "61584 59 107 261 ... 147.0 240.0 415.0 \n", "61585 0 0 0 ... 0.0 0.0 0.0 \n", "61586 0 0 0 ... 0.0 0.0 0.0 \n", "\n", " 9_FM3799 9_FM3800 9_FM3814 9_FM3835 9_HL_VL_AMP08 9_JE_VL_AMP05 \\\n", "0 1.0 0.0 1.0 0.0 0.0 0.0 \n", "1 263.0 449.0 207.0 695.0 572.0 881.0 \n", "2 6.0 52.0 25.0 53.0 40.0 56.0 \n", "3 3.0 1.0 0.0 1.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... \n", "61582 0.0 0.0 0.0 0.0 0.0 0.0 \n", "61583 479.0 250.0 405.0 892.0 617.0 927.0 \n", "61584 48.0 405.0 150.0 305.0 1023.0 429.0 \n", "61585 0.0 0.0 0.0 0.0 0.0 0.0 \n", "61586 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", " 9_KO_AMP09 \n", "0 0.0 \n", "1 708.0 \n", "2 34.0 \n", "3 0.0 \n", "4 0.0 \n", "... ... \n", "61582 0.0 \n", "61583 687.0 \n", "61584 2899.0 \n", "61585 0.0 \n", "61586 0.0 \n", "\n", "[61587 rows x 140 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Left-join the sample columns of each later table onto Oxford0_poly, matching rows on Geneid.\n",
 "merge_L = Oxford0_poly\n",
 "for batch in (Oxford13_poly, Oxford7_poly, Oxford8_poly, Oxford9_poly):\n",
 "    # Chr..Length are dropped from the right-hand table; only the left table's copy is kept.\n",
 "    counts_only = batch.drop(columns=batch.loc[:, 'Chr':'Length'].columns)\n",
 "    merge_L = merge_L.merge(counts_only, how='left', on='Geneid')\n",
 "merge_L"
 ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
 "# Write Geneid plus all sample columns; the Chr..Length columns are left out of the CSV.\n",
 "annotation_cols = Oxford9_poly.loc[:, 'Chr':'Length'].columns\n",
 "merge_L.drop(columns=annotation_cols).to_csv('Helsinki data/merge_L_Helsinki.csv', index=False)"
 ] } ], "metadata": { "kernelspec": { "display_name": "scanpy", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype":
"text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "e9f8ff62c5711ee4bd4c1f699a5549e03309d5474ddcc1a5f24509da60d4b59d" } } }, "nbformat": 4, "nbformat_minor": 2 }