{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import argparse\n",
    "import pandas as pd\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def gene_name_from_attributes(attributes):\n",
    "    \"\"\"Return the second '='-separated piece of the 5th ';'-separated field.\n",
    "\n",
    "    Raises IndexError when the attribute string has fewer than five ';' fields\n",
    "    or the fifth field contains no '='.\n",
    "    \"\"\"\n",
    "    return attributes.split(';')[4].split('=')[1]\n",
    "\n",
    "\n",
    "# Annotation table reduced to column 6 (compared against '+'/'-' below) and the gene name.\n",
    "# NOTE(review): the gene name is picked by position inside attribute column 8;\n",
    "# confirm that every row of U00096.3.gtf uses the same attribute layout.\n",
    "genes_df = pd.read_table('U00096.3.gtf', header=None)\n",
    "genes_df['Gene name'] = genes_df[8].apply(gene_name_from_attributes)\n",
    "genes_df = genes_df.loc[:, [6, 'Gene name']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample = 'bm03'\n",
    "COUNTS_DIR = '2018-05-22_RNAseq'\n",
    "# Only the first N_COUNT_ROWS rows of each count file are used.\n",
    "# NOTE(review): presumably this drops trailing non-gene summary rows -- confirm.\n",
    "N_COUNT_ROWS = 4419\n",
    "\n",
    "\n",
    "def load_strand_counts(path, count_column):\n",
    "    \"\"\"Read a headerless two-column (gene name, count) table and attach column 6 of genes_df.\n",
    "\n",
    "    Only the first N_COUNT_ROWS rows of the file are kept. The merge with\n",
    "    genes_df is pandas' default inner join on 'Gene name', so genes absent\n",
    "    from genes_df are dropped.\n",
    "    \"\"\"\n",
    "    counts = pd.read_table(path, header=None).iloc[0:N_COUNT_ROWS, :]\n",
    "    counts.columns = ['Gene name', count_column]\n",
    "    return counts.merge(genes_df, on='Gene name')\n",
    "\n",
    "\n",
    "plus_df = load_strand_counts('{}/{}_fwd_end50genes.txt'.format(COUNTS_DIR, sample), 'plus_count')\n",
    "minus_df = load_strand_counts('{}/{}_rev_end50genes.txt'.format(COUNTS_DIR, sample), 'minus_count')\n",
    "\n",
    "# Joins on the columns the two frames share: 'Gene name' and column 6.\n",
    "merge_df = pd.merge(plus_df, minus_df)\n",
    "\n",
    "ts_col = sample + '_TS'\n",
    "nts_col = sample + '_NTS'\n",
    "# Rows with '+' in column 6: fwd counts are labelled TS and rev counts NTS.\n",
    "# Rows with '-' in column 6: the two labels are swapped.\n",
    "plusgenes_df = merge_df[merge_df[6] == '+'].rename(\n",
    "    columns={'plus_count': ts_col, 'minus_count': nts_col})\n",
    "minusgenes_df = merge_df[merge_df[6] == '-'].rename(\n",
    "    columns={'plus_count': nts_col, 'minus_count': ts_col})\n",
    "\n",
    "# Both frames hold the same columns in a different order. sort=False aligns\n",
    "# them by name, keeps the column order of plusgenes_df, and avoids the pandas\n",
    "# FutureWarning about sorting the non-concatenation axis.\n",
    "# sort_index() then restores the row order of merge_df.\n",
    "joined_df = pd.concat([plusgenes_df, minusgenes_df], sort=False).sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep genes whose NTS count is more than MIN_NTS_TS_RATIO times the TS count.\n",
    "# A TS count of 0 with a positive NTS count gives an infinite ratio (row kept);\n",
    "# 0/0 gives NaN, which compares False (row dropped).\n",
    "MIN_NTS_TS_RATIO = 2\n",
    "filtered = joined_df[joined_df[sample + '_NTS'] / joined_df[sample + '_TS'] > MIN_NTS_TS_RATIO]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Additionally require more than MIN_NTS_COUNT reads in the NTS column.\n",
    "MIN_NTS_COUNT = 20\n",
    "filtered = filtered[filtered[sample + '_NTS'] > MIN_NTS_COUNT]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#filtered.to_csv('test_high_NTS')"
   ]
  },
  { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Gene name | \n", "bm03_TS | \n", "6 | \n", "bm03_NTS | \n", "
---|---|---|---|---|
3 | \n", "aaeX | \n", "17 | \n", "- | \n", "136 | \n", "
12 | \n", "abrB | \n", "19 | \n", "- | \n", "43 | \n", "
31 | \n", "acrD | \n", "23 | \n", "+ | \n", "3420 | \n", "
50 | \n", "aes | \n", "30 | \n", "- | \n", "61 | \n", "
57 | \n", "agaS | \n", "10 | \n", "+ | \n", "24 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
4030 | \n", "ytfE | \n", "46 | \n", "- | \n", "169 | \n", "
4041 | \n", "ytiA | \n", "6 | \n", "- | \n", "53 | \n", "
4044 | \n", "yzcX | \n", "37 | \n", "+ | \n", "93 | \n", "
4045 | \n", "yzfA | \n", "0 | \n", "- | \n", "106 | \n", "
4046 | \n", "yzgL | \n", "74 | \n", "- | \n", "1782 | \n", "
183 rows × 4 columns
\n", "\n", " | Gene name | \n", "bm03_TS | \n", "6 | \n", "bm03_NTS | \n", "
---|---|---|---|---|
2 | \n", "aaeR | \n", "28 | \n", "+ | \n", "43 | \n", "
3 | \n", "aaeX | \n", "17 | \n", "- | \n", "136 | \n", "
7 | \n", "abgB | \n", "3 | \n", "- | \n", "17 | \n", "
8 | \n", "abgR | \n", "22 | \n", "+ | \n", "39 | \n", "
12 | \n", "abrB | \n", "19 | \n", "- | \n", "43 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
4041 | \n", "ytiA | \n", "6 | \n", "- | \n", "53 | \n", "
4044 | \n", "yzcX | \n", "37 | \n", "+ | \n", "93 | \n", "
4045 | \n", "yzfA | \n", "0 | \n", "- | \n", "106 | \n", "
4046 | \n", "yzgL | \n", "74 | \n", "- | \n", "1782 | \n", "
4061 | \n", "zraP | \n", "16 | \n", "- | \n", "32 | \n", "
492 rows × 4 columns
\n", "